# IT1244 Project

## Import Libraries

In [3]:
import re as re
import heapq as heapq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import random as random

### Import Bert Encoders

In [4]:
import torch as torch
from transformers import BertModel
from transformers import BertTokenizer, BertTokenizerFast

### Testing Bert Encoding

In [5]:

# Load tokenizer and model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode sentences
sentence1 = "This is an example sentence with change negative."
inputs1 = tokenizer(sentence1, return_tensors="pt")

sentence2 = "This is an example sentence with change upset."
inputs2 = tokenizer(sentence2, return_tensors="pt")

sentence3 = "This is an example sentence with change happy."
inputs3 = tokenizer(sentence3, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)
    outputs3 = model(**inputs3)

# Extract [CLS] token embedding (sentence-level representation)
Vector_X = outputs1.last_hidden_state[:, 0, :]
Vector_Y = outputs2.last_hidden_state[:, 0, :]
Vector_Z = outputs3.last_hidden_state[:, 0, :]

In [6]:
float(cosine_similarity(Vector_X, Vector_Y)[0][0])

0.9878990054130554

In [7]:
float(cosine_similarity(Vector_X, Vector_Z)[0][0])

0.982489287853241

In [8]:
def bert_encode(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state[:, 0, :]


We can see that the closer concepts are have higher cosine similiarity

## Data Importing

In [9]:
tweets = pd.read_csv("../Data/Raw/Tweets.csv")
tweets.head()

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0,@VirginAmerica What @dhepburn said.
1,positive,0.3486,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,@VirginAmerica and it's a really big bad thing...


### Code starts here

In [10]:
tweets.shape


(14639, 3)

In [11]:
tweets["text"][1]

"@VirginAmerica plus you've added commercials to the experience... tacky."

In [12]:
def at_filter(text):
    return re.sub(r"@\w+", "", text)

In [13]:
def alpha_filter(text):
    return re.sub(r'[^A-Za-z ]', '', text)

## Data Cleaning

In [14]:
tweets["text"] = tweets["text"].apply(lambda x: at_filter(x))


In [15]:
tweets["text"] = tweets["text"].apply(lambda x: alpha_filter(x))

In [16]:
tweets

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0000,What said
1,positive,0.3486,plus youve added commercials to the experienc...
2,neutral,0.6837,I didnt today Must mean I need to take anothe...
3,negative,1.0000,its really aggressive to blast obnoxious ente...
4,negative,1.0000,and its a really big bad thing about it
...,...,...,...
14634,positive,0.3487,thank you we got on a different flight to Chi...
14635,negative,1.0000,leaving over minutes Late Flight No warnings...
14636,neutral,1.0000,Please bring American Airlines to BlackBerry
14637,negative,1.0000,you have my money you change my flight and do...


In [17]:
X_train, X_test, y_train, y_test = train_test_split(tweets["text"], tweets["airline_sentiment"], test_size=0.2, random_state=42)

In [18]:
tweets.to_csv("../Data/Cleaned/CleanTweets.csv")

In [19]:
max(list(map(lambda x: len(x), tweets["text"])))

166

## Data transformation

In [21]:
sentence_vectors = tweets["text"].apply(lambda x: bert_encode(x))

In [82]:
data = np.array(list(map(lambda x: x[0], sentence_vectors)))

#### Store Data as a CSV so don't need to re-encode every time

In [84]:
import pyarrow as pa
import pyarrow.parquet as pq

In [85]:
data_frame = pd.DataFrame(data)
#data_frame.to_txt("../Data/Cleaned/BERT_Vectors.csv")

In [87]:
table = pa.Table.from_pandas(data_frame)

In [95]:
pq.write_table(table, "../Data/Cleaned/BERT_Vectors.oarquet")

In [96]:
Bert_Encoded_Text = pd.read_parquet("../Data/Cleaned/BERT_Vectors.oarquet")


In [100]:
data = Bert_Encoded_Text.to_numpy()

In [102]:
Bert_Encoded_Text.to_numpy()

array([[-0.06095307,  0.08171247, -0.3694084 , ..., -0.12956254,
         0.2612489 ,  0.22729596],
       [ 0.05729439,  0.32379445, -0.21796885, ..., -0.41243485,
         0.6598717 , -0.029063  ],
       [ 0.07487375,  0.62892705, -0.1277031 , ..., -0.20496681,
         0.44621164,  0.35458577],
       ...,
       [ 0.00215415,  0.03963464, -0.02555776, ..., -0.23670602,
        -0.03042183,  0.27644476],
       [-0.08850451,  0.41939262, -0.14636844, ..., -0.63478065,
         0.12714297,  0.39889377],
       [-0.00421663, -0.09909241,  0.34751567, ..., -0.12137473,
         0.448606  ,  0.10963394]], shape=(14639, 768), dtype=float32)

In [101]:
data.shape

(14639, 768)

### K-means using personal implementation

In [25]:
def cos_distance(x1, x2):
    '''
    x1: numpy array, shape = [D]
    x2: numpy array, shape = [D]
    RETURN
        dist: float value
    '''
    #dist = np.sqrt(np.dot(x1-x2,x1-x2))
    x1_length = np.dot(x1, x1)
    x2_length = np.dot(x2, x2)
    cosine = np.dot(x1, x2)/(x1_length*x2_length)
    dist = np.arccos(cosine)/np.arccos(-1)

    ## end
    return dist

In [27]:
def closestCentroid(input_vector, centroid_dict):
    '''
    coordinates_x: numpy array, shape = [D]
    coordinates_centroid: dictionary, key = int, value = numpy array of shape [D]
    RETURN
        closest_centroid: int value
    '''
    closest_centroid = None

    ## start your code here
    smallest_dist = 10**10

    for key in centroid_dict:
        curr_centroid_vector = centroid_dict[key]
        curr_dist = cos_distance(curr_centroid_vector, input_vector)
        
        if (curr_dist < smallest_dist):
            closest_centroid = key
            smallest_dist = curr_dist
    
    ## end
    return closest_centroid

When using arccos(cosine_similarity)/π as the distance metric, the standard arithmetic mean used in k-means won’t work correctly because it doesn’t preserve unit directionality. 

Instead, we need to compute the centroid in a way that maintains the directional nature of the embeddings. 

The best approach is to use the normalized mean vector.

In [28]:
def compute_new_centroid(cluster_vectors):
    if len(cluster_vectors) == 0:
        return None  # Handle empty clusters
    
    cluster_vectors = [vec / np.linalg.norm(vec) for vec in cluster_vectors] 
    
    mean_vector = np.mean(cluster_vectors, axis=0).reshape(-1)  # Step 1: Compute the mean vector
    norm = np.linalg.norm(mean_vector)  # Step 2: Compute its norm
    
    if norm == 0:
        return np.zeros_like(mean_vector)  # Edge case: if the norm is 0, return a zero vector
    
    return mean_vector / norm  # Step 3: Normalize to get unit vector

In [29]:
def KMeansClustering(X, index_centroids, k, n):
    '''
    X: numpy array, shape = [N, D]
    index_centroids: list, shape = k
    k: int value
    n: int value
    RETURN
        repartition: dictionary, key = int, value = numpy array of shape [number of points in cluster, D]
        coordinates: dictionary, key = int, value = numpy array of shape [D]
    '''
    repartition, centroids,  = None, dict()
    ## start your code here
    # Initialise your first centroids
    for i in range(k):
        centroids[i] = X[index_centroids[i]]
        
    # Define stopping criterion
    for i in range(n):
        # Initialise new dictionaries for repartition and coordinates
        repartition = dict()
        for i in range(k):
            repartition[i] = []
            
        # Assign all the points to the closest cluster centroid
        for vector in X:
            repartition[closestCentroid(vector, centroids)].append(vector)

        # Recompute the new centroids of the newly formed clusters
        for cluster_key in repartition:
            curr_cluster = repartition[cluster_key]
            new_centroid = compute_new_centroid(curr_cluster)
            centroids[cluster_key] = new_centroid
        
    
    ## end
    return repartition, centroids

### K-Means using sklearn

In [30]:
def sklearnKmeans(X, k, m):
    '''
    X: numpy array, shape = [N, D]
    k: int value
    m: int value
    RETURN
        position: numpy array, shape = [N]
        centers: numpy array, shape = [k, D]
    '''
    position, centers = None, None
    ## start your code here
    kmeans = KMeans(n_clusters=k,n_init=1, max_iter=m).fit(X)
    position, centers = kmeans.predict(X), kmeans.cluster_centers_
    ## end
    return position, centers

## Modelling the Data

In [None]:
def euclideanDist(x1, x2):
    '''
    x1: numpy array, shape = [D]
    x2: numpy array, shape = [D]
    RETURN
        dist: float value
    '''
    dist = np.sqrt(np.dot(x1-x2,x1-x2))
    ## start your code here
    
    
    ## end
    return dist

### Initial attempt using SK-Learn Library

In [None]:
position, centers = sklearnKmeans(data, 3, 10000)

In [None]:
clusters = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters[position[index]].append(data[index])

In [None]:
def closest_to_Centroid(clusters, centroids):

    cluster_representatives = dict()
    for cluster in clusters.keys():
        cluster_representatives[cluster] = []

    for cluster in clusters.keys():
        curr_centroid = centroids[cluster]
        smallest_dist = 10**10
        closest_datapoint = clusters[cluster][0]

        for vector in clusters[cluster]:
            curr_dist = euclideanDist(vector, curr_centroid)
            if (curr_dist < smallest_dist):
                closest_datapoint = vector
                smallest_dist = curr_dist
        
        cluster_representatives[cluster] = closest_datapoint
        
    ## end

    return cluster_representatives

In [None]:
cluster_representatives = closest_to_Centroid(clusters, centers)
cluster_rep_indexes = [np.where(data == rep)[0][0] for rep in cluster_representatives.values()]

In [None]:
# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels = dict()

for cluster in possible_clusters:
    labels[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes])[cluster] 

In [None]:
labels

{0: 'negative', 1: 'positive', 2: 'neutral'}

In [None]:
predicted_sentiments = list(map(lambda x: labels[x], position))

In [None]:
accuracy = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments)
print(f"accuracy is {np.round(accuracy*100, 2)}%")

accuracy is 39.9%


Accuracy achieved in first attempt is ≈ 40%

### Second Attempt using own K-Means Clustering algorithm

In [31]:
index_centroids = [np.array([random.randint(0, len(data) - 1)]) for i in range(3)]

In [32]:
clusters2, centers = KMeansClustering(data, index_centroids, 3, 10000)

ValueError: shapes (1,768) and (1,768) not aligned: 768 (dim 1) != 1 (dim 0)