# IT1244 Project

## Import Libraries

In [1]:
import re as re
import heapq as heapq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import random as random
from sklearn.preprocessing import StandardScaler
import pyarrow as pa
import pyarrow.parquet as pq
from sklearn.decomposition import PCA

### Import Bert Encoders

In [2]:
import torch as torch
from transformers import BertModel
from transformers import BertTokenizer, BertTokenizerFast

### Testing Bert Encoding

In [3]:

# Load tokenizer and model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode sentences
sentence1 = "This is an example sentence with change negative."
inputs1 = tokenizer(sentence1, return_tensors="pt")

sentence2 = "This is an example sentence with change upset."
inputs2 = tokenizer(sentence2, return_tensors="pt")

sentence3 = "This is an example sentence with change happy."
inputs3 = tokenizer(sentence3, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)
    outputs3 = model(**inputs3)

# Extract [CLS] token embedding (sentence-level representation)
Vector_X = outputs1.last_hidden_state[:, 0, :]
Vector_Y = outputs2.last_hidden_state[:, 0, :]
Vector_Z = outputs3.last_hidden_state[:, 0, :]

In [4]:
print(float(cosine_similarity(Vector_X, Vector_Y)[0][0]))
print(float(cosine_similarity(Vector_X, Vector_Z)[0][0]))

0.9878990054130554
0.982489287853241


In [5]:
def bert_encode(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state[:, 0, :]


We can see that the closer concepts are have higher cosine similiarity

## Data Importing

In [6]:
tweets = pd.read_csv("../Data/Raw/Tweets.csv")
tweets.head()

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0,@VirginAmerica What @dhepburn said.
1,positive,0.3486,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,@VirginAmerica and it's a really big bad thing...


In [7]:
tweets.shape

(14639, 3)

In [8]:
tweets["text"][1]

"@VirginAmerica plus you've added commercials to the experience... tacky."

### Code starts here

## Data Cleaning

In [9]:
def at_filter(text):
    return re.sub(r"@\w+", "", text)

def alpha_filter(text):
    return re.sub(r'[^A-Za-z ]', '', text)

In [10]:
tweets["text"] = tweets["text"].apply(lambda x: at_filter(x))
tweets["text"] = tweets["text"].apply(lambda x: alpha_filter(x))

In [11]:
tweets

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0000,What said
1,positive,0.3486,plus youve added commercials to the experienc...
2,neutral,0.6837,I didnt today Must mean I need to take anothe...
3,negative,1.0000,its really aggressive to blast obnoxious ente...
4,negative,1.0000,and its a really big bad thing about it
...,...,...,...
14634,positive,0.3487,thank you we got on a different flight to Chi...
14635,negative,1.0000,leaving over minutes Late Flight No warnings...
14636,neutral,1.0000,Please bring American Airlines to BlackBerry
14637,negative,1.0000,you have my money you change my flight and do...


In [294]:
sum(tweets["airline_sentiment"] == "negative")/len(tweets)

0.6269553931279459

In [12]:
tweets.to_csv("../Data/Cleaned/CleanTweets.csv")

## Data transformation

#### Map Tweets into Vectors in R^n

In [None]:
sentence_vectors = tweets["text"].apply(lambda x: bert_encode(x))
data = np.array(list(map(lambda x: x[0], sentence_vectors)))

#### Store Data as a CSV so don't need to re-encode every time

In [None]:
#data_frame = pd.DataFrame(data)
#data_frame.to_txt("../Data/Cleaned/BERT_Vectors.csv")

In [None]:
#pq.write_table(table, "../Data/Cleaned/BERT_Vectors.oarquet")

#### BERT Encodings can be found in data below

In [9]:
Bert_Encoded_Text = pd.read_parquet("../Data/Cleaned/BERT_Vectors.oarquet")
data = Bert_Encoded_Text.to_numpy()

In [10]:
Bert_Encoded_Text_raw = pd.read_parquet("../Data/Cleaned/TWITTERBERT_Vectors1.parquet")
data_unclean = Bert_Encoded_Text_raw.to_numpy()

In [12]:
data_unclean.shape

(14639, 768)

### K-means using personal implementation

In [13]:
def cos_distance(x1, x2):
    '''
    x1: numpy array, shape = [D]
    x2: numpy array, shape = [D]
    RETURN
        dist: float value
    '''
    #dist = np.sqrt(np.dot(x1-x2,x1-x2))
    x1_length = np.dot(x1, x1)
    x2_length = np.dot(x2, x2)
    cosine = np.dot(x1, x2)/(x1_length*x2_length)
    dist = np.arccos(cosine)/np.arccos(-1)

    ## end
    return dist

In [32]:
def closestCentroid(input_vector, centroid_dict, distance):
    '''
    coordinates_x: numpy array, shape = [D]
    coordinates_centroid: dictionary, key = int, value = numpy array of shape [D]
    RETURN
        closest_centroid: int value
    '''
    closest_centroid = None

    ## start your code here
    smallest_dist = 10**10

    for key in centroid_dict:
        curr_centroid_vector = centroid_dict[key]
        curr_centroid_vector = curr_centroid_vector.reshape(-1)
        curr_dist = distance(curr_centroid_vector, input_vector)
        
        if (curr_dist < smallest_dist):
            closest_centroid = key
            smallest_dist = curr_dist
    
    ## end
    return closest_centroid

When using arccos(cosine_similarity)/π as the distance metric, the standard arithmetic mean used in k-means won’t work correctly because it doesn’t preserve unit directionality. 

Instead, we need to compute the centroid in a way that maintains the directional nature of the embeddings. 

The best approach is to use the normalized mean vector.

In [15]:
def compute_new_centroid(cluster_vectors):
    if len(cluster_vectors) == 0:
        return None  # Handle empty clusters
    
    cluster_vectors = [vec / np.linalg.norm(vec) for vec in cluster_vectors] 
    
    mean_vector = np.mean(cluster_vectors, axis=0).reshape(-1)  # Step 1: Compute the mean vector
    norm = np.linalg.norm(mean_vector)  # Step 2: Compute its norm
    
    if norm == 0:
        return np.zeros_like(mean_vector)  # Edge case: if the norm is 0, return a zero vector
    
    return mean_vector / norm  # Step 3: Normalize to get unit vector

In [19]:
def KMeansClustering(X, initial_centroids, distance, k, n, tolerance=1e-20):
    '''
    X: numpy array, shape = [N, D]
    index_centroids: list, shape = k
    k: int value
    n: int value
    RETURN
        repartition: dictionary, key = int, value = numpy array of shape [number of points in cluster, D]
        coordinates: dictionary, key = int, value = numpy array of shape [D]
    '''


    repartition, centroids,  = None, dict()
    ## start your code here
    # Initialise your first centroids
    for i in range(k):
        centroids[i] = initial_centroids[i]
        
    # Define stopping criterion
    for i in range(n):
        # Initialise new dictionaries for repartition and coordinates
        repartition = dict()
        for i in range(k):
            repartition[i] = []
            
        # Assign all the points to the closest cluster centroid
        for vector in X:
            repartition[closestCentroid(vector, centroids, distance)].append(vector)

        
        # Compute new centroids
        new_centroids = {}
        for cluster_key in repartition:
            new_centroids[cluster_key] = compute_new_centroid(repartition[cluster_key])
        
        # Check for early stopping (convergence)
        centroid_changes = [np.linalg.norm(new_centroids[cluster_key] - centroids[cluster_key]) for cluster_key in centroids]
        max_change = max(centroid_changes)

        # If the max change in centroids is smaller than the tolerance, stop early
        if max_change < tolerance:
            print(f"Converged early after {i+1} iterations.")
            return repartition, centroids
        
        # Update centroids for the next iteration
        centroids = new_centroids
        
    
    ## end
    return repartition, centroids

### K-Means using sklearn

In [21]:
def sklearnKmeans(X, k, m):
    '''
    X: numpy array, shape = [N, D]
    k: int value
    m: int value
    RETURN
        position: numpy array, shape = [N]
        centers: numpy array, shape = [k, D]
    '''
    position, centers = None, None
    ## start your code here
    kmeans = KMeans(n_clusters=k, init="random", n_init=1, max_iter=m).fit(X)
    position, centers = kmeans.predict(X), kmeans.cluster_centers_
    ## end
    return position, centers

In [22]:
def sklearnKmeans_plus(X, k, m):
    '''
    X: numpy array, shape = [N, D]
    k: int value
    m: int value
    RETURN
        position: numpy array, shape = [N]
        centers: numpy array, shape = [k, D]
    '''
    position, centers = None, None
    ## start your code here
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=1, max_iter=m).fit(X)
    position, centers = kmeans.predict(X), kmeans.cluster_centers_
    ## end
    return position, centers

## Modelling the Data

In [20]:
def euclideanDist(x1, x2):
    '''
    x1: numpy array, shape = [D]
    x2: numpy array, shape = [D]
    RETURN
        dist: float value
    '''
    dist = np.sqrt(np.dot(x1-x2,x1-x2))
    ## start your code here
    
    
    ## end
    return dist

In [30]:
def closest_to_Centroid(clusters, centroids, distance):

    cluster_representatives = dict()
    for cluster in clusters.keys():
        cluster_representatives[cluster] = []

    for cluster in clusters.keys():
        curr_centroid = centroids[cluster]
        smallest_dist = 10**10
        closest_datapoint = clusters[cluster][0]

        for vector in clusters[cluster]:
            curr_dist = distance(vector, curr_centroid)
            if (curr_dist < smallest_dist):
                closest_datapoint = vector
                smallest_dist = curr_dist
        
        cluster_representatives[cluster] = closest_datapoint
        
    ## end

    return cluster_representatives

### Initial attempt using SK-Learn Library

In [24]:
position, centers = sklearnKmeans(data, 3, 10000)

In [25]:
clusters = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters[position[index]].append(data[index])

cluster_representatives = closest_to_Centroid(clusters, centers, euclideanDist)
cluster_rep_indexes = [np.where(data == rep)[0][0] for rep in cluster_representatives.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels = dict()

for cluster in possible_clusters:
    labels[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes])[cluster] 

predicted_sentiments = list(map(lambda x: labels[x], position))

accuracy = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments)
print(f"accuracy is {np.round(accuracy*100, 2)}%")

accuracy is 39.7%


Accuracy achieved in First attempt is ≈ 39.70%

### Second attempt using SK-Learn Library (K-Means++)

In [26]:
position, centers = sklearnKmeans_plus(data, 3, 10000)

In [27]:
clusters = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters[position[index]].append(data[index])

cluster_representatives = closest_to_Centroid(clusters, centers, euclideanDist)
cluster_rep_indexes = [np.where(data == rep)[0][0] for rep in cluster_representatives.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels = dict()

for cluster in possible_clusters:
    labels[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes])[cluster] 

predicted_sentiments = list(map(lambda x: labels[x], position))

accuracy = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments)
print(f"accuracy is {np.round(accuracy*100, 2)}%")

accuracy is 40.08%


Accuracy achieved in Second attempt is ≈ 40.08%

### Third Attempt using own K-Means Clustering algorithm

In [37]:
initial_centroids = [np.array(data[random.randint(0, len(data) - 1)]) for i in range(3)]
clusters2, centers2 = KMeansClustering(data, initial_centroids, cos_distance, 3, 500)

Converged early after 3 iterations.


In [39]:
clusters2 = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters2[position[index]].append(data[index])

cluster_representatives2 = closest_to_Centroid(clusters2, centers2, cos_distance)
cluster_rep_indexes2 = [np.where(data == rep)[0][0] for rep in cluster_representatives2.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels2 = dict()

for cluster in possible_clusters:
    labels2[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes2])[cluster] 

predicted_sentiments2 = list(map(lambda x: labels2[x], position))

In [40]:
accuracy2 = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments2)
print(f"accuracy is {np.round(accuracy2*100, 2)}%")

accuracy is 48.23%


Accuracy achieved in Third attempt is ≈ 56.07%

### Fourth Attempt using own K-Means++ Clustering algorithm

In [41]:
def centroid_initalizer(data, k, distance_func):
    
    n_samples = data.shape[0]
    # Step 1: Choose the first centroid randomly
    first_index = random.randint(0, n_samples - 1)
    centroids = [data[first_index]]
    
    for _ in range(1, k):
        # Compute the minimum distance to any chosen centroid
        distances = np.array([min(distance_func(x, c) for c in centroids) for x in data])
        
        # Normalize distances to form a probability distribution
        probabilities = distances / distances.sum()
        
        # Select next centroid based on weighted probability distribution
        next_index = np.random.choice(n_samples, p=probabilities)
        centroids.append(data[next_index])
    
    return centroids

In [43]:
initial_centroids = centroid_initalizer(data, 3, cos_distance)
clusters2, centers2 = KMeansClustering(data, initial_centroids, cos_distance, 3, 500)

Converged early after 3 iterations.


In [45]:
clusters2 = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters2[position[index]].append(data[index])

cluster_representatives2 = closest_to_Centroid(clusters2, centers2, cos_distance)
cluster_rep_indexes2 = [np.where(data == rep)[0][0] for rep in cluster_representatives2.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels2 = dict()

for cluster in possible_clusters:
    labels2[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes2])[cluster] 

predicted_sentiments2 = list(map(lambda x: labels2[x], position))

In [46]:
accuracy2 = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments2)
print(f"accuracy is {np.round(accuracy2*100, 2)}%")

accuracy is 39.28%


Accuracy achieved in Fourth attempt is ≈ 56.07%

## Fifth Attempt

In [63]:
initial_centroids = centroid_initalizer(data_unclean, 3, cos_distance)
clusters2, centers2 = KMeansClustering(data_unclean, initial_centroids, cos_distance, 3, 500)

Converged early after 3 iterations.


In [58]:
clusters2 = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters2[position[index]].append(data_unclean[index])

cluster_representatives2 = closest_to_Centroid(clusters2, centers2, cos_distance)
cluster_rep_indexes2 = [np.where(data_unclean == rep)[0][0] for rep in cluster_representatives2.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels2 = dict()

for cluster in possible_clusters:
    labels2[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes2])[cluster] 

predicted_sentiments2 = list(map(lambda x: labels2[x], position))

In [65]:
accuracy2 = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments2)
print(f"accuracy is {np.round(accuracy2*100, 2)}%")

accuracy is 56.07%


### Think of new ways to improve

In [61]:
position, centers = sklearnKmeans_plus(data_unclean, 3, 10000)

In [64]:
clusters = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters[position[index]].append(data_unclean[index])

cluster_representatives = closest_to_Centroid(clusters, centers, euclideanDist)
cluster_rep_indexes = [np.where(data_unclean == rep)[0][0] for rep in cluster_representatives.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels = dict()

for cluster in possible_clusters:
    labels[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes])[cluster] 

predicted_sentiments = list(map(lambda x: labels[x], position))

accuracy = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments)
print(f"accuracy is {np.round(accuracy*100, 2)}%")

accuracy is 62.7%


#### PCA Analysis

In [60]:
scaler = StandardScaler()
bert_vectors_std = scaler.fit_transform(Bert_Encoded_Text)  # Shape: (N, 768)

# Choose the number of components
n_components = 50 # Example: Reduce to 50 dimensions
pca = PCA(n_components=n_components)

bert_pca = pca.fit_transform(bert_vectors_std)  # Shape: (N, 50)

In [255]:
initial_centroids = centroid_initalizer(bert_pca, 3, cos_distance)
clusters2, centers2 = KMeansClustering(bert_pca, initial_centroids, 3, 500)

Converged early after 3 iterations.


In [256]:
clusters2 = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters2[position[index]].append(bert_pca[index])

cluster_representatives2 = closest_to_Centroid(clusters2, centers2, cos_distance)
cluster_rep_indexes2 = [np.where(bert_pca == rep)[0][0] for rep in cluster_representatives2.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels2 = dict()

for cluster in possible_clusters:
    labels2[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes2])[cluster] 

predicted_sentiments2 = list(map(lambda x: labels2[x], position))

In [257]:
accuracy2 = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments2)
print(f"accuracy is {np.round(accuracy2*100, 2)}%")

accuracy is 35.64%


Accuracy achieved in Sixth attempt is ≈ 35.64%

### PCA Might Not Be the Best Approach
#### PCA finds linear patterns, but BERT vectors often have nonlinear structure.

## UMAP (Uniform Manifold Approximation and Projection)

### UMAP with K-Means++

In [217]:
import umap.umap_ as um

In [None]:
umap = um.UMAP(n_components=50, random_state=42)
bert_umap = umap.fit_transform(Bert_Encoded_Text)

In [228]:
initial_centroids = centroid_initalizer(bert_umap, 3, cos_distance)
clusters2, centers2 = KMeansClustering(bert_umap, initial_centroids, 3, 500)

Converged early after 3 iterations.


In [230]:
clusters2 = {0:[], 1:[], 2:[]}
N = len(position)

for index in range(N):
    clusters2[position[index]].append(bert_umap[index])

cluster_representatives2 = closest_to_Centroid(clusters2, centers2, cos_distance)
cluster_rep_indexes2 = [np.where(bert_umap == rep)[0][0] for rep in cluster_representatives2.values()]

# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]
labels2 = dict()

for cluster in possible_clusters:
    labels2[cluster] = list(tweets["airline_sentiment"][cluster_rep_indexes2])[cluster] 

predicted_sentiments2 = list(map(lambda x: labels2[x], position))

In [231]:
accuracy2 = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments2)
print(f"accuracy is {np.round(accuracy2*100, 2)}%")

accuracy is 54.55%


Accuracy achieved in Seventh attempt is ≈ 54.55%

##### It seems like BERT Encoding with Cosine-distance K-Means++ still outperforms it

### UMAP with HDBSCAN

In [None]:
import hdbscan
from sklearn.metrics import silhouette_score

In [None]:
# Initialize HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=3, metric='cosine')

# Fit the model
clusterer.fit(Bert_Encoded_Text)

In [44]:
labels = clusterer.labels_
print(f"Cluster Labels: {labels}")

probabilities = clusterer.probabilities_
print(f"Cluster Probabilities: {probabilities}")

n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print(f"Number of clusters: {n_clusters}")

NameError: name 'clusterer' is not defined

### I should keep a table of all the different models with their accuracy and runtimes

In [301]:
## new approach create 3 vectors "negative", "positive", "neutral"

sentiment_as_text = list(map(bert_encode, ["negative bad unhappy", "positive good happy", "happy"]))
basis_vectors = np.array(list(map(lambda x: x[0], sentiment_as_text)))


In [286]:
def make_it_hashable(arr):
    return tuple(arr)

In [None]:
## new approach create 3 vectors "negative", "positive", "neutral"

sentiment_as_text = list(map(bert_encode, ["negative", "positive", "neutral"]))
basis_vectors = np.array(list(map(lambda x: x[0], sentiment_as_text)))


pseudo_clusters = {"negative":[], "positive":[], "neutral":[]}
e1,e2,e3 = basis_vectors
e1,e2,e3 = tuple(e1), tuple(e2), tuple(e3)
basis_vector_labels = {e1: "negative", e2: "positive", e3: "neutral"}

for tweet_vector in data:
    curr_closet_basis_vector = 0
    curr_smallest_cos_dist = 1
    for basis_vector in basis_vectors:
        distance = cos_distance(tweet_vector, basis_vector)
        if (distance < curr_smallest_cos_dist):
            curr_closet_basis_vector = basis_vector

    pseudo_clusters[basis_vector_labels[make_it_hashable(curr_closet_basis_vector)]].append(tweet_vector)




In [306]:


# Example representative sentences for each sentiment
positive_sentences = ["Yay I am happy!", "This makes me pleased.", "Good job airline!"]
negative_sentences = ["This is terrible!", "I hate this.", "Awful experience!"]
neutral_sentences = ["It's okay.", "Not bad, not great.", "It was fine."]

# Get BERT embeddings
positive_vectors = np.array([bert_encode(s)[0] for s in positive_sentences])
negative_vectors = np.array([bert_encode(s)[0] for s in negative_sentences])
neutral_vectors = np.array([bert_encode(s)[0] for s in neutral_sentences])

# Compute a basis for each sentiment by taking the row span
positive_basis = np.linalg.qr(positive_vectors.T)[0]  # QR decomposition to orthogonalize
negative_basis = np.linalg.qr(negative_vectors.T)[0]
neutral_basis = np.linalg.qr(neutral_vectors.T)[0]


In [307]:
def project_onto_basis(vector, basis):
    """Projects vector onto the space spanned by basis."""
    return basis @ (basis.T @ vector)

def classify_tweet(tweet_vector):
    """Assign tweet vector to the closest hyperplane using cosine similarity."""
    proj_positive = project_onto_basis(tweet_vector, positive_basis)
    proj_negative = project_onto_basis(tweet_vector, negative_basis)
    proj_neutral = project_onto_basis(tweet_vector, neutral_basis)

    # Compute cosine similarity with original tweet vector
    cos_sim_positive = np.dot(proj_positive, tweet_vector) / (np.linalg.norm(proj_positive) * np.linalg.norm(tweet_vector))
    cos_sim_negative = np.dot(proj_negative, tweet_vector) / (np.linalg.norm(proj_negative) * np.linalg.norm(tweet_vector))
    cos_sim_neutral = np.dot(proj_neutral, tweet_vector) / (np.linalg.norm(proj_neutral) * np.linalg.norm(tweet_vector))

    similarities = {"positive": cos_sim_positive, "negative": cos_sim_negative, "neutral": cos_sim_neutral}

    return max(similarities, key=similarities.get)  # Assign sentiment with highest similarity


In [308]:
pseudo_clusters = {"positive": [], "negative": [], "neutral": []}

for tweet_vector in data:
    sentiment = classify_tweet(tweet_vector)
    pseudo_clusters[sentiment].append(tweet_vector)


In [313]:
len(pseudo_clusters['neutral'])

515

In [332]:
def classify_tweet_soft(tweet_vector):
    proj_positive = project_onto_basis(tweet_vector, positive_basis)
    proj_negative = project_onto_basis(tweet_vector, negative_basis)
    proj_neutral = project_onto_basis(tweet_vector, neutral_basis)

    cos_sim_positive = np.dot(proj_positive, tweet_vector) / (np.linalg.norm(proj_positive) * np.linalg.norm(tweet_vector))
    cos_sim_negative = np.dot(proj_negative, tweet_vector) / (np.linalg.norm(proj_negative) * np.linalg.norm(tweet_vector))
    cos_sim_neutral = np.dot(proj_neutral, tweet_vector) / (np.linalg.norm(proj_neutral) * np.linalg.norm(tweet_vector))

    scores = np.array([cos_sim_positive, cos_sim_negative, cos_sim_neutral])
    softmax_scores = np.exp(scores) / np.sum(np.exp(scores))  # Softmax for probabilities
    sentiment = np.random.choice(["positive", "negative", "neutral"], p = softmax_scores)

    return sentiment


In [333]:
pseudo_clusters = {"positive": [], "negative": [], "neutral": []}

for tweet_vector in data:
    sentiment = classify_tweet_soft(tweet_vector)
    pseudo_clusters[sentiment].append(tweet_vector)


In [334]:
(len(pseudo_clusters['positive']), len(pseudo_clusters['negative']), len(pseudo_clusters['neutral']))

(5014, 4773, 4852)

In [335]:
(sum(tweets["airline_sentiment"] == 'positive'), sum(tweets["airline_sentiment"] == 'negative'), sum(tweets["airline_sentiment"] == 'neutral'))

(2362, 9178, 3099)