# IT1244 Project

## Import Libraries

In [92]:
import re as re
import heapq as heapq
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MultiLabelBinarizer, Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans

### Import Bert Encoders

In [34]:
import torch as torch
from transformers import BertModel
from transformers import BertTokenizer, BertTokenizerFast

### Testing Bert Encoding

In [None]:

# Load tokenizer and model
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Encode sentences
sentence1 = "This is an example sentence with change negative."
inputs1 = tokenizer(sentence1, return_tensors="pt")

sentence2 = "This is an example sentence with change upset."
inputs2 = tokenizer(sentence2, return_tensors="pt")

sentence3 = "This is an example sentence with change happy."
inputs3 = tokenizer(sentence3, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)
    outputs3 = model(**inputs3)

# Extract [CLS] token embedding (sentence-level representation)
Vector_X = outputs1.last_hidden_state[:, 0, :]
Vector_Y = outputs2.last_hidden_state[:, 0, :]
Vector_Z = outputs3.last_hidden_state[:, 0, :]

In [69]:
float(cosine_similarity(Vector_X, Vector_Y)[0][0])

0.9878990054130554

In [70]:
float(cosine_similarity(Vector_X, Vector_Z)[0][0])

0.982489287853241

In [71]:
def bert_encode(sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = model(**inputs)
    return output.last_hidden_state[:, 0, :]


We can see that the closer concepts are have higher cosine similiarity

## Data Importing

In [24]:
tweets = pd.read_csv("../Data/Raw/Tweets.csv")
tweets.head()

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0,@VirginAmerica What @dhepburn said.
1,positive,0.3486,@VirginAmerica plus you've added commercials t...
2,neutral,0.6837,@VirginAmerica I didn't today... Must mean I n...
3,negative,1.0,@VirginAmerica it's really aggressive to blast...
4,negative,1.0,@VirginAmerica and it's a really big bad thing...


### Code starts here

In [25]:
tweets.shape


(14639, 3)

In [26]:
tweets["text"][1]

"@VirginAmerica plus you've added commercials to the experience... tacky."

In [27]:
def at_filter(text):
    return re.sub(r"@\w+", "", text)

In [28]:
def alpha_filter(text):
    return re.sub(r'[^A-Za-z ]', '', text)

## Data Cleaning

In [29]:
tweets["text"] = tweets["text"].apply(lambda x: at_filter(x))


In [30]:
tweets["text"] = tweets["text"].apply(lambda x: alpha_filter(x))

In [31]:
tweets

Unnamed: 0,airline_sentiment,sentiment_confidence,text
0,neutral,1.0000,What said
1,positive,0.3486,plus youve added commercials to the experienc...
2,neutral,0.6837,I didnt today Must mean I need to take anothe...
3,negative,1.0000,its really aggressive to blast obnoxious ente...
4,negative,1.0000,and its a really big bad thing about it
...,...,...,...
14634,positive,0.3487,thank you we got on a different flight to Chi...
14635,negative,1.0000,leaving over minutes Late Flight No warnings...
14636,neutral,1.0000,Please bring American Airlines to BlackBerry
14637,negative,1.0000,you have my money you change my flight and do...


In [94]:
X_train, X_test, y_train, y_test = train_test_split(tweets["text"], tweets["airline_sentiment"], test_size=0.2, random_state=42)

In [32]:
tweets.to_csv("../Data/Cleaned/CleanTweets.csv")

In [33]:
max(list(map(lambda x: len(x), tweets["text"])))

166

## Data transformation

In [72]:
sentence_vectors = tweets["text"].apply(lambda x: bert_encode(x))

In [85]:
data = np.array(list(map(lambda x: x[0], sentence_vectors)))

In [90]:
data.shape

(14639, 768)

In [93]:
def sklearnKmeans(X, k, m):
    '''
    X: numpy array, shape = [N, D]
    k: int value
    m: int value
    RETURN
        position: numpy array, shape = [N]
        centers: numpy array, shape = [k, D]
    '''
    position, centers = None, None
    ## start your code here
    kmeans = KMeans(n_clusters=k,n_init=1, max_iter=m).fit(X)
    position, centers = kmeans.predict(X), kmeans.cluster_centers_
    ## end
    return position, centers

## Modelling the Data

In [207]:
position, centers = sklearnKmeans(data, 3, 10000)
labels = dict()

In [208]:
centroid_indexes = [np.where(data == center)[0] for center in centers]
centroid_indexes

[array([12427]), array([652]), array([], dtype=int64)]

In [209]:
centroid_indexes = list(filter(lambda x: x.size > 0, centroid_indexes))
centroid_indexes

[array([12427]), array([652])]

In [210]:
# labelling the centroids to test accuracy
possible_labels = ["negative", "neutral", "positive"]
possible_clusters = [0, 1, 2]

# Cluster 1
sentiment_label = tweets["airline_sentiment"][centroid_indexes[0]].to_string().split()[1]
cluster = position[centroid_indexes[0]][0]
labels[int(cluster)] = sentiment_label
possible_labels.remove(sentiment_label)
possible_clusters.remove(cluster)

# Cluster 2
sentiment_label = tweets["airline_sentiment"][centroid_indexes[1]].to_string().split()[1]
cluster = position[centroid_indexes[1]][0]
labels[int(cluster)] = sentiment_label
possible_labels.remove(sentiment_label)
possible_clusters.remove(cluster)

# Cluster 3
labels[possible_clusters[0]] = possible_labels[0]

In [211]:
labels

{1: 'positive', 2: 'negative', 0: 'neutral'}

In [215]:
predicted_sentiments = list(map(lambda x: labels[x], position))

In [225]:
accuracy = np.mean(tweets["airline_sentiment"] ==  predicted_sentiments)
print(f"accuracy is {np.round(accuracy*100, 2)}%")

accuracy is 40.08%
