In [69]:
import pandas as pd
import numpy as np
import re
from re import sub
import multiprocessing
from unidecode import unidecode

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors


from time import time 
from collections import defaultdict

from sklearn.cluster import KMeans

import logging  # Setting up the loggings to monitor gensim
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

#text processing, NLP modules
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import statsmodels.api as sm

plt.rcParams['figure.figsize'] = (15, 10)

In [70]:
#!pip install unidecode

In [71]:
#!pip install gensim

In [72]:
df = pd.read_csv("Data/tweety_test.csv")

In [73]:
df_tweety_full = df

In [74]:
stopwords = nltk.corpus.stopwords.words('english')
#newStopWords = ['moderna', 'covaxin', 'covid19', 'pfizerbiontech', 'vaccine', 'sputnikv', 'oxfordastrazeneca', 'covidvaccine', 'pfizer', 'sinovac', 'sinopharm', 'covid19vaccine', 'astrazeneca', 'covid', 'coronavirus', 'vaccines', 'china', 'russia', 'covishield', 'vaccination', 'vaccinated', 'pfizervaccine', 'eu', 'india', 'covid_19', 'bharatbiotech', 'covidvaccination', 'covid19vaccination', 'coronavaccine', 'mrna', 'johnsonandjohnson', 'getvaccinated', 'biontech', 'narendramodi', 'hongkong', 'pakistan', 'covidvacccine', 'pmmodi', 'modernavaccine', 'ocgn', 'iran', 'nhs', 'coronavirusvaccine', 'ocugen', 'breaking', 'covidvaccines', 'aiims', 'russian', 'pfizercovidvaccine', 'vaccineswork', 'news', 'putin', 'uk', 'who', 'us', 'canada', 'italy', 'covidー19', 'covid19vaccines', 'israel', 'corona', 'hungary', 'zimbabwe', 'pandemic', 'covax', 'oxfordvaccine', 'usa', 'health', 'modi', 'vaccine', 'first', 'covid', 'dose', 'today', '19', 'vaccines', 'amp', 'shot', 'doses']
#stopwords.extend(newStopWords)

In [75]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [76]:
def clean_tweet(x):
    x = str(x)
    x = x.lower()

    #remove twitter handlers
    x = re.sub('@[^\s]+','',x)
    #remove hashtags
    x = re.sub('#[^\s]+','',x)
    #remove URLs
    x = re.sub(r'https?:\/\/[A-Za-z0-9\.\/]+', '', x)
    #remove punctuation
    x = x.replace('[^\w\s]','')
    #remove single characters
    x = re.sub(r'\s+[a-zA-Z0-9]\s+', '', x)
    #substitute multiple spaces with single space
    x = re.sub(r'\s+', ' ', x, flags=re.I)
    #remove linebreaks
    x = re.sub('\n', '', x)
    #remove special characters
    #x = ' '.join(re.findall(r'\w+', x))
    #substitute multiple spaces with single space
    x = re.sub(r'\s+', ' ', x, flags=re.I)
    #stemming
    #x = ' '.join([stemmer.stem(word) for word in x.split() ])
    #lemmatizing
    #x = ' '.join([lemmatizer.lemmatize(word) for word in x.split() ])
    #removing stop words
    x = ' '.join([word for word in x.split() if word not in stopwords])
    #removing numbers
    x = re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x)
    #x = x.split()

    return x

In [77]:
df_tweety_full.text = df_tweety_full.text.apply(lambda x: clean_tweet(x))

In [78]:
sentences = [row for row in df_tweety_full.text]
#phrases = Phrases(sent, min_count=1, progress_per=50000)
#bigram = Phraser(phrases)
#sentences = bigram[sent]

In [79]:
#sentences

In [80]:
w2v_model = Word2Vec(min_count=3,
                     window=3,
                     vector_size=300,
                     sample=1e-5, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=multiprocessing.cpu_count()-1
                    )

start = time()

w2v_model.build_vocab(sentences, progress_per=50000)

print('Time to build vocab: {} mins'.format(round((time() - start) / 60, 2)))

Time to build vocab: 0.0 mins


In [81]:
start = time()

w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=50, report_delay=1)

print('Time to train the model: {} mins'.format(round((time() - start) / 60, 2)))

Time to train the model: 0.01 mins


In [82]:
w2v_model.save("word2vec.model")

In [83]:
df_tweety_exp = df_tweety_full.copy()
#df_tweety_exp['old_text'] = df_tweety_exp.text
#df_tweety_exp.old_text = df_tweety_exp.old_text.str.join(' ')
#df_tweety_exp.text = df_tweety_exp.text.apply(lambda x: ' '.join(bigram[x]))
#df_tweety_exp.label = df_tweety_exp.label.astype('int8')

In [84]:
#df_tweety_exp

In [85]:
df_tweety_exp[['text', 'label']].to_csv('Data/clean_tweety_test.csv', index=False)

# Clustering

In [94]:
word_vectors = Word2Vec.load("word2vec.model").wv

In [89]:
# Using PCA for Dimensionality Reduction
# And the StandardScaler to scale the data 
#scaler = StandardScaler()
#X_scaled = scaler.fit_transform(word_vectors.vectors.astype('double'))
#pca_ = PCA(0.99, random_state=0)
#X_pca=pca_.fit_transform(X_scaled)

In [63]:
#X_pca

In [95]:
model = KMeans(n_clusters=3, max_iter=1000, random_state=True, n_init=50).fit(X=word_vectors.vectors.astype('double'))

In [97]:
word_vectors.similar_by_vector(model.cluster_centers_[1], topn=50, restrict_vocab=None)

[('z', 0.999841570854187),
 ('3', 0.9998366236686707),
 ('9', 0.9998340010643005),
 ('_', 0.9998263716697693),
 ('1', 0.9997838735580444),
 ('2', 0.9997785091400146),
 ('b', 0.9997747540473938),
 ('5', 0.9997707605361938),
 ('s', 0.9997676610946655),
 ('6', 0.9997633695602417),
 ('f', 0.9997622966766357),
 ('x', 0.9997585415840149),
 ('0', 0.9997578263282776),
 ('j', 0.9997577667236328),
 ('k', 0.9997559785842896),
 ('d', 0.9997557401657104),
 ('c', 0.9997556805610657),
 ('r', 0.9997546076774597),
 ('8', 0.999748706817627),
 ('o', 0.9997475147247314),
 ('h', 0.9997465014457703),
 ('t', 0.9997363090515137),
 (' ', 0.9997357130050659),
 ('a', 0.9997343420982361),
 ('g', 0.999725341796875),
 ('m', 0.9997137784957886),
 ('y', 0.9997133016586304),
 ('i', 0.9997097849845886),
 ('q', 0.9997084736824036),
 ('w', 0.9997076392173767),
 ('l', 0.9997071027755737),
 ('n', 0.9996988773345947),
 ('p', 0.9996975660324097),
 ('v', 0.99968022108078),
 ('u', 0.9996739029884338),
 ('e', 0.9996440410614014

In [31]:
positive_cluster_index = 2
negative_cluster_index = 0
neutral_cluster_index = 1
positive_cluster_center = model.cluster_centers_[positive_cluster_index]
negative_cluster_center = model.cluster_centers_[negative_cluster_index]
neutral_cluster_center = model.cluster_centers_[neutral_cluster_index]

In [32]:
words = pd.DataFrame(word_vectors.key_to_index.keys())
words.columns = ['words']
words['vectors'] = words.words.apply(lambda x: word_vectors[f'{x}'])
words['cluster'] = words.vectors.apply(lambda x: model.predict([np.array(x)]))
words.cluster = words.cluster.apply(lambda x: x[0])

In [33]:
words['cluster_value'] = [(1 if i==positive_cluster_index else (-1 if i==negative_cluster_index else 0)) for i in words.cluster]
words['closeness_score'] = words.apply(lambda x: 1/(model.transform([x.vectors]).min()), axis=1)
words['sentiment_coeff'] = words.closeness_score * words.cluster_value

In [34]:
words.head(20)

Unnamed: 0,words,vectors,cluster,cluster_value,closeness_score,sentiment_coeff
0,,"[0.0572564, 0.10393967, -0.012398504, -0.03266...",0,-1,28.016052,-28.016052
1,e,"[0.054352034, 0.10804684, -0.015289383, -0.036...",0,-1,27.850462,-27.850462
2,a,"[0.058876187, 0.10179549, -0.014092889, -0.034...",0,-1,29.460437,-29.460437
3,t,"[0.055352136, 0.10312706, -0.017193167, -0.033...",0,-1,30.403961,-30.403961
4,i,"[0.05614004, 0.10228333, -0.011227658, -0.0379...",0,-1,28.05864,-28.05864
5,o,"[0.05442318, 0.104929745, -0.014086159, -0.033...",0,-1,29.334049,-29.334049
6,r,"[0.060428027, 0.101003505, -0.01687432, -0.035...",0,-1,30.348205,-30.348205
7,n,"[0.05708501, 0.10665889, -0.011507071, -0.0320...",0,-1,29.370976,-29.370976
8,s,"[0.05470899, 0.10282731, -0.013131223, -0.0377...",0,-1,24.877256,-24.877256
9,l,"[0.057748895, 0.10562736, -0.015056856, -0.034...",0,-1,28.745074,-28.745074


In [35]:
words[['words', 'sentiment_coeff']].to_csv('sentiment_dictionary.csv', index=False)

In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from IPython.display import display

In [37]:
final_file = pd.read_csv('Data/clean_tweety_test.csv')

In [38]:
final_file

Unnamed: 0,text,label
0,get,0
1,author unit state last night,0
2,immigr muslim coupl find,0
3,ladi gentleman expert everi household,0
4,fact sheet healthcar provid administ,0
...,...,...
995,side effect,0
996,seen trend twitter let give scientistsbig cred...,1
997,got clarif,-1
998,approv use,0


In [39]:
sentiment_map = pd.read_csv('sentiment_dictionary.csv')
sentiment_dict = dict(zip(sentiment_map.words.values, sentiment_map.sentiment_coeff.values))

In [40]:
#sentiment_dict

In [41]:
file_weighting = final_file.copy()
tfidf = TfidfVectorizer(tokenizer=lambda y: y.split(), norm=None)
tfidf.fit(file_weighting.text.astype('U'))
features = pd.Series(tfidf.get_feature_names())
transformed = tfidf.transform(file_weighting.text.astype('U'))



In [42]:
def create_tfidf_dictionary(x, transformed_file, features):
    '''
    create dictionary for each input sentence x, where each word has assigned its tfidf score
    
    inspired  by function from this wonderful article: 
    https://medium.com/analytics-vidhya/automated-keyword-extraction-from-articles-using-nlp-bfd864f41b34
    
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer

    '''
    vector_coo = transformed_file[x.name].tocoo()
    vector_coo.col = features.iloc[vector_coo.col].values
    dict_from_coo = dict(zip(vector_coo.col, vector_coo.data))
    return dict_from_coo

def replace_tfidf_words(x, transformed_file, features):
    '''
    replacing each word with it's calculated tfidf dictionary with scores of each word
    x - row of dataframe, containing sentences, and their indexes,
    transformed_file - all sentences transformed with TfidfVectorizer
    features - names of all words in corpus used in TfidfVectorizer
    '''
    dictionary = create_tfidf_dictionary(x, transformed_file, features)   
    return list(map(lambda y:dictionary[f'{y}'], x.text.split()))

In [43]:
%%time
replaced_tfidf_scores = file_weighting.apply(lambda x: replace_tfidf_words(x.text, transformed, features), axis=1)#this step takes around 3-4 minutes minutes to calculate

AttributeError: 'str' object has no attribute 'name'

In [44]:
replaced_closeness_scores = file_weighting.text.apply(lambda x: list(map(lambda y: replace_sentiment_words(y, sentiment_dict), x.split())))

NameError: name 'replace_sentiment_words' is not defined

In [1175]:
replacement_df = pd.DataFrame(data=[replaced_closeness_scores, replaced_tfidf_scores, file_weighting.text, file_weighting.label]).T
replacement_df.columns = ['sentiment_coeff', 'tfidf_scores', 'sentence', 'sentiment']
replacement_df['sentiment_rate'] = replacement_df.apply(lambda x: np.array(x.loc['sentiment_coeff']) @ np.array(x.loc['tfidf_scores']), axis=1)
replacement_df['prediction'] = (replacement_df.sentiment_rate>0).astype('int8')
replacement_df['sentiment'] = [(1 if i==1 else (-1 if i==-1 else 0)) for i in replacement_df.sentiment]
words['cluster_value'] = [(1 if i==1 else (-1 if i==-1 else 0)) for i in words.cluster]

replacement_df#.sentiment.value_counts()/len(replacement_df)

Unnamed: 0,sentiment_coeff,tfidf_scores,sentence,sentiment,sentiment_rate,prediction
0,"[0, 0]","[3.781620394270129, 5.711530201979001]",vaccin get,0,0.0,0
1,"[0, 0, 0, 0, 0]","[6.810142490647111, 6.810142490647111, 6.81014...",author unit state last_night,0,0.0,0
2,"[0, 0, 0, 0, 0]","[6.810142490647111, 7.215607598755275, 7.21560...",immigr muslim coupl find vaccin,0,0.0,0
3,"[0, 0, 0, 0, 0]","[7.215607598755275, 7.215607598755275, 6.81014...",ladi gentleman expert everi household,0,0.0,0
4,"[0, 0, 0, 0, 0, 0]","[6.810142490647111, 6.810142490647111, 7.21560...",fact_sheet healthcar provid administ vaccin,0,0.0,0
...,...,...,...,...,...,...
995,"[0, 0]","[6.52246041819533, 4.91302250576123]",side_effect,0,0.0,0
996,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[7.215607598755275, 7.215607598755275, 7.21560...",seen trend twitter let give scientistsbig cred...,1,0.0,0
997,"[0, 0]","[4.058607177605162, 7.215607598755275]",got clarif,-1,0.0,0
998,"[0, 0, 0]","[6.29931686688112, 2.383301840183437, 5.962844...",approv vaccin use,0,0.0,0


In [1076]:
replacement_df.sentiment_rate.unique()

array([0.])

In [807]:
y_pred = replacement_df.prediction
y_test = replacement_df.sentiment

In [808]:
conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
print('Confusion Matrix')
display(conf_matrix)

test_scores = accuracy_score(y_test,y_pred), precision_score(y_test, y_pred), recall_score(y_test, y_pred), f1_score(y_test, y_pred)

print('\n \n Scores')
scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix


Unnamed: 0,0,1,2
0,0,0,118
1,0,2,592
2,0,0,288


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [764]:
conf_matrix = pd.DataFrame(confusion_matrix(replacement_df.sentiment, replacement_df.prediction))
print('Confusion Matrix\n')
print(conf_matrix)

#importing accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('\nAccuracy: {:.2f}\n'.format(accuracy_score(y_test, y_pred)))

print('Micro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='micro')))
print('Micro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='micro')))
print('Micro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='micro')))

print('Macro Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='macro')))
print('Macro Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='macro')))
print('Macro F1-score: {:.2f}\n'.format(f1_score(y_test, y_pred, average='macro')))

print('Weighted Precision: {:.2f}'.format(precision_score(y_test, y_pred, average='weighted')))
print('Weighted Recall: {:.2f}'.format(recall_score(y_test, y_pred, average='weighted')))
print('Weighted F1-score: {:.2f}'.format(f1_score(y_test, y_pred, average='weighted')))

from sklearn.metrics import classification_report
print('\nClassification Report\n')
print(classification_report(y_test, y_pred, target_names=['Class 1', 'Class 2', 'Class 3']))

Confusion Matrix

   0    1  2
0  0  118  0
1  0  594  0
2  0  288  0

Accuracy: 0.59

Micro Precision: 0.59
Micro Recall: 0.59
Micro F1-score: 0.59

Macro Precision: 0.20
Macro Recall: 0.33
Macro F1-score: 0.25

Weighted Precision: 0.35
Weighted Recall: 0.59
Weighted F1-score: 0.44

Classification Report

              precision    recall  f1-score   support

     Class 1       0.00      0.00      0.00       118
     Class 2       0.59      1.00      0.75       594
     Class 3       0.00      0.00      0.00       288

    accuracy                           0.59      1000
   macro avg       0.20      0.33      0.25      1000
weighted avg       0.35      0.59      0.44      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
