## Load libraries and data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import regex as re

!pip install gensim
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics.pairwise import cosine_similarity




In [3]:

# Location of the pre-trained GloVe vectors (glove.6B.300d) on the mounted Drive.
glove_file = '/content/drive/MyDrive/TEXT MINING/GloVe/glove.6B.300d.txt'

# GloVe text files lack the "<vocab_size> <dim>" header line of the word2vec format.
# gensim >= 4.0 reads them directly with no_header=True, so the deprecated
# glove2word2vec conversion (which also wrote a full temporary copy of the file)
# is no longer needed.
glove_model = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

  glove2word2vec(glove_file, word2vec_temp_file)


In [4]:
# Load collection and emotion data

df = pd.read_csv('/content/drive/MyDrive/TEXT MINING/Data/processed_dataset.csv')

emotion_df = pd.read_csv('/content/drive/MyDrive/TEXT MINING/Data/emotional_vectors.csv')

In [None]:
df.head()

Unnamed: 0,text,label,mapped_emotion,mapped_sentiment,processed_text
0,Man I love reddit.,love,joy,positive,man love reddit
1,Right? Considering it’s such an important docu...,gratitude,joy,positive,right considering is important document should...
2,that's adorable asf,amusement,joy,positive,is adorable asf
3,"""Seeeee! We have one of them coloureds too!""",excitement,joy,positive,see have one coloureds too
4,"Lots, play store or apple store vpn. Nord is good",admiration,joy,positive,lot play store apple store vpn nord is good


In [13]:
df['document_tokens'] = df['processed_text'].apply(lambda x: x.split())
df.head()

Unnamed: 0,text,label,mapped_emotion,mapped_sentiment,processed_text,document_tokens
0,Man I love reddit.,love,joy,positive,man love reddit,"[man, love, reddit]"
1,Right? Considering it’s such an important docu...,gratitude,joy,positive,right considering is important document should...,"[right, considering, is, important, document, ..."
2,that's adorable asf,amusement,joy,positive,is adorable asf,"[is, adorable, asf]"
3,"""Seeeee! We have one of them coloureds too!""",excitement,joy,positive,see have one coloureds too,"[see, have, one, coloureds, too]"
4,"Lots, play store or apple store vpn. Nord is good",admiration,joy,positive,lot play store apple store vpn nord is good,"[lot, play, store, apple, store, vpn, nord, is..."


In [14]:
emotion_set = set(emotion_df['all_words'])
len(emotion_set)

9446

In [None]:
X, y = df[['processed_text', 'document_tokens']], df['mapped_emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

## Train Set processing

In [None]:
X_train

Unnamed: 0,processed_text,document_tokens
12019,thanks not declaring karma whory way like so m...,"[thanks, not, declaring, karma, whory, way, li..."
13058,guess respect boot ground cop are not one comi...,"[guess, respect, boot, ground, cop, are, not, ..."
2130,what will have dinner,"[what, will, have, dinner]"
3290,like scared actually,"[like, scared, actually]"
9111,surprise like get so moody when am hungry laug...,"[surprise, like, get, so, moody, when, am, hun..."
...,...,...
9037,funny how name only lie when you want too amazing,"[funny, how, name, only, lie, when, you, want,..."
13107,haha love happy will see if is interested,"[haha, love, happy, will, see, if, is, interes..."
10568,awesome pic,"[awesome, pic]"
17661,name least ha pretty face great skin name look...,"[name, least, ha, pretty, face, great, skin, n..."


In [5]:
# Extract all the tokens in the given collection and create the set of unique tokens in the collection,
# using as input the column of document tokens

def extract_tokens(list_of_documents):
  """Collect the distinct tokens that occur anywhere in a collection.

  Parameters
  ----------
  list_of_documents : iterable of iterables
      One token sequence per document (e.g. the `document_tokens` column).

  Returns
  -------
  set
      Every token that appears in at least one document.
  """
  vocabulary = set()

  # Fold each document into the running vocabulary; duplicates collapse automatically.
  for document in list_of_documents:
    vocabulary.update(document)

  return vocabulary

In [None]:
train_word_set = extract_tokens(X_train['document_tokens'])
len(train_word_set)

11694

In [None]:
train_emo_set = emotion_set.intersection(train_word_set)
len(train_emo_set)

8514

In [6]:
# Extract word embeddings of the GloVe model, given the set of unique words and the embedding dimension.
# If the word is not present in the glove model, fill with a zero vector

def embedding_dict(words_set, glove_model, embedding_dim):
  """Look up an embedding for every word, using a zero vector for unknown words.

  Parameters
  ----------
  words_set : iterable of str
      Words to look up.
  glove_model : mapping-like
      Object supporting `word in glove_model` and `glove_model[word]`.
  embedding_dim : int
      Length of the zero vector assigned to words missing from `glove_model`.

  Returns
  -------
  dict
      word -> embedding vector, one entry per word in `words_set`.
  """
  return {
      token: glove_model[token] if token in glove_model else np.zeros(embedding_dim)
      for token in words_set
  }

In [None]:
train_temp_dict = embedding_dict(train_emo_set, glove_model, 300)
len(train_temp_dict)

8514

In [None]:
train_embedding_matrix = np.array(list(train_temp_dict.values()))
train_embedding_matrix.shape

(8514, 300)

In [None]:
train_sim_matrix = cosine_similarity(train_embedding_matrix)
train_sim_matrix.shape

(8514, 8514)

In [7]:
# Construct a dictionary of the top M most similar words for each word, according to the cosine similarity matrix

def m_sim_words(word_list, similarity_matrix, M):
  """Map every word to its M most similar *other* words.

  Parameters
  ----------
  word_list : sequence of str
      Words, in the same order as the rows and columns of `similarity_matrix`.
  similarity_matrix : np.ndarray
      Square matrix of pairwise similarities; entry (i, j) scores word i
      against word j, higher meaning more similar.
  M : int
      Number of neighbours to keep per word.

  Returns
  -------
  dict
      word -> list of up to M neighbour words, most similar first.
      The word itself is never part of its own neighbour list.
  """
  # Rank one extra candidate per row so that M neighbours are still left
  # once the word itself has been removed.
  ranked = np.argsort(-similarity_matrix, axis = 1)[:, :M + 1]

  sim_words = dict()

  for i, word in enumerate(word_list):
    # Remove the word by index rather than assuming it is ranked first:
    # a word with an all-zero vector has similarity 0 with everything
    # (itself included), so it is not guaranteed to lead its own ranking.
    sim_words[word] = [word_list[j] for j in ranked[i] if j != i][:M]

  return sim_words

In [None]:
train_sim_words = m_sim_words(list(train_temp_dict.keys()), train_sim_matrix, 5)

In [8]:
# Construct a dictionary in which every word embedding is the average of the word's own GloVe vector and the mean vector of its top M similar words

def semantically_similar_features_vectors(m_sim_words, embedding_dict):
  """Blend each word's embedding with the mean embedding of its neighbours.

  Parameters
  ----------
  m_sim_words : dict
      word -> list of neighbour words; every neighbour must be a key of
      `embedding_dict`.
  embedding_dict : dict
      word -> embedding vector; every key must also be a key of `m_sim_words`.

  Returns
  -------
  dict
      word -> element-wise average of the word's own vector and the mean
      vector of its neighbours, with one entry per key of `embedding_dict`.
  """
  # Stage 1: centroid of the neighbour embeddings for every word.
  neighbour_centroids = {
      token: np.mean([embedding_dict[neighbour] for neighbour in neighbours], axis = 0)
      for token, neighbours in m_sim_words.items()
  }

  # Stage 2: equal-weight average of the word's own vector and that centroid.
  return {
      token: np.mean([own_vector, neighbour_centroids[token]], axis = 0)
      for token, own_vector in embedding_dict.items()
  }

In [None]:
train_SSF_dict = semantically_similar_features_vectors(train_sim_words, train_temp_dict)

In [9]:
# Create a dictionary with all words (emotion + simple), taking as input the unique words of a given collection and the corresponding emotion dictionary

def complete_dict(words, emotion_dict, model, embedding_dim):
  """Assign a vector to every word, trying three sources in priority order.

  Parameters
  ----------
  words : iterable of str
      Words that need a vector.
  emotion_dict : mapping
      word -> vector; consulted first.
  model : mapping-like
      Object supporting `word in model` and `model[word]`; consulted second.
  embedding_dim : int
      Length of the zero vector used when neither source knows the word.

  Returns
  -------
  dict
      word -> vector, one entry per word in `words`.
  """
  def resolve(token):
    # Priority: emotion-specific vector, then the generic model, then zeros.
    if token in emotion_dict:
      return emotion_dict[token]
    if token in model:
      return model[token]
    return np.zeros(embedding_dim)

  return {token: resolve(token) for token in words}

In [None]:
train_complete_dict = complete_dict(train_word_set, train_SSF_dict, glove_model, 300)
len(train_complete_dict)

11694

## Test Set Processing

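To avoid data contamination between the test set and the train set, the previous steps are adapted for the test set: the similar words of each test word are searched only among the train-set words, and their train-set embeddings are used.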

In [None]:
X_test

Unnamed: 0,processed_text,document_tokens
9745,am so glad you were not physically hurt,"[am, so, glad, you, were, not, physically, hurt]"
9532,is great editing know took lot work time aweso...,"[is, great, editing, know, took, lot, work, ti..."
12390,bad news dude you are almost year past,"[bad, news, dude, you, are, almost, year, past]"
8727,lox is optional how dare you put name your fla...,"[lox, is, optional, how, dare, you, put, name,..."
17227,can you show example is more imagined victim h...,"[can, you, show, example, is, more, imagined, ..."
...,...,...
3442,oh alright fair enough haha if sabre not pan o...,"[oh, alright, fair, enough, haha, if, sabre, n..."
9989,thanks very cool wa marriage gift so explains ...,"[thanks, very, cool, wa, marriage, gift, so, e..."
16393,oh damn am clemson student swore had flair,"[oh, damn, am, clemson, student, swore, had, f..."
4036,ahh name you have done again,"[ahh, name, you, have, done, again]"


In [None]:
test_word_set = extract_tokens(X_test['document_tokens'])
len(test_word_set)

5305

In [None]:
test_emo_set = emotion_set.intersection(test_word_set)
len(test_emo_set)

4156

In [None]:
test_temp_dict = embedding_dict(test_emo_set, glove_model, 300)
len(test_temp_dict)

4156

In [None]:
test_embedding_matrix = np.array(list(test_temp_dict.values()))
test_embedding_matrix.shape

(4156, 300)

In [None]:
# test similarity matrix, different from the train one:

test_sim_matrix = cosine_similarity(test_embedding_matrix, train_embedding_matrix)
test_sim_matrix.shape

(4156, 8514)

In [None]:
sim_indices = np.argsort(-test_sim_matrix, axis = 1)[:, 0:5]
len(sim_indices)

4156

In [None]:
len(list(test_temp_dict.keys()))

4156

In [None]:
sim_indices[:10]

array([[6329,  221,  508, 5509, 8440],
       [2141, 1610, 6975,  591, 6147],
       [2142, 2654, 4021, 5036, 2221],
       [   1, 8010, 8083, 3269, 1309],
       [2529, 7589, 4661, 2830, 4021],
       [4152,  599,   41, 1377, 8398],
       [4153, 7063, 6081, 4726, 6379],
       [   3, 5381, 2058, 5533, 4955],
       [5783, 7185, 2291, 3823, 2042],
       [   5, 3197, 4691, 5835, 7274]])

In [10]:
# Construct a link between words in the test set, and the top M similar in the train set

def test_m_sim_words(test_word_list, train_word_list, similarity_matrix, M):

  sim_words = dict()
  sim_indices = np.argsort(-similarity_matrix, axis = 1)[:, 0:M]

  for i, word in enumerate(test_word_list):

    sim_words[word] = [train_word_list[j] for j in sim_indices[i]]

  return sim_words

In [None]:
# in this case we have to pass as input words taken from the train dictionary, because we want to avoid data contamination between the words inside the test

test_sim_words = test_m_sim_words(list(test_temp_dict.keys()), list(train_temp_dict.keys()), test_sim_matrix, 5)
len(test_sim_words)

4156

In [11]:
# Construct word embeddings taking the top M similar train_set words, averaged with the corresponding word in the test_set

def test_sem_sim_features_vectors(m_sim_words, train_embedding_dict, test_embedding_dict):

  mean_of_m_vectors = dict()

  for word in m_sim_words:
    mean_of_m_vectors[word] = np.mean([train_embedding_dict[w] for w in m_sim_words[word]], axis = 0)

  fe_dict = dict()

  for word in test_embedding_dict:
    fe_dict[word] = np.mean((test_embedding_dict[word], mean_of_m_vectors[word]), axis = 0)

  return fe_dict

In [None]:
#also in this case we want to use the embeddings taken from the train dictionary

test_SSF_dict = test_sem_sim_features_vectors(test_sim_words, train_temp_dict, test_temp_dict)
len(test_SSF_dict)

4156

In [None]:
test_complete_dict = complete_dict(test_word_set, test_SSF_dict, glove_model, 300)
len(test_complete_dict)

5305

## Save Data for Classification

In [None]:
# `columns` expects a list of labels. The previous dict only worked because
# iterating a dict yields its keys; its values were silently ignored.
# The number of labels follows the neighbour lists instead of a hard-coded 5.
test_n_neighbours = max((len(neighbours) for neighbours in test_sim_words.values()), default=0)
test_sim_words_df = pd.DataFrame.from_dict(
    test_sim_words,
    orient='index',
    columns=[f'top {rank}' for rank in range(1, test_n_neighbours + 1)],
)
test_sim_words_df

Unnamed: 0,top 1,top 2,top 3,top 4,top 5
familiar,familiar,seem,describe,look,vaguely
voice,voice,singing,tone,sound,message
pill,pill,medication,prescription,contraception,swallow
baseless,baseless,unfounded,untrue,accusation,allegation
pharmacy,grocery,nursing,dental,medicine,prescription
...,...,...,...,...,...
truck,truck,car,vehicle,pickup,tractor
third,third,second,fourth,sixth,first
ten,ten,eleven,fifteen,twelve,five
million,million,billion,worth,total,year


In [None]:
# `columns` expects a list of labels. The previous dict only worked because
# iterating a dict yields its keys; its values were silently ignored.
# The number of labels follows the neighbour lists instead of a hard-coded 5.
train_n_neighbours = max((len(neighbours) for neighbours in train_sim_words.values()), default=0)
train_sim_words_df = pd.DataFrame.from_dict(
    train_sim_words,
    orient='index',
    columns=[f'top {rank}' for rank in range(1, train_n_neighbours + 1)],
)
train_sim_words_df

Unnamed: 0,top 1,top 2,top 3,top 4,top 5
honouring,honour,honored,honor,homage,remembering
baseless,unfounded,untrue,accusation,allegation,absurd
raptor,hornet,raptors,iguana,fighter,falcon
bear,lion,deer,wolf,dog,bearing
creamer,stroke,hole,pebble,champion,hubby
...,...,...,...,...,...
gladly,willingly,happily,accept,accepts,refuse
grim,bleak,reminder,depressing,dire,horrifying
draw,drawing,drawn,match,tie,win
million,billion,worth,total,year,compared


In [None]:
train_complete_df = pd.DataFrame.from_dict(train_complete_dict, orient='index')
train_complete_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
louisville,-0.373960,0.340770,-0.021993,0.407610,-0.470630,-0.067766,0.282410,-0.147510,0.288030,1.083000,...,0.651450,-0.381400,0.449990,0.573600,0.192920,-0.208090,0.391490,-0.293980,-0.163820,0.176160
honouring,0.025436,0.122731,-0.043138,0.044056,-0.321270,-0.149463,-0.549452,0.011406,-0.184334,0.267124,...,0.435608,-0.255318,0.170106,-0.172523,-0.170877,-0.140597,0.102598,0.581545,-0.058473,-0.214355
could,-0.256970,0.412040,-0.582720,-0.049361,0.019696,0.089635,0.399440,0.151160,0.180850,-2.136900,...,-0.142370,-0.195640,0.531510,0.006118,0.214170,-0.216500,0.114230,-0.709880,-0.201640,0.160570
baseless,0.675860,-0.055196,0.156391,0.149760,-0.054690,-0.628529,0.216436,0.001698,0.423069,-1.115823,...,0.050308,0.010147,0.450539,0.669224,-0.109531,-0.251300,0.120260,0.142289,-0.295863,0.088690
neovagina,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
grim,0.149732,-0.243040,0.515854,-0.100794,0.102823,0.288032,-0.304268,0.052088,-0.016341,-0.422969,...,0.103142,0.160995,0.202280,-0.036750,-0.447008,-0.050512,-0.003745,0.495457,-0.180325,0.230926
draw,0.273651,0.615417,-0.153476,-0.141123,0.072881,0.265140,-0.068609,0.039188,0.134277,-0.898858,...,-0.126936,0.331702,-0.266953,0.409664,0.530027,0.294174,0.380161,-0.353255,0.387008,0.002380
hotss,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
million,-0.620916,0.209009,-0.054027,0.326218,-0.225628,0.159345,0.065859,-0.058412,-0.084533,-1.221570,...,-0.325567,-0.277738,-0.006217,0.461534,0.650152,0.325898,-0.074898,-0.207049,-0.128028,-0.330199


In [None]:
test_complete_df = pd.DataFrame.from_dict(test_complete_dict, orient='index')
test_complete_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
could,-0.256970,0.412040,-0.582720,-0.049361,0.019696,0.089635,0.399440,0.151160,0.180850,-2.136900,...,-0.142370,-0.195640,0.531510,0.006118,0.214170,-0.216500,0.114230,-0.709880,-0.201640,0.160570
baseless,0.692551,-0.079869,0.162431,0.117978,-0.075649,-0.677570,0.206940,0.053295,0.442053,-1.133800,...,0.057553,0.022625,0.498823,0.701873,-0.185632,-0.174580,0.154580,0.124259,-0.315423,0.073930
bear,0.068850,0.097631,-0.288812,0.020543,0.323200,0.115069,-0.085692,0.850058,0.251678,-0.603147,...,-0.225936,-0.076832,0.105551,-0.335786,-0.403175,-0.171885,0.267568,0.463183,0.423455,0.034121
gross,-0.419633,0.477468,-0.410773,0.152195,-0.191657,0.197601,-0.134965,0.265710,-0.172300,-1.368160,...,0.117861,0.175750,-0.280109,0.038745,0.340979,0.559287,-0.300938,0.097927,0.105388,-0.246738
ritual,-0.029153,-0.025894,0.048573,-0.120038,0.400209,-0.336308,0.110570,0.577693,0.075454,-0.376871,...,0.116911,0.264364,-0.302693,0.574857,0.125676,-0.616527,-0.034374,0.252134,-0.255989,-0.283671
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
lost,0.189286,0.106654,-0.082377,0.232424,0.222617,-0.240532,0.017649,0.379905,0.006253,-0.823240,...,-0.008411,0.030913,-0.003890,-0.372891,0.354046,0.487082,-0.136673,-0.653571,0.058191,-0.154812
grim,0.145040,-0.236044,0.570572,-0.092211,0.109740,0.269039,-0.347720,0.069129,-0.031543,-0.464419,...,0.100151,0.157207,0.225876,-0.016527,-0.527188,0.029685,-0.020540,0.510886,-0.207847,0.241449
assigning,-0.119046,0.325337,0.391735,0.290078,-0.342803,0.044961,0.023723,0.207108,-0.008860,-0.513306,...,-0.035586,-0.419185,0.008715,-0.200635,0.134425,0.092395,-0.192952,0.185992,-0.060519,-0.270277
draw,0.294544,0.604461,-0.133480,-0.133282,0.109728,0.297985,-0.082762,0.031493,0.154846,-0.917424,...,-0.109082,0.354473,-0.270102,0.458183,0.564186,0.310479,0.418074,-0.320548,0.446479,-0.005420


In [None]:
train_sim_words_df.to_csv('/content/drive/MyDrive/TEXT MINING/Data/300_train_top_5_similarities.csv', index=True)

test_sim_words_df.to_csv('/content/drive/MyDrive/TEXT MINING/Data/300_test_top_5_similarities.csv', index=True)

In [None]:
train_complete_df.to_csv('/content/drive/MyDrive/TEXT MINING/Data/300_train_complete_embeddings.csv', index=True)

test_complete_df.to_csv('/content/drive/MyDrive/TEXT MINING/Data/300_test_complete_embeddings.csv', index=True)

# Clustering

In [15]:
# Total unique words inside all the collection

cluster_word_set = extract_tokens(df['document_tokens'])
len(cluster_word_set)

13161

In [16]:
cluster_emo_set = emotion_set.intersection(cluster_word_set)
len(cluster_emo_set)

9446

In [17]:
cluster_temp_dict = embedding_dict(cluster_emo_set, glove_model, 300)
len(cluster_temp_dict)

9446

In [18]:
cluster_embedding_matrix = np.array(list(cluster_temp_dict.values()))
cluster_embedding_matrix.shape

(9446, 300)

In [19]:
cluster_sim_matrix = cosine_similarity(cluster_embedding_matrix)
cluster_sim_matrix.shape

(9446, 9446)

In [20]:
cluster_sim_words = m_sim_words(list(cluster_temp_dict.keys()), cluster_sim_matrix, 5)

In [21]:
len(cluster_sim_words)

9446

In [None]:
cluster_SSF_dict = semantically_similar_features_vectors(cluster_sim_words, cluster_temp_dict)

cluster_complete_dict = complete_dict(cluster_word_set, cluster_SSF_dict, glove_model, 300)
len(cluster_complete_dict)

13161

### Save data for Clustering

In [None]:
cluster_complete_df = pd.DataFrame.from_dict(cluster_complete_dict, orient='index')
cluster_complete_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
louisville,-0.373960,0.340770,-0.021993,0.407610,-0.470630,-0.067766,0.282410,-0.147510,0.288030,1.083000,...,0.651450,-0.381400,0.449990,0.573600,0.192920,-0.208090,0.391490,-0.293980,-0.163820,0.176160
honouring,0.025436,0.122731,-0.043138,0.044056,-0.321270,-0.149463,-0.549452,0.011406,-0.184334,0.267124,...,0.435608,-0.255318,0.170106,-0.172523,-0.170877,-0.140597,0.102598,0.581545,-0.058473,-0.214355
could,-0.256970,0.412040,-0.582720,-0.049361,0.019696,0.089635,0.399440,0.151160,0.180850,-2.136900,...,-0.142370,-0.195640,0.531510,0.006118,0.214170,-0.216500,0.114230,-0.709880,-0.201640,0.160570
baseless,0.675860,-0.055196,0.156391,0.149760,-0.054690,-0.628529,0.216436,0.001698,0.423069,-1.115823,...,0.050308,0.010147,0.450539,0.669224,-0.109531,-0.251300,0.120260,0.142289,-0.295863,0.088690
neovagina,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
assigning,-0.119046,0.325337,0.391735,0.290078,-0.342803,0.044961,0.023723,0.207108,-0.008860,-0.513306,...,-0.035586,-0.419185,0.008715,-0.200635,0.134425,0.092395,-0.192952,0.185992,-0.060519,-0.270277
draw,0.273651,0.615417,-0.153476,-0.141123,0.072881,0.265140,-0.068609,0.039188,0.134277,-0.898858,...,-0.126936,0.331702,-0.266953,0.409664,0.530027,0.294174,0.380161,-0.353255,0.387008,0.002380
hotss,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
million,-0.620916,0.209009,-0.054027,0.326218,-0.225628,0.159345,0.065859,-0.058412,-0.084533,-1.221570,...,-0.325567,-0.277738,-0.006217,0.461534,0.650152,0.325898,-0.074898,-0.207049,-0.128028,-0.330199


In [22]:
# `columns` expects a list of labels. The previous dict only worked because
# iterating a dict yields its keys; its values were silently ignored.
# The number of labels follows the neighbour lists instead of a hard-coded 5.
cluster_n_neighbours = max((len(neighbours) for neighbours in cluster_sim_words.values()), default=0)
cluster_sim_words_df = pd.DataFrame.from_dict(
    cluster_sim_words,
    orient='index',
    columns=[f'top {rank}' for rank in range(1, cluster_n_neighbours + 1)],
)
cluster_sim_words_df

Unnamed: 0,top 1,top 2,top 3,top 4,top 5
halftime,intermission,lead,scoring,minute,scored
workplace,harassment,discrimination,bullying,employee,gender
hippo,alligator,lizard,ape,zebra,coyote
determine,determined,whether,determines,evaluate,decide
shred,slightest,discard,whatsoever,carcass,leftover
...,...,...,...,...,...
opposed,oppose,supported,strongly,favor,arguing
coast,ocean,southern,caribbean,sea,island
spun,spin,rolled,bounced,lap,wheel
fry,cook,bacon,stir,fried,onion


In [None]:
cluster_complete_df.to_csv('/content/drive/MyDrive/TEXT MINING/Data/300_cluster_complete_embeddings.csv', index=True)

In [23]:
cluster_sim_words_df.to_csv('/content/drive/MyDrive/TEXT MINING/Data/300_cluster_top_5_similarities.csv', index=True)