In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse.linalg import norm
import re
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
import spacy

In [22]:
#read in the data
twitter_df = pd.read_csv('/content/Twitter_Data.csv')
twitter_df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [23]:
#check for NaN
twitter_df.isna().sum()

Unnamed: 0,0
clean_text,4
category,7


In [25]:
#fill NaN with empty string and 0
twitter_df['clean_text'] = twitter_df['clean_text'].fillna('')
twitter_df['category'] = twitter_df['category'].fillna(0)

#verify there are no more NaN
print(twitter_df.isna().sum())
twitter_df.head()

clean_text    0
category      0
dtype: int64


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


### Find the cosine similarity in clean_text for two tweets of the 100th and 10,000th tweets using dot and norm function.

In [31]:
#TfidVectorizer
vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

#grab 100th and 10,000th record
tweet1 = twitter_df['clean_text'][99]
tweet2 = twitter_df['clean_text'][9999]

#perform tfid vectorizer on both
vectorizer.fit([tweet1, tweet2])
tf_idf_tweet1 = vectorizer.transform([tweet1])
tf_idf_tweet2 = vectorizer.transform([tweet2])

#compute cosine similarity based
cos_sim = np.dot(tf_idf_tweet1, tf_idf_tweet2.T)/(norm(tf_idf_tweet1)*norm(tf_idf_tweet2))
print(cos_sim)

  (0, 0)	0.058343251997229464


### Find the cosine similarity in clean_text for two tweets of  the 100th and 10,000th tweets using the cosine function.

In [35]:
#TfidVectorizer
vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

#grab 100th and 10,000th record
tweet1 = twitter_df['clean_text'][99]
tweet2 = twitter_df['clean_text'][9999]

#perform tfid vectorizer on both
vectorizer.fit([tweet1, tweet2])
tf_idf_tweet1 = vectorizer.transform([tweet1])
tf_idf_tweet2 = vectorizer.transform([tweet2])

#Compute cosine
cos_sim = 1 - spatial.distance.cosine(tf_idf_tweet1.toarray().ravel(), tf_idf_tweet2.toarray().ravel())
print(cos_sim)

0.05834325199722945


### Find the cosine similarity in clean_text for two tweets of  the 100th and 10,000th tweets using cosine_similarity function.

In [36]:
#TfidVectorizer
vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

#grab 100th and 10,000th record
tweet1 = twitter_df['clean_text'][99]
tweet2 = twitter_df['clean_text'][9999]

#perform tfid vectorizer on both
vectorizer.fit([tweet1, tweet2])
tf_idf_tweet1 = vectorizer.transform([tweet1])
tf_idf_tweet2 = vectorizer.transform([tweet2])

#calculate cosine similarity
cos_sim = cosine_similarity(tf_idf_tweet1, tf_idf_tweet2, dense_output=True)
print(cos_sim)

[[0.05834325]]


Find the cosine similarity in clean_text for two tweets of  the 100th and 10,000th tweets using the Spacy function.

In [40]:
!python -m spacy download en_core_web_lg # Download the required SpaCy model

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [41]:
#TfidVectorizer
vectorizer = TfidfVectorizer(use_idf=True,smooth_idf=True, sublinear_tf=False)

#grab 100th and 10,000th record
tweet1 = twitter_df['clean_text'][99]
tweet2 = twitter_df['clean_text'][9999]

#Use Spacy function
nlp = spacy.load("en_core_web_lg")
cos_sim = nlp(tweet1).similarity(nlp(tweet2))
print(cos_sim)

0.8322563581908424


Find the tweets with the cosine similarity > 60% with the 100th tweets using Spacy in this dataset.

In [44]:
vector = nlp(twitter_df['clean_text'][99]).vector

similar_words = []

#find similar words whose vector is close to our vector
for word in nlp.vocab:
  if word.has_vector and re.search('[a-z]{2,}', word.text):
    similarity = cosine_similarity(vector.reshape(1, -1), word.vector.reshape(1, -1))
    similar_words.append((word, similarity))
for idx, pairs in enumerate(similar_words):
  word, similarity = pairs
  if similarity > 0.6:
    print(idx, word, similarity)

38 <spacy.lexeme.Lexeme object at 0x7e858a3f8d40> [[0.6040245]]
48 <spacy.lexeme.Lexeme object at 0x7e85588b6cc0> [[0.62626684]]
105 <spacy.lexeme.Lexeme object at 0x7e85588b6c40> [[0.61101514]]
107 <spacy.lexeme.Lexeme object at 0x7e85588b4a00> [[0.60191256]]
121 <spacy.lexeme.Lexeme object at 0x7e858726e0c0> [[0.6082063]]
139 <spacy.lexeme.Lexeme object at 0x7e858726ca00> [[0.6272931]]
148 <spacy.lexeme.Lexeme object at 0x7e858726eb80> [[0.67127496]]
155 <spacy.lexeme.Lexeme object at 0x7e858726d040> [[0.66235405]]
165 <spacy.lexeme.Lexeme object at 0x7e858726d800> [[0.619812]]
174 <spacy.lexeme.Lexeme object at 0x7e858726f900> [[0.6235994]]
214 <spacy.lexeme.Lexeme object at 0x7e858726da40> [[0.6103208]]
240 <spacy.lexeme.Lexeme object at 0x7e858726ea40> [[0.6571302]]
251 <spacy.lexeme.Lexeme object at 0x7e85589a9d40> [[0.6268065]]
294 <spacy.lexeme.Lexeme object at 0x7e85589abd00> [[0.6974189]]
304 <spacy.lexeme.Lexeme object at 0x7e85589aab80> [[0.6135072]]
306 <spacy.lexeme.Lexem

### Compute the corpus vector that is equal to the average of all the document vectors, where each document corresponds to a tweet or a row in this dataset.

In [54]:
import tqdm
doc_vectors = []
for i in tqdm.tqdm(range(len(twitter_df['clean_text']))):
  text = twitter_df['clean_text'][i]
  doc = nlp(text)
  doc_vectors.append(doc.vector)
avg_word_vector = np.mean(doc_vectors, axis=0)
print(avg_word_vector)

100%|██████████| 162980/162980 [21:54<00:00, 123.94it/s]


[-7.5881380e-01  9.0608537e-01 -1.8319457e+00 -1.9740669e-01
  2.2365820e+00  6.1380923e-01  8.2553822e-01  2.9824421e+00
 -1.2589004e+00 -1.8940932e-01  4.2881613e+00  1.3459404e+00
 -3.3273089e+00  1.1568143e+00  7.7275485e-01  7.7758324e-01
  1.2772434e+00 -8.2139164e-01 -7.2237653e-01 -8.3717847e-01
  3.0457485e-01 -3.6387262e-01 -1.0821891e+00 -1.1372043e+00
 -3.8103822e-01 -7.8664941e-01 -1.4165604e+00 -7.5406247e-01
 -3.3157209e-01  9.4050115e-01  1.5680196e+00 -3.7004036e-01
 -5.6055778e-01 -1.7478343e+00 -2.9632697e-01 -7.0787184e-02
 -4.0696451e-01  1.2013572e+00  1.5808424e+00  9.7265393e-01
 -2.3418619e-01  3.8704985e-01  5.2086413e-01  9.3009390e-02
 -1.2111317e+00  1.0262944e+00  1.0746666e+00 -2.3217840e+00
 -6.2234807e-01  1.5607746e+00 -2.4529074e-01  1.4316698e+00
  3.0550948e-01 -3.4312570e+00 -1.2313048e+00  3.3174008e-01
  1.5690988e-01  1.0550045e+00  9.5876926e-01  3.2244831e-01
  1.6847293e+00 -8.8617891e-02 -6.3647342e-01 -1.6926929e+00
  1.5578955e+00  1.22995