In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

data1 = "I'm designing a document and don't want to get bogged down in what the text actually says"
data2 = "I'm creating a template for various paragraph styles and need to see what they will look like."
data3 = "I'm trying to learn more about some features of Microsoft Word and don't want to practice on a real document"

df1 = pd.DataFrame({'First_Para':[data1], 'Second_Para':[data2], 'Third_Para':[data3]})
vectorizer = TfidfVectorizer()
doc_vec = vectorizer.fit_transform(df1.iloc[0])

df2 = pd.DataFrame(doc_vec.toarray().transpose(),index=vectorizer.get_feature_names_out())

df2.columns = df1.columns
print(df2)

           First_Para  Second_Para  Third_Para
about        0.000000     0.000000    0.254170
actually     0.288540     0.000000    0.000000
and          0.170416     0.162095    0.150117
bogged       0.288540     0.000000    0.000000
creating     0.000000     0.274451    0.000000
designing    0.288540     0.000000    0.000000
document     0.219442     0.000000    0.193303
don          0.219442     0.000000    0.193303
down         0.288540     0.000000    0.000000
features     0.000000     0.000000    0.254170
for          0.000000     0.274451    0.000000
get          0.288540     0.000000    0.000000
in           0.288540     0.000000    0.000000
learn        0.000000     0.000000    0.254170
like         0.000000     0.274451    0.000000
look         0.000000     0.274451    0.000000
microsoft    0.000000     0.000000    0.254170
more         0.000000     0.000000    0.254170
need         0.000000     0.274451    0.000000
of           0.000000     0.000000    0.254170
on           

## BM25

In [3]:
from rank_bm25 import BM25Okapi

corpus=[
    "I will take the ring, though I do not know the way.",
    "I will help you bear this burden, Frodo Baggins, as long as it is yours to bear",
    "If by my life or death I can protect you, I will."
]

tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

In [4]:
query="I will take"
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
print(doc_scores)

[0.79842666 0.18266174 0.15070007]


## LDA

In [6]:
# Importing modules
import pandas as pd
# Read Corona Tweets
tweets = pd.read_csv(r"Corona_NLP_train.csv", encoding='latin-1')
# Print head
tweets.head()

Unnamed: 0,ï»¿UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [7]:
# Remove the columns
tweets = tweets.drop(columns=['ï»¿UserName', 'ScreenName', 'Location','TweetAt','Sentiment'], axis=1).sample(100)
# Print out the first rows of papers
tweets.head()

Unnamed: 0,OriginalTweet
11410,@realDonaldTrump #coronavirus REALLY? You tel ...
38892,Dm for info regarding deliveries. We are extre...
20962,Whenever you touch any object or surface outsi...
24403,The pain of the pandemic has the Seattle-born ...
31609,They are doctors nurses disability care worker...


In [9]:
# Load the regular expression library
import re
# Remove punctuation
tweets['OriginalTweet_processed'] = \
tweets['OriginalTweet'].map(lambda x: re.sub('[@#,\\.!?]', '', x))
# Convert the tweets to lowercase
tweets['OriginalTweet_processed'] = \
tweets['OriginalTweet_processed'].map(lambda x: x.lower())
# Print out the first rows of tweets
tweets['OriginalTweet_processed'].head()

11410    realdonaldtrump coronavirus really you tel gov...
38892    dm for info regarding deliveries we are extrem...
20962    whenever you touch any object or surface outsi...
24403    the pain of the pandemic has the seattle-born ...
31609    they are doctors nurses disability care worker...
Name: OriginalTweet_processed, dtype: object

In [10]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https', 'tco'])
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
data = tweets.OriginalTweet_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['realdonaldtrump', 'coronavirus', 'really', 'tel', 'gova', 'source', 'thenoutbid', 'prices', 'always', 'component', 'also', 'maybe', 'thata', 'lost', 'feds', 'ok', 'thata', 'probably', 'whya', 'trump', 'said', 'people', 'dying', 'piece', 'shit']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pineapple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]
# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 2), (21, 1), (22, 1), (23, 1)]


In [12]:
from pprint import pprint
# number of topics
num_topics = 7
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.011*"coronavirus" + 0.009*"covid" + 0.009*"prices" + 0.008*"pandemic" + '
  '0.007*"trip" + 0.007*"store" + 0.006*"supermarket" + 0.006*"consumer" + '
  '0.006*"lower" + 0.006*"via"'),
 (1,
  '0.025*"coronavirus" + 0.024*"covid" + 0.013*"grocery" + 0.012*"supermarket" '
  '+ 0.010*"food" + 0.010*"stores" + 0.010*"people" + 0.008*"keep" + '
  '0.008*"workers" + 0.008*"know"'),
 (2,
  '0.025*"coronavirus" + 0.019*"covid" + 0.015*"store" + 0.014*"grocery" + '
  '0.012*"amp" + 0.010*"go" + 0.009*"people" + 0.009*"uk" + '
  '0.006*"supermarket" + 0.006*"get"'),
 (3,
  '0.025*"covid" + 0.021*"coronavirus" + 0.011*"shopping" + 0.009*"prices" + '
  '0.008*"grocery" + 0.006*"consumer" + 0.006*"store" + 0.006*"food" + '
  '0.006*"supermarket" + 0.006*"due"'),
 (4,
  '0.011*"coronavirus" + 0.009*"covid" + 0.007*"stores" + 0.007*"food" + '
  '0.007*"sanitizer" + 0.007*"shortage" + 0.006*"dona" + 0.006*"prices" + '
  '0.006*"old" + 0.005*"people"'),
 (5,
  '0.012*"covid" + 0.010*"financia

In [14]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./results/ldavis_prepared_'+str(num_topics))
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

LDAvis_prepared

  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


## LSA

In [16]:
import os.path
from gensim import corpora
from gensim.models import LsiModel
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from gensim.models.coherencemodel import CoherenceModel
import matplotlib.pyplot as plt

Matplotlib is building the font cache; this may take a moment.


In [17]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https', 'tco'])
def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]
def stem_words(texts):
    return [[p_stemmer.stem(word) for word in simple_preprocess(str(doc))
             ] for doc in texts]

data = tweets.OriginalTweet_processed.values.tolist()
data_words = list(sent_to_words(data))
# remove stop words
data_words = remove_stopwords(data_words)
print(data_words[:1][0][:30])

['realdonaldtrump', 'coronavirus', 'really', 'tel', 'gova', 'source', 'thenoutbid', 'prices', 'always', 'component', 'also', 'maybe', 'thata', 'lost', 'feds', 'ok', 'thata', 'probably', 'whya', 'trump', 'said', 'people', 'dying', 'piece', 'shit']


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pineapple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
doc_term_matrix = [id2word.doc2bow(twt) for twt in data_words]

In [19]:
lsa_model = LsiModel(doc_term_matrix, num_topics=num_topics, id2word = id2word)
print(lsa_model.print_topics(num_topics=num_topics, num_words=10))

[(0, '0.543*"coronavirus" + 0.501*"covid" + 0.242*"grocery" + 0.172*"store" + 0.168*"amp" + 0.148*"supermarket" + 0.123*"prices" + 0.119*"people" + 0.113*"food" + 0.113*"shopping"'), (1, '-0.578*"amp" + 0.362*"covid" + -0.290*"grocery" + -0.186*"store" + 0.160*"supermarket" + -0.147*"delivery" + 0.127*"prices" + -0.109*"pickup" + 0.107*"food" + -0.103*"free"'), (2, '0.474*"covid" + -0.397*"coronavirus" + 0.347*"amp" + 0.168*"delivery" + -0.158*"grocery" + 0.141*"online" + -0.118*"people" + -0.109*"go" + -0.090*"stores" + -0.088*"food"'), (3, '-0.385*"grocery" + 0.320*"amp" + -0.273*"store" + 0.218*"supermarket" + -0.182*"workers" + 0.164*"people" + -0.154*"heroes" + 0.154*"go" + -0.150*"shopping" + 0.134*"coronavirus"'), (4, '0.289*"supermarket" + -0.271*"coronavirus" + 0.217*"trip" + 0.217*"exposure" + -0.212*"go" + 0.210*"care" + 0.204*"workers" + 0.199*"food" + 0.173*"people" + -0.130*"data"'), (5, '-0.342*"trip" + -0.342*"exposure" + -0.247*"go" + 0.238*"shut" + 0.220*"prices" + 0.

  sparsetools.csc_matvecs(


## NMF

In [20]:
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
 
tweets = pd.read_csv(r"Corona_NLP_train.csv", encoding='latin-1')
# Print head
tweets.head()

Unnamed: 0,ï»¿UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,3799,48751,London,16-03-2020,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,3800,48752,UK,16-03-2020,advice Talk to your neighbours family to excha...,Positive
2,3801,48753,Vagabonds,16-03-2020,Coronavirus Australia: Woolworths to give elde...,Positive
3,3802,48754,,16-03-2020,My food stock is not the only one which is emp...,Positive
4,3803,48755,,16-03-2020,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [21]:
# Remove the columns
tweets = tweets.drop(columns=['ï»¿UserName', 'ScreenName', 'Location','TweetAt','Sentiment'], axis=1).sample(100)
# Print out the first rows of papers
tweets.head()

Unnamed: 0,OriginalTweet
16244,How was it we said PANDEMY and you understood ...
7325,To all those who continue to panic buy food at...
12362,GO CONTACTLESS When you can If the Supermarket...
30036,AgFunder: A quick look at the impact of Covid-...
2443,"Amid the #COVID19 outbreak, MinnesotaÂs Fortu..."


In [22]:
# use tfidf by removing tokens that don't appear in at least 50 documents
vect = TfidfVectorizer(min_df=10, stop_words=stop_words )
 
# Fit and transform
X = vect.fit_transform(tweets.OriginalTweet)

In [23]:
# Create an NMF instance: model
# the 10 components will be the topics
model = NMF(n_components=10, random_state=5)
 
# Fit the model to TF-IDF
model.fit(X)
 
# Transform the TF-IDF: nmf_features
nmf_features = model.transform(X)

In [24]:
components_df = pd.DataFrame(model.components_, columns=vect.get_feature_names_out())
components_df

Unnamed: 0,19,co,consumer,coronavirus,covid,covid19,covid_19,food,get,grocery,prices,store,supermarket,work
0,0.0,10.177027,0.002321,0.0,0.093572,0.0,0.525682,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2.507881,0.010706,0.0,0.0,2.427836,0.0,0.0,0.042228,0.0,0.0,0.00197,0.098949,0.0,0.112721
2,0.04987,0.0,0.0,0.0,0.0,0.0,0.020648,0.0,0.023174,1.835004,0.0,1.973962,0.0,0.0
3,0.081841,0.0,0.0,0.000563,0.0,0.0,0.127647,0.0,0.112686,0.0,0.0,0.0,2.961793,0.0
4,0.0,0.0,0.0,0.002429,0.003636,0.0,0.20696,0.0,0.120772,0.0,2.809617,0.0,0.0,0.0
5,0.0,0.007911,0.0,3.295499,0.0,0.0,0.0,0.021892,0.0,0.110849,0.0,0.0,0.0,0.0
6,0.113752,0.001957,3.346553,0.000785,0.0,0.001444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.001481,0.0,0.853044,1.845793,0.040871,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.006016,0.0,0.0,0.0,0.916713,0.0,0.0,0.013412,0.0,1.15834
9,0.0,0.018383,0.000181,0.0,0.0,1.54347,0.0,0.038964,0.055644,0.064211,0.000796,0.0,0.0,0.0


In [25]:
for topic in range(components_df.shape[0]):
    tmp = components_df.iloc[topic]
    print(f'For topic {topic+1} the words with the highest value are:')
    print(tmp.nlargest(10))
    print('\n')

For topic 1 the words with the highest value are:
co             10.177027
covid_19        0.525682
covid           0.093572
consumer        0.002321
19              0.000000
coronavirus     0.000000
covid19         0.000000
food            0.000000
get             0.000000
grocery         0.000000
Name: 0, dtype: float64


For topic 2 the words with the highest value are:
19             2.507881
covid          2.427836
work           0.112721
store          0.098949
food           0.042228
co             0.010706
prices         0.001970
consumer       0.000000
coronavirus    0.000000
covid19        0.000000
Name: 1, dtype: float64


For topic 3 the words with the highest value are:
store          1.973962
grocery        1.835004
19             0.049870
get            0.023174
covid_19       0.020648
co             0.000000
consumer       0.000000
coronavirus    0.000000
covid          0.000000
covid19        0.000000
Name: 2, dtype: float64


For topic 4 the words with the highest val

## PCA

In [26]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['https','tco'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pineapple/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
vect= TfidfVectorizer(stop_words=stop_words)
x = vect.fit_transform(tweets.OriginalTweet)
tf_idf_vect = pd.DataFrame(x.toarray().transpose(),index=vect.get_feature_names_out())


In [28]:
tf_idf_vect

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
100,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
11,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
13,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.261114,0.0,0.0,0.0,0.0
16nh5wuzox,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.224157,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
youâ,0.0,0.246301,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
zero,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
ziieacwfog,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
zolnefkhxz,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0


In [29]:
from sklearn.decomposition import PCA
pca = PCA(n_components=50)
pca.fit_transform(tf_idf_vect)

array([[-8.74238154e-03,  8.17141958e-03, -5.20440595e-03, ...,
        -1.16529795e-02,  3.36458466e-03,  3.43007099e-03],
       [-2.65201210e-02, -1.18381132e-02,  1.98217256e-02, ...,
        -6.99519554e-03, -1.33950788e-02, -1.65854994e-02],
       [-1.19882772e-02,  1.04809707e-02,  6.06281207e-03, ...,
         1.93271228e-02, -2.73434053e-02, -4.22264800e-02],
       ...,
       [ 2.71474338e-03,  2.12703094e-02, -6.44129440e-02, ...,
        -7.25817601e-02, -7.63194029e-02, -5.50823439e-02],
       [-3.31332653e-03,  1.46155612e-02, -4.92467412e-02, ...,
        -7.10570396e-03, -5.27280246e-03, -5.96378975e-03],
       [ 4.44965480e-02, -4.30826356e-02,  1.03095153e-02, ...,
        -1.66989088e-02,  3.82950078e-02, -8.87909650e-05]])

In [30]:
print(pca.components_)

[[-0.03703872  0.01700879  0.00674683 ... -0.01100549 -0.06096137
   0.04743106]
 [ 0.00066016 -0.09439248 -0.06130642 ... -0.01654728 -0.02264427
   0.03357699]
 [-0.0308215   0.09210414  0.06214952 ... -0.04229609  0.01084377
  -0.1572611 ]
 ...
 [ 0.13817685  0.06807946 -0.0016724  ... -0.01163301 -0.0072622
   0.02903595]
 [ 0.04503803  0.03013976  0.00981831 ...  0.06604964  0.10178815
  -0.02477647]
 [ 0.09573981  0.09297686 -0.22141623 ... -0.01881744  0.07742084
  -0.00486171]]


## SVD

In [31]:
import numpy as np

In [32]:
vect= TfidfVectorizer(stop_words=stop_words, smooth_idf=True)
x = vect.fit_transform(tweets.OriginalTweet).todense()
x = np.asarray(x)

In [33]:
from sklearn.decomposition import TruncatedSVD
svd_modeling = TruncatedSVD(n_components=4, algorithm='randomized', n_iter=100, random_state=122)
svd_modeling.fit(x)
components=svd_modeling.components_
vocab = vect.get_feature_names_out()

In [34]:
topic_word_list = []
def get_topics(components):
    for i, comp in enumerate(components):
        terms_comp =zip(vocab,comp)
        sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:7]
        topic=" "
        for t in sorted_terms:
            topic= topic + ' ' + t[0]
        topic_word_list.append(topic)
        print(topic_word_list)
    return topic_word_list
get_topics(components)
        

['  co coronavirus store 19 grocery covid food']
['  co coronavirus store 19 grocery covid food', '  store grocery food time panic supermarkets go']
['  co coronavirus store 19 grocery covid food', '  store grocery food time panic supermarkets go', '  19 covid store grocery consumer behavior shifts']
['  co coronavirus store 19 grocery covid food', '  store grocery food time panic supermarkets go', '  19 covid store grocery consumer behavior shifts', '  co coronavirus grocery stores store prices bueqsjzc5l']


['  co coronavirus store 19 grocery covid food',
 '  store grocery food time panic supermarkets go',
 '  19 covid store grocery consumer behavior shifts',
 '  co coronavirus grocery stores store prices bueqsjzc5l']