In [1]:
import gensim.downloader as api
# Gensim is widely used in industry and academia for text analysis and modeling applications.


In [2]:
# Load Google's pre-trained Word2Vec embeddings ("word2vec-google-news-300").
# The model was trained on a large Google News corpus and represents each word as a
# 300-dimensional vector (the "300" in the name is the vector size).
# These vectors capture the semantic meaning of words and can be used for NLP tasks such as
# text classification, similarity analysis, and clustering.
# Starting from pre-trained vectors saves a lot of time and computational resources
# compared to training a word embedding model from scratch.
# NOTE: this is a huge model (about 1.6 GB), trained on roughly 100 billion words.
wv = api.load("word2vec-google-news-300")



In [3]:
# Sanity check: a word compared with itself (recorded output: 1.0).
wv.similarity(w1="great", w2="great")

1.0

In [4]:
# Similarity score between two related words.
wv.similarity(w1="great", w2="good")

0.72915095

In [5]:
wv.most_similar("good")  # words most similar to "good", with their scores

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348937988281),
 ('nice', 0.6836092472076416),
 ('excellent', 0.644292950630188),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728254318237),
 ('solid', 0.5806034803390503),
 ('lousy', 0.5764201879501343)]

In [6]:
# Same good/great pair with the arguments swapped; the recorded score is unchanged (0.72915095).
wv.similarity(w1="good", w2="great")

0.72915095

In [7]:
wv.most_similar("delectable")  # words most similar to "delectable", with their scores

[('delicious', 0.8363204002380371),
 ('scrumptious', 0.8109372854232788),
 ('tasty', 0.7385421991348267),
 ('yummy', 0.7123230695724487),
 ('delightful', 0.6968866586685181),
 ('sumptuous', 0.6954765319824219),
 ('luscious', 0.6944749355316162),
 ('delectable_desserts', 0.686908483505249),
 ('palate_pleasing', 0.6720318794250488),
 ('lip_smacking', 0.6578160524368286)]

In [8]:
# Words most similar to "dog"; multi-word entries appear joined by underscores (e.g. 'pit_bull').
wv.most_similar("dog")

[('dogs', 0.8680489659309387),
 ('puppy', 0.8106428384780884),
 ('pit_bull', 0.780396044254303),
 ('pooch', 0.7627377510070801),
 ('cat', 0.7609456777572632),
 ('golden_retriever', 0.7500902414321899),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437614798545837),
 ('beagle', 0.7418621778488159),
 ('pup', 0.740691065788269)]

In [9]:
# Analogy query: king - man + woman, top 5 results (recorded top hit: 'queen').
wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=5)

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133)]

In [10]:
# Analogy query: france - paris + berlin, top 5 results (recorded top hit: 'germany').
wv.most_similar(positive=['france', 'berlin'], negative=['paris'], topn=5)

[('germany', 0.5094344019889832),
 ('european', 0.4865044951438904),
 ('german', 0.4714890718460083),
 ('austria', 0.46964019536972046),
 ('swedish', 0.46451830863952637)]

In [11]:
# Odd-one-out: which word does not belong with the others (recorded answer: 'cat').
wv.doesnt_match(["facebook", "cat", "google", "microsoft"])

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cat'

In [12]:
# Odd-one-out among mostly animal words (recorded answer: 'google').
wv.doesnt_match(["dog", "cat", "google", "mouse"])

'google'

In [13]:
# Odd-one-out among produce words (recorded answer: 'banana').
wv.doesnt_match(["tomato", "banana", "peach", "apple","pear"])

'banana'

In [14]:
# Odd-one-out among ghost-related words (recorded answer: 'hoodlum').
wv.doesnt_match(["spectre", "apparition", "hoodlum", "phantom"])



'hoodlum'

In [15]:
# Odd-one-out among action verbs (recorded answer: 'listen').
wv.doesnt_match(["listen","swim","walk","climb"])

'listen'

Gensim: GloVe

In [16]:
# Load the pre-trained "glove-twitter-25" vectors through the same gensim downloader API
# that was used for word2vec-google-news-300; `glv` is queried with the same methods below.
glv = api.load("glove-twitter-25")



In [17]:
# Words most similar to "good" according to the GloVe Twitter vectors.
glv.most_similar("good")

[('too', 0.9648016095161438),
 ('day', 0.9533665180206299),
 ('well', 0.9503172039985657),
 ('nice', 0.9438973069190979),
 ('better', 0.9425961375236511),
 ('fun', 0.9418926239013672),
 ('much', 0.9413353204727173),
 ('this', 0.9387556314468384),
 ('hope', 0.9383507370948792),
 ('great', 0.9378515481948853)]

In [18]:
# Odd-one-out among meal words; .split() turns the sentence into a word list (recorded answer: 'cereal').
glv.doesnt_match("breakfast cereal dinner lunch".split())

'cereal'

In [19]:
# Same company/animal odd-one-out as tried with word2vec earlier (recorded answer: 'cat').
glv.doesnt_match("facebook cat google microsoft".split())

'cat'

In [20]:
# Odd-one-out among fruit words (recorded answer: 'human').
glv.doesnt_match("banana grapes orange human".split())

'human'

Text Classification Using Gensim Word Embeddings

In [21]:
import pandas as pd


# Read the fake-news training dataset ("train_fakenews.csv") and store it in a variable df.
# NOTE(review): absolute path, presumably a mounted Google Drive in Colab - adjust when running elsewhere.
path = "/content/drive/MyDrive/train_fakenews.csv"

df = pd.read_csv(path)

# print the shape of the dataframe
print(df.shape)

# show the top 5 rows
df.head(5)

(20800, 5)


Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [22]:
# Keep only the article text and its label; title, author and id are not used below.
# Reassigning with errors='ignore' (instead of an in-place drop) makes this cell safe to
# re-run: the in-place version raised KeyError on a second execution because the
# columns had already been removed.
df = df.drop(columns=['title', 'author', 'id'], errors='ignore')
df

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1
...,...,...
20795,Rapper T. I. unloaded on black celebrities who...,0
20796,When the Green Bay Packers lost to the Washing...,0
20797,The Macy’s of today grew from the union of sev...,0
20798,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [23]:
# Check the distribution of labels (the recorded counts show the two classes are nearly balanced).
df['label'].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [24]:
# Label encoding: 0 --> true (real) news, 1 --> fake news.
# Peek at the text of the article at index 1.
df.text[1]

'Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? [Hillary Clinton remains the big woman on campus in leafy, liberal Wellesley, Massachusetts. Everywhere else votes her most likely to don her inauguration dress for the remainder of her days the way Miss Havisham forever wore that wedding dress.  Speaking of Great Expectations, Hillary Rodham overflowed with them 48 years ago when she first addressed a Wellesley graduating class. The president of the college informed those gathered in 1969 that the students needed “no debate so far as I could ascertain as to who their spokesman was to be” (kind of the like the Democratic primaries in 2016 minus the   terms unknown then even at a Seven Sisters school). “I am very glad that Miss Adams made it clear that what I am speaking for today is all of us —  the 400 of us,” Miss Rodham told her classmates. After appointing herself Edger Bergen to the Charlie McCarthys and Mor

In [25]:
import numpy as np

In [26]:
# Now we will convert the text into a vector using gensim's word2vec embeddings.
# We will do this in three steps:
#   1. Preprocess the text to remove stop words and punctuation, and get the lemma of each word.
#   2. Get the word vector for each word in the pre-processed sentence.
#   3. Take the mean of all word vectors to derive the numeric representation of the entire news article.
# First, let's explore the get_mean_vector API of gensim to see how it works.

In [27]:
# Reference similarity score for the two words whose vectors are inspected next.
wv.similarity(w1="great", w2="good")

0.72915095

In [28]:
# Look up the embedding vectors of two individual words.
wv_great = wv["great"]
wv_good = wv["good"]

In [29]:
# Both vectors have 300 components (recorded output: ((300,), (300,))).
wv_great.shape, wv_good.shape

((300,), (300,))

In [30]:

# Component-wise mean of the two word vectors (axis=0 averages across the words).
r1 = np.mean([wv_good, wv_great],axis=0)

In [31]:
# First five components of the "good" vector.
wv_good[:5]

array([ 0.04052734,  0.0625    , -0.01745605,  0.07861328,  0.03271484],
      dtype=float32)

In [32]:
# First five components of the "great" vector.
wv_great[:5]

array([ 0.07177734,  0.20800781, -0.02844238,  0.17871094,  0.1328125 ],
      dtype=float32)

In [33]:
import spacy
nlp = spacy.load("en_core_web_lg") # if this fails then run "python -m spacy download en_core_web_lg" to download that model

def preprocess_and_vectorize(text):
    """Run ``text`` through the spaCy pipeline and return the lemmas worth keeping.

    Stop words and punctuation tokens are discarded. This version returns the
    list of lemma strings; it does not compute an embedding.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct)
    ]

OSError: ignored

In [None]:
# Download the spaCy model that spacy.load("en_core_web_lg") needs (shell command run from the notebook).
!python -m spacy download en_core_web_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m587.7/587.7 MB[0m [31m124.3 MB/s[0m eta [36m0:00:01[0m

In [None]:
# Try the preprocessing helper on a short example sentence.
preprocess_and_vectorize("Don't worry if you don't understand")

In [None]:
# worry -> vector of size 300
# understand -> vector of size 300

# average of the word vectors -> one vector of size 300 = sentence embedding

In [None]:
#!python -m spacy download en_core_web_lg

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg") # if this fails then run "python -m spacy download en_core_web_lg" to download that model

def preprocess_and_vectorize(text):
    """Turn a piece of text into a single embedding vector.

    Steps:
      1. Run the spaCy pipeline ``nlp`` over ``text``.
      2. Drop stop words and punctuation; keep the lemma of every other token.
      3. Average the word vectors of those lemmas with ``wv.get_mean_vector``.

    Args:
        text: The raw text. Non-string values (for example ``None`` or the NaN
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        A 1-D array of length ``wv.vector_size``. An all-zero vector is returned
        when there is nothing to average (empty / missing text, or text made up
        only of stop words and punctuation).
    """
    filtered_tokens = []
    if isinstance(text, str):
        # remove stop words and lemmatize the text
        filtered_tokens = [
            token.lemma_
            for token in nlp(text)
            if not (token.is_stop or token.is_punct)
        ]

    if not filtered_tokens:
        # gensim's get_mean_vector raises ValueError for an empty key list, which
        # would abort a whole DataFrame.apply run because of one empty article.
        # A zero vector keeps the output shape and dtype consistent instead.
        return np.zeros(wv.vector_size, dtype=wv.vectors.dtype)

    return wv.get_mean_vector(filtered_tokens)

In [None]:
# Vectorize one example sentence and check the shape of the resulting embedding.
v = preprocess_and_vectorize("Don't worry if you don't understand")
v.shape

In [None]:
# This step takes a few minutes, so go for a walk :)
# Every article text is run through preprocess_and_vectorize and the resulting
# embedding is stored in a new 'vector' column.
# Fix: the text column is named 'text' (lowercase); df['Text'] raised KeyError.
df['vector'] = df['text'].apply(preprocess_and_vectorize)

In [None]:
from sklearn.model_selection import train_test_split


# Train/test split: 20% of the samples go to the test set, with a fixed random state
# (2022) and stratified sampling on the label.
# Fix: the label column is named 'label'; df.label_num does not exist in this DataFrame
# and raised AttributeError.
X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.label,
    test_size=0.2,  # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.label
)

In [None]:
# Before stacking, each element of X_train / X_test is itself a per-article vector
# (presumably a 1-D object array of arrays - confirm from the printed shapes).
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)

# Stack the per-article vectors row-wise so each split becomes a single 2-D matrix.
X_train_2d, X_test_2d = np.stack(X_train), np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# 1. Create the GradientBoosting model. A fixed random_state (the same seed used for the
#    train/test split) makes the fitted model and the report below reproducible across runs.
clf = GradientBoostingClassifier(random_state=2022)

# 2. Fit on the stacked training vectors and their labels.
clf.fit(X_train_2d, y_train)

# 3. Get the predictions for the stacked test vectors and store them in y_pred.
y_pred = clf.predict(X_test_2d)

# 4. Print the classification report.
print(classification_report(y_test, y_pred))

In [None]:
# Three hand-picked articles used to spot-check the trained classifier.
# The first sample originally contained mojibake (UTF-8 punctuation decoded with the wrong
# codec, e.g. "Flintâ€™s"); the curly quotes/apostrophes are restored so the text is
# tokenized the same way as the properly encoded training articles.
test_news = [
    "Michigan governor denies misleading U.S. House on Flint water (Reuters) - Michigan Governor Rick Snyder denied Thursday that he had misled a U.S. House of Representatives committee last year over testimony on Flint’s water crisis after lawmakers asked if his testimony had been contradicted by a witness in a court hearing. The House Oversight and Government Reform Committee wrote Snyder earlier Thursday asking him about published reports that one of his aides, Harvey Hollins, testified in a court hearing last week in Michigan that he had notified Snyder of an outbreak of Legionnaires’ disease linked to the Flint water crisis in December 2015, rather than 2016 as Snyder had testified. “My testimony was truthful and I stand by it,” Snyder told the committee in a letter, adding that his office has provided tens of thousands of pages of records to the committee and would continue to cooperate fully.  Last week, prosecutors in Michigan said Dr. Eden Wells, the state’s chief medical executive who already faced lesser charges, would become the sixth current or former official to face involuntary manslaughter charges in connection with the crisis. The charges stem from more than 80 cases of Legionnaires’ disease and at least 12 deaths that were believed to be linked to the water in Flint after the city switched its source from Lake Huron to the Flint River in April 2014. Wells was among six current and former Michigan and Flint officials charged in June. The other five, including Michigan Health and Human Services Director Nick Lyon, were charged at the time with involuntary manslaughter",
    " WATCH: Fox News Host Loses Her Sh*t, Says Investigating Russia For Hacking Our Election Is Unpatriotic This woman is insane.In an incredibly disrespectful rant against President Obama and anyone else who supports investigating Russian interference in our election, Fox News host Jeanine Pirro said that anybody who is against Donald Trump is anti-American. Look, it s time to take sides,  she began.",
    " Sarah Palin Celebrates After White Man Who Pulled Gun On Black Protesters Goes Unpunished (VIDEO) Sarah Palin, one of the nigh-innumerable  deplorables  in Donald Trump s  basket,  almost outdid herself in terms of horribleness on Friday."
]

# Embed every sample article, then classify the whole batch with one predict call.
test_news_vectors = list(map(preprocess_and_vectorize, test_news))
clf.predict(test_news_vectors)

In [None]:
# Finally, plot the confusion matrix for the GradientBoostingClassifier predictions.
from matplotlib import pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels (see the axis labels below).
cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap with integer cell counts.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel('Prediction')
ax.set_ylabel('Truth')