In [2]:
! python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import spacy

nlp = spacy.load('en_core_web_lg')

In [4]:
doc = nlp('dog cat banana pine')

for token in doc:
  print(token.text,'Vector:',token.has_vector,'OOV:',token.is_oov)

dog Vector: True OOV: False
cat Vector: True OOV: False
banana Vector: True OOV: False
pine Vector: True OOV: False


In [5]:
doc[0].vector.shape

(300,)

In [6]:
base_token = nlp('bread')
base_token[0].vector.shape

(300,)

In [7]:
doc = nlp('bread sandwich burger car tiger human wheat')

for token in doc:
  print(f"{token.text} -- {base_token.text}:", token.similarity(base_token))

bread -- bread: 1.0
sandwich -- bread: 0.6874560117721558
burger -- bread: 0.544037401676178
car -- bread: 0.16441147029399872
tiger -- bread: 0.14492356777191162
human -- bread: 0.21103660762310028
wheat -- bread: 0.6572456359863281


In [8]:
def print_similarity(base_word,words_to_compare):
  base_token = nlp(base_word)
  doc = nlp(words_to_compare)
  for token in doc:
    print(f"{token.text} -- {base_token.text}:", token.similarity(base_token))



In [9]:
print_similarity('Iphone','apple samsung iphone dog kitten')

apple -- Iphone: 0.6339781284332275
samsung -- Iphone: 0.6678677797317505
iphone -- Iphone: 1.0
dog -- Iphone: 0.1743103712797165
kitten -- Iphone: 0.1468581259250641


# News classification using Spacy word Vectors

In [10]:
import kagglehub
import os

# Download latest version
path = kagglehub.dataset_download("clmentbisaillon/fake-and-real-news-dataset")

print("Path to dataset files:", os.listdir(path))

Downloading from https://www.kaggle.com/api/v1/datasets/download/clmentbisaillon/fake-and-real-news-dataset?dataset_version_number=1...


100%|██████████| 41.0M/41.0M [00:00<00:00, 157MB/s]

Extracting files...





Path to dataset files: ['True.csv', 'Fake.csv']


In [11]:
import pandas as pd

df_true = pd.read_csv(f'{path}/True.csv')
df_true['label'] = 'Real'
df_false = pd.read_csv(f'{path}/Fake.csv')
df_false['label'] = 'Fake'

In [24]:
df = pd.concat([df_true.head(750),df_false.head(750)],axis = 0)

In [25]:
#df.reset_index()
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",Real
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",Real
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",Real
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",Real
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",Real


In [26]:
df.drop(['title','subject','date'],axis = 1,inplace = True)

In [27]:
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,Real
1,WASHINGTON (Reuters) - Transgender people will...,Real
2,WASHINGTON (Reuters) - The special counsel inv...,Real
3,WASHINGTON (Reuters) - Trump campaign adviser ...,Real
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,Real


In [28]:
df.shape

(1500, 2)

In [29]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Real,750
Fake,750


In [39]:
df['label_num'] = df['label'].map({'Fake':0, 'Real':1})
df.head()

Unnamed: 0,text,label,label_num,vector,label_num.1
0,WASHINGTON (Reuters) - The head of a conservat...,Real,1,"[-0.07704518, 0.13792795, -0.029863376, -0.043...",1
1,WASHINGTON (Reuters) - Transgender people will...,Real,1,"[-0.0056013362, 0.10255298, -0.04037715, -0.01...",1
2,WASHINGTON (Reuters) - The special counsel inv...,Real,1,"[-0.07198532, 0.13710167, -0.023083739, 0.0073...",1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,Real,1,"[-0.05529037, 0.13536039, -0.024868956, -0.006...",1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,Real,1,"[-0.09234671, 0.20955735, -0.02266693, -0.0679...",1


In [40]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [34]:
df['vector'] = df['text'].apply(lambda x: nlp(x).vector)

In [41]:
df.head()

Unnamed: 0,text,label,label_num,vector,label_num.1
0,WASHINGTON (Reuters) - The head of a conservat...,Real,1,"[-0.07704518, 0.13792795, -0.029863376, -0.043...",1
1,WASHINGTON (Reuters) - Transgender people will...,Real,1,"[-0.0056013362, 0.10255298, -0.04037715, -0.01...",1
2,WASHINGTON (Reuters) - The special counsel inv...,Real,1,"[-0.07198532, 0.13710167, -0.023083739, 0.0073...",1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,Real,1,"[-0.05529037, 0.13536039, -0.024868956, -0.006...",1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,Real,1,"[-0.09234671, 0.20955735, -0.02266693, -0.0679...",1


In [44]:
#Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(df['vector'].values,df['label_num'],
                                                 test_size=0.2,
                                                 random_state=18,stratify=df['label_num'])

In [46]:
X_train.shape

(1200,)

In [48]:
X_train[:1]

array([array([-2.65638307e-02,  1.58049151e-01, -9.27746519e-02, -7.39117563e-02,
               6.40576109e-02,  1.14384969e-03,  1.77273471e-02, -9.04249474e-02,
              -4.02566157e-02,  2.16493297e+00, -1.41631156e-01, -1.62559636e-02,
               5.68763204e-02, -5.51491007e-02, -1.49540678e-01, -6.54765442e-02,
              -6.51527792e-02,  9.05229688e-01, -1.42073229e-01, -2.45162603e-02,
               1.56008545e-02, -3.29624191e-02, -2.71693221e-03, -5.22734560e-02,
              -4.23799874e-03,  4.91250716e-02, -1.11799143e-01, -5.35087772e-02,
               2.44905222e-02, -3.91559377e-02, -4.09951210e-02,  9.09271017e-02,
              -4.37626466e-02,  4.23011854e-02,  8.01936314e-02, -7.67684504e-02,
               1.82028301e-02,  6.09563887e-02, -5.43026663e-02, -4.95606586e-02,
              -3.44639048e-02,  6.66460618e-02,  1.81798153e-02, -5.20692915e-02,
               3.98224965e-02,  5.18737547e-02, -1.51990145e-01, -3.70120108e-02,
              -3

In [49]:
import numpy as np

X_train_2d = np.stack(X_train)
X_test_2d  = np.stack(X_test)

In [51]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf= MultinomialNB()
clf.fit(scaled_train_embed,y_train)

In [55]:
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)


print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.96      0.96       150
           1       0.96      0.97      0.96       150

    accuracy                           0.96       300
   macro avg       0.96      0.96      0.96       300
weighted avg       0.96      0.96      0.96       300



In [56]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5,metric='euclidean')

clf.fit(X_train_2d,y_train)

y_pred = clf.predict(X_test_2d)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99       150
           1       0.98      0.99      0.99       150

    accuracy                           0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300



# Excercise

## Loading The Dataset

In [98]:
import pandas as pd

df = pd.read_json('News_Category_Dataset.json',lines=True)
df.head(2)

Unnamed: 0,short_description,headline,date,link,authors,category
0,She left her husband. He killed their children...,There Were 2 Mass Shootings In Texas Last Week...,2018-05-26,https://www.huffingtonpost.com/entry/texas-ama...,Melissa Jeltsen,CRIME
1,Of course it has a song.,Will Smith Joins Diplo And Nicky Jam For The 2...,2018-05-26,https://www.huffingtonpost.com/entry/will-smit...,Andy McDonald,ENTERTAINMENT


In [99]:
df = df.sample(5000)
df.reset_index(inplace=True)

In [100]:
df.shape

(5000, 7)

In [101]:
#df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
POLITICS,1279
ENTERTAINMENT,543
HEALTHY LIVING,278
QUEER VOICES,212
BUSINESS,180
BLACK VOICES,156
SPORTS,154
PARENTS,154
COMEDY,153
WOMEN,141


In [104]:
new_df = df[df['category'].isin(['CRIME', 'SPORTS', 'BUSINESS'])]
new_df.head()

Unnamed: 0,index,short_description,headline,date,link,authors,category
1,14981,The league recently reminded teams about the r...,New York Knicks Lock Arms During National Anth...,2017-10-04,https://www.huffingtonpost.com/entry/new-york-...,Carla Herreria,SPORTS
17,51563,The billionaire wants to marry Tesla and Solar...,The One Company Elon Musk Wants To Keep Indepe...,2016-08-04,https://www.huffingtonpost.com/entry/tesla-spa...,Alexander C. Kaufman,BUSINESS
18,113334,,CUT,2014-08-30,https://www.huffingtonpost.com/entry/michael-s...,Michael Klopman,SPORTS
40,115441,,Why Walmart Had to Fire U.S. Head Bill Simon,2014-08-06,https://www.huffingtonpost.com/entry/why-walma...,"George Bradt, ContributorChairman, PrimeGenesis",BUSINESS
67,41229,Dozens of arrests for civil disobedience mark ...,Workers Across U.S. 'Fight For $15' In Strikes...,2016-11-29,https://www.huffingtonpost.com/entry/minimum-w...,Dave Jamieson,BUSINESS


In [105]:
new_df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
BUSINESS,180
SPORTS,154
CRIME,109


In [106]:
#Add the new column "label_num" which gives a unique number to each of these labels

new_df['label_num'] = new_df['category'].map({'BUSINESS':1,
                                                'SPORTS':2,
                                                'CRIME':3})

new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['label_num'] = new_df['category'].map({'BUSINESS':1,


Unnamed: 0,index,short_description,headline,date,link,authors,category,label_num
1,14981,The league recently reminded teams about the r...,New York Knicks Lock Arms During National Anth...,2017-10-04,https://www.huffingtonpost.com/entry/new-york-...,Carla Herreria,SPORTS,2
17,51563,The billionaire wants to marry Tesla and Solar...,The One Company Elon Musk Wants To Keep Indepe...,2016-08-04,https://www.huffingtonpost.com/entry/tesla-spa...,Alexander C. Kaufman,BUSINESS,1
18,113334,,CUT,2014-08-30,https://www.huffingtonpost.com/entry/michael-s...,Michael Klopman,SPORTS,2
40,115441,,Why Walmart Had to Fire U.S. Head Bill Simon,2014-08-06,https://www.huffingtonpost.com/entry/why-walma...,"George Bradt, ContributorChairman, PrimeGenesis",BUSINESS,1
67,41229,Dozens of arrests for civil disobedience mark ...,Workers Across U.S. 'Fight For $15' In Strikes...,2016-11-29,https://www.huffingtonpost.com/entry/minimum-w...,Dave Jamieson,BUSINESS,1


In [107]:
#import spacy and load the language model downloaded
import spacy

nlp = spacy.load('en_core_web_lg')

In [108]:
def preprocess(text):
  doc = nlp(text)
  filtered_tokens = []
  for token in doc:
    if token.is_stop or token.is_punct:
      continue
    filtered_tokens.append(token.lemma_)
  return " ".join(filtered_tokens)

In [109]:
new_df['preprocesed_text'] = new_df['short_description'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['preprocesed_text'] = new_df['short_description'].apply(preprocess)


In [110]:
new_df['preprocesed_text'][1]

'league recently remind team rule stand anthem'

In [116]:
new_df['vector']= new_df['preprocesed_text'].apply(lambda text: nlp(text).vector)
new_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['vector']= new_df['preprocesed_text'].apply(lambda text: nlp(text).vector)


Unnamed: 0,index,short_description,headline,date,link,authors,category,label_num,preprocesed_text,vector
1,14981,The league recently reminded teams about the r...,New York Knicks Lock Arms During National Anth...,2017-10-04,https://www.huffingtonpost.com/entry/new-york-...,Carla Herreria,SPORTS,2,league recently remind team rule stand anthem,"[0.053254288, 0.19611713, -0.011089002, 0.0192..."
17,51563,The billionaire wants to marry Tesla and Solar...,The One Company Elon Musk Wants To Keep Indepe...,2016-08-04,https://www.huffingtonpost.com/entry/tesla-spa...,Alexander C. Kaufman,BUSINESS,1,billionaire want marry Tesla SolarCity say Spa...,"[0.08825722, -0.0023966762, -0.043472674, 0.05..."
18,113334,,CUT,2014-08-30,https://www.huffingtonpost.com/entry/michael-s...,Michael Klopman,SPORTS,2,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
40,115441,,Why Walmart Had to Fire U.S. Head Bill Simon,2014-08-06,https://www.huffingtonpost.com/entry/why-walma...,"George Bradt, ContributorChairman, PrimeGenesis",BUSINESS,1,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
67,41229,Dozens of arrests for civil disobedience mark ...,Workers Across U.S. 'Fight For $15' In Strikes...,2016-11-29,https://www.huffingtonpost.com/entry/minimum-w...,Dave Jamieson,BUSINESS,1,dozen arrest civil disobedience mark campaign ...,"[-0.09719099, -0.029954279, 0.17079061, 0.1135..."


In [118]:
#Train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(new_df['vector'].values,new_df['label_num'],
                                                 test_size=0.2,
                                                 random_state=18,stratify=new_df['label_num'])

In [119]:
X_train.shape

(354,)

In [120]:
X_test

array([array([ 5.76823205e-02,  3.39339972e-02, -1.82047755e-01, -2.41447482e-02,
              -6.89867437e-02, -1.06922500e-02, -6.39749989e-02, -3.63617484e-03,
               1.81307480e-01,  2.52827501e+00, -2.67278492e-01, -6.35862499e-02,
              -1.20624993e-03,  1.01600870e-01, -1.23653993e-01,  1.16112500e-01,
              -1.21963248e-01,  1.11226249e+00, -4.12252486e-01,  1.86042562e-02,
              -5.31126782e-02, -1.42839998e-02,  2.18931496e-01, -4.46224958e-03,
               2.49049813e-03,  1.22832499e-01,  9.43115056e-02, -1.43048003e-01,
               1.68550253e-01,  3.29524986e-02, -2.62800008e-01, -2.13492006e-01,
              -2.07778245e-01, -1.34180486e-01,  2.18727499e-01, -4.54995744e-02,
               1.29101306e-01,  5.88000193e-03, -9.76424962e-02,  6.57174885e-02,
              -1.29234999e-01,  1.60202503e-01, -6.28250018e-02, -4.27612998e-02,
               1.09218501e-01, -2.10677497e-02, -2.61847019e-01, -1.40749991e-01,
              -2

# The X_train and X_test array storing array inside so reshaping it .. so as to fit for models

In [121]:
import numpy as np
X_train_2d =  np.stack(X_train)
X_test_2d = np.stack(X_test)

# Attempt 1 MULTINOMIALNB

In [122]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler


scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)


clf= MultinomialNB()
clf.fit(scaled_train_embed,y_train)

## predicton and classification report
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)


print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.56      0.92      0.69        36
           2       0.85      0.55      0.67        31
           3       0.80      0.36      0.50        22

    accuracy                           0.65        89
   macro avg       0.74      0.61      0.62        89
weighted avg       0.72      0.65      0.64        89



# attempt Decision Tree

In [126]:
from sklearn.tree import DecisionTreeClassifier


clf = DecisionTreeClassifier()

clf.fit(X_train_2d,y_train)

## predicton and classification report
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)


print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00        36
           2       0.35      1.00      0.52        31
           3       0.00      0.00      0.00        22

    accuracy                           0.35        89
   macro avg       0.12      0.33      0.17        89
weighted avg       0.12      0.35      0.18        89



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KNN

In [127]:
from sklearn.neighbors import KNeighborsClassifier


clf = KNeighborsClassifier()

clf.fit(X_train_2d,y_train)

## predicton and classification report
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)


print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           1       0.33      0.06      0.10        36
           2       0.33      0.87      0.48        31
           3       0.00      0.00      0.00        22

    accuracy                           0.33        89
   macro avg       0.22      0.31      0.19        89
weighted avg       0.25      0.33      0.20        89



##### Randome forest classifier

In [128]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler



clf = RandomForestClassifier()

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf.fit(scaled_train_embed,y_train)

## predicton and classification report
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)


print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           1       0.86      0.69      0.77        36
           2       0.76      0.71      0.73        31
           3       0.52      0.73      0.60        22

    accuracy                           0.71        89
   macro avg       0.71      0.71      0.70        89
weighted avg       0.74      0.71      0.72        89



In [129]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.preprocessing import MinMaxScaler



clf = GradientBoostingClassifier()

scaler = MinMaxScaler()
scaled_train_embed = scaler.fit_transform(X_train_2d)
scaled_test_embed = scaler.transform(X_test_2d)

clf.fit(scaled_train_embed,y_train)

## predicton and classification report
from sklearn.metrics import classification_report
y_pred = clf.predict(scaled_test_embed)


print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           1       0.78      0.69      0.74        36
           2       0.83      0.65      0.73        31
           3       0.48      0.73      0.58        22

    accuracy                           0.69        89
   macro avg       0.70      0.69      0.68        89
weighted avg       0.73      0.69      0.69        89

