# Word Embeddings using spaCy
# End-to-end project of Word Embeddings


# NLP: Text Classification using spaCy Word Embeddings

# End-to-end project: fake and real news data

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
# (this import is only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Make the "Colab Notebooks" folder on the mounted Drive the current working
# directory, so relative paths used afterwards resolve inside it.
root_dir = "/content/drive/My Drive/Colab Notebooks/"
os.chdir(root_dir)

In [None]:
import pandas as pd
# Read the dataset into a DataFrame. The path is absolute, so it does not
# depend on the current working directory.
df = pd.read_csv("/content/fake_and_real_news.csv")

# Alternative (disabled): load the same file name from the Google Drive folder.
# filepath = "/content/drive/My Drive/Colab Notebooks/"
# filename = "fake_and_real_news.csv"
# import os
# os.path.join(filepath,filename)
# df = pd.read_csv(os.path.join(filepath,filename))  #local

In [None]:
# Print the first 10 rows of the DataFrame as plain text for a quick look at the columns.
print(df.head(10))

                                                Text label
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake
1  U.S. conservative leader optimistic of common ...  Real
2  Trump proposes U.S. tax overhaul, stirs concer...  Real
3   Court Forces Ohio To Allow Millions Of Illega...  Fake
4  Democrats say Trump agrees to work on immigrat...  Real
5  France says pressure needed to stop North Kore...  Real
6  Trump on Twitter (August 8): Opioid crisis, No...  Real
7   BUSTED: Trump Supporter Used Poll Watcher Cre...  Fake
8  Fatal Niger operation sparks calls for public ...  Real
9  Trump says he has 'great heart' for immigrant ...  Real


In [None]:
# NOTE(review): this repeats the Drive mount done in the first cell of the
# notebook; the recorded output below reports "Drive already mounted", so this
# cell looks redundant — confirm before removing.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Print the (rows, columns) size of the DataFrame.
print(df.shape)

(9900, 2)


In [None]:
# Check for class imbalance: count how many rows carry each value of 'label'.
df['label'].value_counts()

# NOTE(review): disabled variant for a column named "class"; looks like a
# leftover from an earlier version — confirm before removing.
# df["class"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Fake,5000
Real,4900


In [None]:
# Encode the string labels as integers for the classifier: Fake -> 0, Real -> 1.
df['label_num'] = df['label'].map(
    {
     'Fake': 0,
     'Real': 1
    }
)

# Series.map() silently produces NaN for any value that is not a key of the
# mapping (e.g. a misspelled or differently-cased label). Fail here, with the
# offending values, rather than later inside model fitting.
assert df['label_num'].notna().all(), (
    "Unmapped label values: "
    f"{sorted(df.loc[df['label_num'].isna(), 'label'].astype(str).unique())}"
)

In [None]:
# Print the first 10 rows again to check the newly added 'label_num' column.
print(df.head(10))

                                                Text label  label_num
0   Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake          0
1  U.S. conservative leader optimistic of common ...  Real          1
2  Trump proposes U.S. tax overhaul, stirs concer...  Real          1
3   Court Forces Ohio To Allow Millions Of Illega...  Fake          0
4  Democrats say Trump agrees to work on immigrat...  Real          1
5  France says pressure needed to stop North Kore...  Real          1
6  Trump on Twitter (August 8): Opioid crisis, No...  Real          1
7   BUSTED: Trump Supporter Used Poll Watcher Cre...  Fake          0
8  Fatal Niger operation sparks calls for public ...  Real          1
9  Trump says he has 'great heart' for immigrant ...  Real          1


In [None]:
import spacy
# Shell command (IPython `!` syntax): download and install the en_core_web_lg package.
# NOTE(review): this runs on every execution of the cell, and the recorded log
# below advises restarting the runtime after installation.
!python -m spacy download en_core_web_lg
# Load the installed pipeline; `nlp` is the object used to turn text into vectors.
nlp = spacy.load("en_core_web_lg")

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Convert text to vector
#
# For a pipeline that ships static word vectors, Doc.vector is the average of
# the token vectors, so it depends only on how the text is tokenized.
# nlp.make_doc() runs just the tokenizer, whereas nlp(text) would also run every
# remaining pipeline component on each article without affecting the vector.
# The resulting 'vector' column is the same; only the wasted work is removed.
df['vector'] = df['Text'].apply(lambda text: nlp.make_doc(text).vector)

In [None]:
# Number of rows in the DataFrame (unchanged by adding the 'vector' column).
len(df)

In [None]:
# Print the first 5 rows to inspect the new 'vector' column.
print(df.head())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Features are the per-document
# vectors, targets are the integer labels; the fixed random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['vector'].values, df['label_num'], test_size=0.2, random_state=42
)

In [None]:
# Shape of the training features: a 1-D array holding one vector object per row.
X_train.shape

(7920,)

In [None]:
# Shape of the held-out test features: a 1-D array holding one vector object per row.
X_test.shape

(1980,)

In [None]:
import numpy as np

# X_train / X_test are 1-D arrays whose elements are themselves vectors.
# Stacking them along a new leading axis gives one 2-D (n_samples, n_features)
# matrix per split.
X_train_stack = np.stack(X_train, axis=0)
X_test_stack = np.stack(X_test, axis=0)

In [None]:
# Confirm both stacked matrices are 2-D: one shape per line, train first.
print(X_train_stack.shape, X_test_stack.shape, sep="\n")

(7920, 300)
(1980, 300)


In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB

# Rescale every feature using the min/max learned from the training split only,
# then apply that same transformation to the test split.
# NOTE(review): presumably needed because MultinomialNB expects non-negative
# features while embedding components can be negative — confirm.
scaler = MinMaxScaler()
X_train_stack_sc = scaler.fit_transform(X_train_stack)
X_test_stack_sc = scaler.transform(X_test_stack)

# Fit the classifier on the scaled training features; the fit call stays last
# so the cell displays the fitted estimator.
model = MultinomialNB()
model.fit(X_train_stack_sc, y_train)

In [None]:
# Predict labels for the scaled test features; the bare name on the last line
# displays the resulting array.
y_pred = model.predict(X_test_stack_sc)
y_pred

array([0, 1, 1, ..., 0, 1, 1])

In [None]:
# Display the true test labels (a Series indexed by the original row numbers)
# for comparison with the predictions.
y_test

8432    0
5680    1
4767    1
9218    1
621     0
       ..
9500    1
5858    1
7442    0
2846    1
1468    1
Name: label_num, Length: 1980, dtype: int64

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

0.9474747474747475

In [None]:
from sklearn.metrics import classification_report
# Per-class precision, recall, F1 and support on the test split
# (class 0 = Fake, class 1 = Real, per the label mapping used for 'label_num').
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.94      0.95       973
           1       0.94      0.96      0.95      1007

    accuracy                           0.95      1980
   macro avg       0.95      0.95      0.95      1980
weighted avg       0.95      0.95      0.95      1980



In [None]:
from sklearn.metrics import confusion_matrix
# Counts of true vs. predicted labels on the test split. Rows are the true
# classes: each row of the displayed matrix sums to that class's support in
# the classification report.
confusion_matrix(y_test, y_pred)

array([[913,  60],
       [ 44, 963]])