## Importing libraries

In [3]:
pip install wordcloud

Collecting wordcloud
  Obtaining dependency information for wordcloud from https://files.pythonhosted.org/packages/f5/b0/247159f61c5d5d6647171bef84430b7efad4db504f0229674024f3a4f7f2/wordcloud-1.9.3-cp311-cp311-win_amd64.whl.metadata
  Downloading wordcloud-1.9.3-cp311-cp311-win_amd64.whl.metadata (3.5 kB)
Downloading wordcloud-1.9.3-cp311-cp311-win_amd64.whl (300 kB)
   ---------------------------------------- 0.0/300.2 kB ? eta -:--:--
   - -------------------------------------- 10.2/300.2 kB ? eta -:--:--
   ----- --------------------------------- 41.0/300.2 kB 495.5 kB/s eta 0:00:01
   ----------------------- ---------------- 174.1/300.2 kB 1.5 MB/s eta 0:00:01
   --------------------------- ------------ 204.8/300.2 kB 1.4 MB/s eta 0:00:01
   ---------------------------------------- 300.2/300.2 kB 1.6 MB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.3
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import re
import emoji
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from wordcloud import WordCloud
import pickle

In [5]:
# Load the dataset
# NOTE(review): this is an absolute path under one user's Windows profile, so the
# cell is not portable as written; consider a relative or configurable data directory.
df = pd.read_csv(r"C:\Users\arsha\Downloads\archive (1)\fakenews.csv")

In [6]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,text,label
0,Get the latest from TODAY Sign up for our news...,1
1,2d Conan On The Funeral Trump Will Be Invited...,1
2,It’s safe to say that Instagram Stories has fa...,0
3,Much like a certain Amazon goddess with a lass...,0
4,At a time when the perfect outfit is just one ...,0


In [7]:
# EDA
def eda(data, column):
    """Print a checklist of text-cleaning issues detected in ``data[column]``.

    Every check scans all rows of the column and prints one line when at
    least one row is affected: upper-case letters, HTML tags, URLs,
    hashtags, mentions, punctuation/digits ("unwanted characters") and
    emojis. Nothing is printed for a check that finds no affected row.

    Parameters
    ----------
    data : mapping of column name -> pandas.Series (e.g. a DataFrame)
        Container holding the text column.
    column : str
        Name of the column to inspect. Every value must be a ``str``;
        ``str.join`` and the regex search raise ``TypeError`` otherwise
        (for example on NaN).

    Returns
    -------
    None
        Findings are reported through ``print`` only.
    """
    texts = data[column]

    def rows_matching(pattern):
        # Number of rows in which `pattern` occurs at least once. The pattern
        # is compiled once instead of being looked up again for every row.
        compiled = re.compile(pattern)
        return texts.apply(lambda x: compiled.search(x) is not None).sum()

    # Compare against the lower-cased text rather than using str.islower():
    # islower() is also False for empty or caseless text (e.g. only CJK
    # characters), which made the case message a false positive.
    joined = ' '.join(texts)
    has_upper = joined != joined.lower()

    # Raw strings keep the regex escapes (\S, \., \*, \-) intact without
    # relying on Python passing unknown string escapes through unchanged.
    html = rows_matching(r'<.*?>')
    urls = rows_matching(r'http[s]?://.+?\S+')
    hasht = rows_matching(r'#\S+')
    mentions = rows_matching(r'@\S+')
    un_c = rows_matching(r"[]\.\*'\-#@$%^?~`!&,(0-9)]")
    emojiss = texts.apply(lambda x: bool(emoji.emoji_count(x))).sum()

    if has_upper:
        print('Your data contains lower and upper case')
    if html > 0:
        print("Your data contains HTML tags")
    if urls > 0:
        print("Your data contains URLs")
    if hasht > 0:
        print("Your data contains hashtags")
    if mentions > 0:
        print("Your data contains mentions")
    if un_c > 0:
        print("Your data contains unwanted characters")
    if emojiss > 0:
        print("Your data contains emojis")

# Run the checks on the raw "text" column; each detected issue is printed as one line.
eda(df, "text")

Your data contains lower and upper case
Your data contains HTML tags
Your data contains URLs
Your data contains hashtags
Your data contains mentions
Your data contains unwanted characters
Your data contains emojis


In [8]:
# Splitting the dataset
# fv holds the raw "text" column (model input); cv holds the "label" column (target).
fv = df["text"]
cv = df["label"]

In [9]:
# Hold out 20% of the rows for testing. stratify=cv keeps the label proportions
# the same in both splits, and random_state=1 makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(fv, cv, test_size=0.2, random_state=1, stratify=cv)


In [10]:
def basic_pp(x, emoj="F"):
    """Apply basic text cleaning to a single document.

    Steps, in order: optionally pass the text through ``emoji.demojize``,
    lower-case it, then replace HTML tags, URLs, hashtags, mentions and a
    fixed set of punctuation characters and digits with a single space each.
    Runs of spaces are not collapsed.

    Parameters
    ----------
    x : str
        Raw text of one document.
    emoj : str or bool, default "F"
        ``"T"`` (or ``True``) runs ``emoji.demojize`` on the text before the
        other steps; any other value leaves emoji characters untouched.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # Accept a real boolean as well as the original "T" flag, so a caller
    # passing True no longer silently skips the emoji step.
    if emoj == "T" or emoj is True:
        x = emoji.demojize(x)
    x = x.lower()
    # Raw strings keep the regex escapes (\S, \., \*, \-) intact without
    # relying on Python passing unknown string escapes through unchanged.
    x = re.sub(r'<.*?>', ' ', x)               # HTML tags
    x = re.sub(r'http[s]?://.+?\S+', ' ', x)   # URLs
    x = re.sub(r'#\S+', ' ', x)                # hashtags
    x = re.sub(r'@\S+', ' ', x)                # mentions
    # Punctuation, typographic quotes/dashes and the digits 0-9.
    x = re.sub(r"[]\.\*'’‘_—,:{}\-#@$%^?~`!&(0-9)]", ' ', x)
    return x

In [11]:
def lemmat(x):
    """Stem every token of ``x`` and return the stems joined by single spaces.

    Despite its name, this function stems with ``LancasterStemmer`` rather
    than lemmatising. The text is split into tokens with ``word_tokenize``.

    Parameters
    ----------
    x : str
        Text of one document.

    Returns
    -------
    str
        The stemmed tokens, separated by single spaces.
    """
    stemmer = LancasterStemmer()
    return " ".join([stemmer.stem(token) for token in word_tokenize(x)])

In [12]:
# Clean each document with basic_pp (emoj="T", so the text first goes through
# emoji.demojize) and then stem it with lemmat. Both splits get the same pipeline.
x_train_p = x_train.apply(basic_pp, args=("T",)).apply(lemmat)
x_test_p = x_test.apply(basic_pp, args=("T",)).apply(lemmat)


In [14]:
# Binary bag-of-words: binary=True records term presence (0/1) instead of counts.
# The vocabulary is fitted on the training split only and reused for the test split.
bbow = CountVectorizer(binary=True)
x_train_pf = bbow.fit_transform(x_train_p)
x_test_pf = bbow.transform(x_test_p)
# Save the fitted vectorizer as a pickle file.
# NOTE(review): absolute, user-specific output path; the target directory must already exist.
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\countvectorizer.pkl', 'wb') as f:
    pickle.dump(bbow,f)



In [15]:
# Bernoulli naive Bayes
# Fitted on whatever x_train_pf currently holds; the preceding cell binds it to
# the binary bag-of-words matrix.
bnb = BernoulliNB(alpha=1)
# fit() returns the estimator itself, so `pkl` refers to the same object as `bnb`.
pkl= bnb.fit(x_train_pf, y_train)
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\bernoulli.pkl', 'wb') as f:
    pickle.dump(pkl,f)


In [16]:
# Multinomial NB
# Count-based bag-of-words (term counts rather than 0/1 flags).
# NOTE: this rebinds x_train_pf / x_test_pf, replacing the binary features from the earlier cell.
B_O_W =  CountVectorizer()
x_train_pf = B_O_W .fit_transform(x_train_p)
x_test_pf = B_O_W .transform(x_test_p)

# Save Multinomial NB model
# NOTE(review): B_O_W itself is not pickled in this cell — confirm how the count
# features are rebuilt when this model is loaded for prediction.
mnb = MultinomialNB(alpha=1)
pkl = mnb.fit(x_train_pf, y_train)
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\multinomial.pkl', 'wb') as f:
    pickle.dump(pkl, f)


In [18]:
# Using TF-IDF
# NOTE: x_train_pf / x_test_pf are rebound again here and from now on hold TF-IDF weights.
tfidf = TfidfVectorizer()
x_train_pf = tfidf.fit_transform(x_train_p)
x_test_pf = tfidf.transform(x_test_p)
# Save the fitted TF-IDF vectorizer
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\TFIDF.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [19]:
# Train and save a Multinomial NB model using TF-IDF
# A fresh MultinomialNB is fitted on x_train_pf (the TF-IDF matrix at this point);
# `mnb` is rebound, so the earlier count-based model is only reachable via its pickle file.
mnb = MultinomialNB(alpha=1)
model = mnb.fit(x_train_pf, y_train)
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\multinomialtf.pkl', 'wb') as f:
    pickle.dump(model, f)


In [20]:
# KNN using BOW
# x_train_pf was last rebound to the TF-IDF matrix, so fitting on it here would
# train this "BOW" model on TF-IDF features and make it a duplicate of the
# TF-IDF KNN model. Build the bag-of-words matrix explicitly instead, under its
# own name so that x_train_pf keeps holding the TF-IDF features for later cells.
# NOTE(review): only the binary vectorizer (`bbow`) is pickled by this notebook;
# `B_O_W` is not — confirm which vectorizer will be paired with knnBOW.pkl.
x_train_bow = B_O_W.transform(x_train_p)
# n_neighbors=1: each prediction takes the label of the single nearest training document.
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(x_train_bow, y_train)
# Save KNN model using Bag of Words
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\knnBOW.pkl', 'wb') as f:
    pickle.dump(model, f)

In [21]:
# KNN with TF-IDF
# x_train_pf holds the TF-IDF matrix assigned in the TF-IDF vectorizer cell.
# n_neighbors=1: each prediction takes the label of the single nearest training document.
knn = KNeighborsClassifier(n_neighbors=1)
model = knn.fit(x_train_pf, y_train)
# Save KNN model
with open(r'C:\Users\arsha\OneDrive\Desktop\pickle\knnTFIDF.pkl', 'wb') as f:
    pickle.dump(model, f)