In [1]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
dfTrue = pd.read_csv("./Data/True.csv")
dfFake = pd.read_csv("./Data/Fake.csv")

In [3]:
dfTrue.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
dfTrue['label']=1
dfFake['label']=0

In [5]:
df = pd.concat([dfTrue, dfFake])

In [6]:
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [7]:
set(df['label'])

{0, 1}

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.columns

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')

In [10]:
df.drop(['title','subject','date'], axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,text,label
0,WASHINGTON (Reuters) - The head of a conservat...,1
1,WASHINGTON (Reuters) - Transgender people will...,1
2,WASHINGTON (Reuters) - The special counsel inv...,1
3,WASHINGTON (Reuters) - Trump campaign adviser ...,1
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,1


# Text Preprocessing

In [12]:
import re
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

def removeTag(text):
    cleaned = re.compile(r'<.*?>')
    return re.sub(cleaned," ",text)

def removeSpec(text):
    rem = ' '
    for i in text:
        if i.isalnum():
            rem+=i
        else:
            rem+=" "
    return rem

def removeStopWords(text):
    stopWords = set(stopwords.words('english'))
    words = word_tokenize(text)
    return [w for w in words if w not in stopWords]


def stemming(text):
    ss = SnowballStemmer('english')
    return " ".join([ss.stem(w) for w in text])

In [13]:
df['text'] = df['text'].apply(removeTag)
df['text'] = df['text'].apply(removeSpec)
df['text'] = df['text'].apply(removeStopWords)
df['text'] = df['text'].apply(stemming)

In [14]:
df['text'] = df['text'].str.lower()

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [26]:
tfidf = TfidfVectorizer(max_features=20000)
X = np.array(df.iloc[:,0].values)
y = np.array(df.iloc[:,1].values)
X = tfidf.fit_transform(X).toarray()
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=100)

In [27]:
from sklearn.naive_bayes import GaussianNB
GNB = GaussianNB()

In [28]:
GNB.fit(x_train,y_train)

GaussianNB()

In [29]:
predictions = GNB.predict(x_test)

In [30]:
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))


              precision    recall  f1-score   support

           0       0.91      0.88      0.89      7747
           1       0.87      0.90      0.89      7001

    accuracy                           0.89     14748
   macro avg       0.89      0.89      0.89     14748
weighted avg       0.89      0.89      0.89     14748



In [44]:
testNews = "today is was an unusal morning where we observed that the sun rises from west "

In [45]:
testNews = removeTag(testNews)
testNews=removeStopWords(testNews)
testNews = removeSpec(testNews)
testNews = stemming(testNews)


In [46]:
import pickle
bow,words = [],word_tokenize(testNews)
for word in words:
    bow.append(words.count(word))
word_dict = tfidf.vocabulary_
pickle.dump(word_dict,open('bow.pkl','wb'))

In [47]:
inp = []
for i in word_dict:
    inp.append(testNews.count(i[0]))
y_pred = GNB.predict(np.array(inp).reshape(1,20000))

In [48]:
print(y_pred)

[0]
