In [142]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [143]:
df=pd.read_csv(r'E:/ML/Datasets/news.csv')

In [144]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [145]:
df.isnull().sum()

Unnamed: 0    0
title         0
text          0
label         0
dtype: int64

#### Text preprocessing

In [146]:
#lower case
df['text']=df['text'].str.lower()
df['title']=df['title'].str.lower()

In [147]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,you can smell hillary’s fear,"daniel greenfield, a shillman journalism fello...",FAKE
1,10294,watch the exact moment paul ryan committed pol...,google pinterest digg linkedin reddit stumbleu...,FAKE
2,3608,kerry to go to paris in gesture of sympathy,u.s. secretary of state john f. kerry said mon...,REAL
3,10142,bernie supporters on twitter erupt in anger ag...,"— kaydee king (@kaydeeking) november 9, 2016 t...",FAKE
4,875,the battle of new york: why this primary matters,it's primary day in new york and front-runners...,REAL


In [148]:
# removing \n
df['text']=df['text'].apply(lambda x:x.replace('\n',''))

In [149]:
# remove punctuation
#slow technique
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [150]:
punc=string.punctuation

In [151]:
def remove_punc1(text):
    for char in punc:
        text=text.replace(char,'')
    return text

In [152]:
df['text'].apply(remove_punc1)

0       daniel greenfield a shillman journalism fellow...
1       google pinterest digg linkedin reddit stumbleu...
2       us secretary of state john f kerry said monday...
3       — kaydee king kaydeeking november 9 2016 the l...
4       its primary day in new york and frontrunners h...
                              ...                        
6330    the state department told the republican natio...
6331    the ‘p’ in pbs should stand for ‘plutocratic’ ...
6332     antitrump protesters are tools of the oligarc...
6333    addis ababa ethiopia —president obama convened...
6334    jeb bush is suddenly attacking trump heres why...
Name: text, Length: 6335, dtype: object

In [153]:
# fast technique

def remove_punc2(text):
    return text.translate(str.maketrans('','',punc))

In [154]:
df['text']=df['text'].apply(remove_punc2)

In [155]:
X=df['text']
y=df['label']

In [156]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=0.2,random_state=42)

#### Vectorization

In [157]:
vector=TfidfVectorizer(stop_words='english',max_df=0.7)

In [158]:
tfidf_xtrain=vector.fit_transform(xtrain)
tfidf_xtest=vector.transform(xtest)

#### RandomForestClassifierndomForestClassifier

In [159]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier()
rfc.fit(tfidf_xtrain,ytrain)

RandomForestClassifier()

In [160]:
pred=rfc.predict(tfidf_xtest)

In [161]:
accuracy_score(ytest,pred)

0.909234411996843

#### NaiveBays

In [162]:
from sklearn.naive_bayes import MultinomialNB
mnb=MultinomialNB()
mnb.fit(tfidf_xtrain,ytrain)

MultinomialNB()

In [163]:
ypre=mnb.predict(tfidf_xtest)

In [164]:
accuracy_score(ytest,ypre)

0.8602999210734017

#### PassiveAggressiveClassifier

In [165]:
pac=PassiveAggressiveClassifier(max_iter=50)
pac.fit(tfidf_xtrain,ytrain)
ypred=pac.predict(tfidf_xtest)

In [166]:
accuracy_score(ytest,ypred)

0.9384372533543804

In [167]:
confusion_matrix(ytest,ypred)

array([[586,  42],
       [ 36, 603]], dtype=int64)