In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
true = pd.read_csv('true.csv')
fake = pd.read_csv('fake.csv')

In [3]:
true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
print(true.isnull().sum().sum())
print(fake.isnull().sum().sum())

0
0


In [6]:
print(true.describe)
print(fake.describe)

<bound method NDFrame.describe of                                                    title  \
0      As U.S. budget fight looms, Republicans flip t...   
1      U.S. military to accept transgender recruits o...   
2      Senior U.S. Republican senator: 'Let Mr. Muell...   
3      FBI Russia probe helped by Australian diplomat...   
4      Trump wants Postal Service to charge 'much mor...   
...                                                  ...   
21412  'Fully committed' NATO backs new U.S. approach...   
21413  LexisNexis withdrew two products from Chinese ...   
21414  Minsk cultural hub becomes haven from authorities   
21415  Vatican upbeat on possibility of Pope Francis ...   
21416  Indonesia to buy $1.14 billion worth of Russia...   

                                                    text       subject  \
0      WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1      WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2      WASHINGTON (Reut

In [7]:
true.drop(columns = 'date', inplace = True)
fake.drop(columns = 'date', inplace = True)

In [8]:
true['Class'] = 0
fake['Class'] = 1

In [9]:
true.head()

Unnamed: 0,title,text,subject,Class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,0


In [10]:
df = pd.concat([true,fake],ignore_index = True)

In [11]:
df.head()

Unnamed: 0,title,text,subject,Class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,0


In [12]:
label_encoder = LabelEncoder()
df['subject'] = label_encoder.fit_transform(df['subject'])

In [13]:
df.head()

Unnamed: 0,title,text,subject,Class
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,6,0
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,6,0
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,6,0
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,6,0
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,6,0


In [14]:
df['content'] = df['title']+' '+df['text']

In [15]:
df = df.drop(columns = 'title')
df =df.drop(columns = 'text')

In [16]:
df.head()

Unnamed: 0,subject,Class,content
0,6,0,"As U.S. budget fight looms, Republicans flip t..."
1,6,0,U.S. military to accept transgender recruits o...
2,6,0,Senior U.S. Republican senator: 'Let Mr. Muell...
3,6,0,FBI Russia probe helped by Australian diplomat...
4,6,0,Trump wants Postal Service to charge 'much mor...


In [19]:
port_stem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [22]:
df['content'] = df['content'].apply(stemming)

In [23]:
X = df['content'].values
y = df['Class'].values

In [24]:
print(X)
print('=='*20)
print(y)

['u budget fight loom republican flip fiscal script washington reuter head conserv republican faction u congress vote month huge expans nation debt pay tax cut call fiscal conserv sunday urg budget restraint keep sharp pivot way among republican u repres mark meadow speak cb face nation drew hard line feder spend lawmak brace battl januari return holiday wednesday lawmak begin tri pass feder budget fight like link issu immigr polici even novemb congression elect campaign approach republican seek keep control congress presid donald trump republican want big budget increas militari spend democrat also want proport increas non defens discretionari spend program support educ scientif research infrastructur public health environment protect trump administr alreadi will say go increas non defens discretionari spend percent meadow chairman small influenti hous freedom caucu said program democrat say enough need give govern pay rais percent fiscal conserv see rational eventu run peopl money sa

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer # imports the TfidfVectorizer class

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [27]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,stratify=y,random_state = 42)

In [28]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier()
}
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train,y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_accuracy = accuracy_score(y_train_pred,y_train)
    model_test_accuracy = accuracy_score(y_test_pred,y_test)

    print(list(models.keys())[i])
    print('Model Accuracy on Train Data:',model_train_accuracy)
    print('Model Accuracy on Test Data:',model_test_accuracy)
    print('=='*30)
    print('\n')

Logistic Regression
Model Accuracy on Train Data: 0.9919260537891865
Model Accuracy on Test Data: 0.9869710467706013


Random Forest
Model Accuracy on Train Data: 1.0
Model Accuracy on Test Data: 0.9925389755011136


Decision Tree
Model Accuracy on Train Data: 1.0
Model Accuracy on Test Data: 0.9955456570155902




In [29]:
X_new = X_test[0]
prediction = model.predict(X_new)
print(prediction)

if prediction[0]==0:
    print('The news is real')
else:
    print('The news is real')

[1]
The news is real


In [34]:
y_test[0]

1

In [35]:
import pickle
pickle.dump(model,open('fake_detection.pkl','wb'))