## Importing pandas and numpy

In [1]:
import numpy as np # linear algebra
import pandas as pd

## load data using pandas

In [2]:
# Read both CSV files and tag every article with its label:
# 1 for rows coming from True.csv, 0 for rows coming from Fake.csv.
true = pd.read_csv("True.csv").assign(**{"class": 1})
fake = pd.read_csv("Fake.csv").assign(**{"class": 0})

# Stack the two frames on top of each other under a fresh 0..n-1 index.
news = pd.concat([true, fake], axis=0, ignore_index=True)


## CLEANING DATASET BY DELETING THE USELESS FEATURES AND NAN VALUES 

In [3]:

# Keep only the columns used for modelling. `drop(..., errors="ignore")`
# rebinds `news` instead of deleting columns in place, so re-running this
# cell does not raise KeyError once the columns are already gone.
news = news.drop(columns=["date", "subject"], errors="ignore")

# Fraction of missing values per column: NaN count divided by the total
# number of rows (not by the number of non-null rows).
nan_value_portion = news.isna().mean()
print(nan_value_portion)

# Drop every row that has a missing value and confirm that none remain.
news_clean = news.dropna()
print(news_clean.isna().sum())

title    0.0
text     0.0
class    0.0
dtype: float64
title    0
text     0
class    0
dtype: int64


# Shuffle the data

In [4]:
# Shuffle all rows (frac=1 samples the whole frame) with a fixed seed so the
# order is reproducible. The original index labels are kept, as the printed
# output shows.
shaffled_news = news_clean.sample(frac=1,random_state=42)
# Print the label column to check that the two classes are now interleaved.
print(shaffled_news["class"])

22216    0
27917    0
25007    0
1377     1
32476    0
        ..
11284    1
44732    0
38158    0
860      1
15795    1
Name: class, Length: 44898, dtype: int64


## Remove text noise: punctuation marks, digits and other non-letter characters

In [5]:
import string
import re 

def clean(text):
    """Normalise a raw article string for tokenisation.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is replaced by two spaces, and each run of whitespace is
    then collapsed to a single space. Leading/trailing whitespace is
    collapsed to one space as well, not stripped.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The normalised string.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '  ', text.lower())
    return re.sub(r'\s+', ' ', letters_only)

# Normalise every article body with `clean`; the labels are taken unchanged.
X = shaffled_news["text"].apply(clean)
Y = shaffled_news["class"]
# `X[0]` is a label-based lookup on the preserved integer index: it shows the
# row whose index label is 0, not the first row of the shuffled frame.
print(X[0])

washington reuters the head of a conservative republican faction in the u s congress who voted this month for a huge expansion of the national debt to pay for tax cuts called himself a fiscal conservative on sunday and urged budget restraint in in keeping with a sharp pivot under way among republicans u s representative mark meadows speaking on cbs face the nation drew a hard line on federal spending which lawmakers are bracing to do battle over in january when they return from the holidays on wednesday lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues such as immigration policy even as the november congressional election campaigns approach in which republicans will seek to keep control of congress president donald trump and his republicans want a big budget increase in military spending while democrats also want proportional increases for non defense discretionary spending on programs that support education scientific research infrastr

# REMOVE STOP WORDS

In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stop words from NLTK, minus the negations "no" and "not", which are
# deliberately kept in the text.
all_stopwords = set(stopwords.words('english'))
exceptions = {"no", "not"}
custom_stopwords = all_stopwords.difference(exceptions)
# NOTE(review): the stemmer is instantiated here but is not applied by any
# code visible in this notebook.
stemmer = PorterStemmer()


def remove_stopwords(text):
    """Tokenise ``text`` and drop the stop words.

    Args:
        text: A string to split with ``nltk.word_tokenize``.

    Returns:
        A list of the tokens, in their original order, that are not members
        of ``custom_stopwords``.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in custom_stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# Replace each cleaned string in X with its list of non-stop-word tokens.
# NOTE(review): this rebinds X from strings to token lists, so running this
# cell a second time without re-running the cleaning cell would hand lists
# to the tokenizer.
X = X.apply(remove_stopwords)

# Label-based lookup: shows the tokens of the row whose index label is 3.
print(X[3])

['washington', 'reuters', 'trump', 'campaign', 'adviser', 'george', 'papadopoulos', 'told', 'australian', 'diplomat', 'may', 'russia', 'political', 'dirt', 'democratic', 'presidential', 'candidate', 'hillary', 'clinton', 'new', 'york', 'times', 'reported', 'saturday', 'conversation', 'papadopoulos', 'diplomat', 'alexander', 'downer', 'london', 'driving', 'factor', 'behind', 'fbi', 'decision', 'open', 'counter', 'intelligence', 'investigation', 'moscow', 'contacts', 'trump', 'campaign', 'times', 'reported', 'two', 'months', 'meeting', 'australian', 'officials', 'passed', 'information', 'came', 'papadopoulos', 'american', 'counterparts', 'leaked', 'democratic', 'emails', 'began', 'appearing', 'online', 'according', 'newspaper', 'cited', 'four', 'current', 'former', 'u', 'foreign', 'officials', 'besides', 'information', 'australians', 'probe', 'federal', 'bureau', 'investigation', 'also', 'propelled', 'intelligence', 'friendly', 'governments', 'including', 'british', 'dutch', 'times', 'sa

## CONVERT TOKENS TO STRING

In [7]:
# Re-assemble every token list into one space-separated string, which is the
# input format used for vectorizing below.
X_t = list(map(' '.join, X))
# X_t is a plain list, so this is positional: the 11th document.
print(X_t[10])

listening several republican candidates president one could wonder cryogenic sleep past seven years still talking badly george w bush screwed things however no keep repeating lie nation much worse brink recession president obama took republican voters honestly insulted treated like idiots absent reality although maybe idiots absent reality republican candidates opportunistic repeating lie seven disastrous years obama marco rubio took media branch gop fox news give quick campaign message offer well wishes new year hi marco rubio senator florida candidate president united states leave behind look forward elections coming november year going chance turn america around seven disastrous years current president chance reclaim american dream running president resolution everything not restore american dream expand reach people change lives ever greatest country world greater ever america rubio want go back exactly one people losing homes one jobs lost rate per month one people could denied he

## SPLIT DATASET

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for evaluation. The seed is fixed (same value
# as the shuffle above) so that the split, the fitted vocabulary and the
# reported accuracy are reproducible across "Restart & Run All".
x_train, x_test, Y_train, Y_test = train_test_split(
    X_t, Y, test_size=0.2, random_state=42
)


## IMPORT LIBRARIES FOR TEXT VECTORIZATION AND METRICS

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score


## VECTORIZE TRAIN INPUT AND TEST INPUT 

In [10]:
# TF-IDF vectorizer with default settings.
vectorizer1 = TfidfVectorizer()

# Fit on the training documents only, then reuse the fitted vectorizer to
# transform the held-out documents.
x_tr_vec= vectorizer1.fit_transform(x_train)
x_val_vec = vectorizer1.transform(x_test)
# Look up the entry stored for the token "trump" in the fitted vocabulary.
print(vectorizer1.vocabulary_["trump"])

94953


## UNDERSTANDING HOW VECTORIZING WORKS BY PRINTING THE OUTPUT SHAPES

In [11]:
# Shapes of the vectorized training and held-out sets. Both were produced by
# the same fitted vectorizer, so the second dimension is identical.
print(x_tr_vec.shape)
print(x_val_vec.shape)
# A single new string is transformed into one row with that same width.
print(vectorizer1.transform(["trump white house immigration "]).shape)

(35918, 105867)
(8980, 105867)
(1, 105867)


In [12]:
from sklearn.naive_bayes import BernoulliNB

# Train a Bernoulli naive Bayes classifier on the TF-IDF training matrix.
# NOTE(review): BernoulliNB presumably thresholds its inputs (default
# `binarize`), so the TF-IDF weights would act only as presence/absence
# indicators - confirm this is intended.
classifier = BernoulliNB().fit(x_tr_vec, Y_train)

# Evaluate on the held-out split.
y_pred = classifier.predict(x_val_vec)
accuracy = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.965478841870824


## Make predictions on new data

In [15]:
new_text = 'he relinquished the unenviable job of White House press secretary, Sean Spicer was tasked with convincing reporters that Trump’s inauguration had drawn the largest audience to ever witness an inauguration, period, both in person and around the globe After a comparison of Barack Obama’s 2009 inauguration crowd size and Trump’s went viral, an analysis of both crowds showed that Trump’s was about one-third the size of Obama’s Following his resignation, Spicer walked back the first assertion he’d made on the job, saying in an interview that he had screwed up a number of times press secretary'

# Apply exactly the same preprocessing as the training documents received:
# `clean` (lower-case, strip non-letters), stop-word removal, then re-join
# into a single string. Lower-casing alone would leave punctuation, digits
# and stop words that the training pipeline never showed the vectorizer.
new_text = ' '.join(remove_stopwords(clean(new_text)))

# Reuse the fitted vectorizer; it expects an iterable of documents.
new_text_vectorized = vectorizer1.transform([new_text])

# Predicted label: 1 = from True.csv, 0 = from Fake.csv.
print(classifier.predict(new_text_vectorized))



[0]


## SAVE THE MODEL AND THE VECTORIZER

In [14]:
from joblib import dump
# Persist both fitted objects. The vectorizer is saved alongside the
# classifier because new text is passed through `vectorizer1.transform`
# before `classifier.predict` is called.
dump(classifier, 'model_fake.joblib')
dump(vectorizer1,'vectorizer.joblib')

['vectorizer.joblib']