In [1]:
#importing libraries

import os
import re # to search for words from an extract

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords # to find unnecessary words
from nltk.stem.porter import PorterStemmer #to perform stemming that finds the root word of each word
from sklearn.feature_extraction.text import TfidfVectorizer#helps to deal with most frequent words and measures how often a word occurs in the dataset
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Download the NLTK stop-word corpus (the recorded output shows it was already up to date).
# NOTE: this list is only printed for inspection in this notebook; the TfidfVectorizer used
# for the model is built with stop_words='english', i.e. scikit-learn's built-in list.

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\avata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Display NLTK's English stop-word list for inspection.

print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [2]:
# Load the training data and preview the first 5 rows.
# The CSV location can be overridden with the FAKE_NEWS_TRAIN_CSV environment variable so the
# notebook is not tied to one machine; the original local path remains the default.

DATA_PATH = os.environ.get(
    "FAKE_NEWS_TRAIN_CSV",
    r"C:\Users\avata\Desktop\Fake News Project\train.csv",
)
dataframe = pd.read_csv(DATA_PATH)
dataframe.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [3]:
# Dimensions of the loaded frame: the dataset contains 20800 rows and 5 columns.
dataframe.shape

(20800, 5)

In [4]:
# General information about the dataset: column dtypes and non-null counts.
# The non-null counts show that 'title', 'author' and 'text' contain null values.

dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
# Count the null values in each column: 'title', 'author' and 'text' have missing entries.
dataframe.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
# Replace every null value with an empty string (''), so missing titles, authors and
# texts become '' instead of NaN.

dataframe = dataframe.fillna('')

In [8]:
# Drop the 'id' column, as it is not significant for further analysis.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run:
# a second execution is a no-op rather than a KeyError on the already-removed column.
dataframe = dataframe.drop(columns=['id'], errors='ignore')

In [9]:
# Number of articles per author. The blank entry at the top of the output is the empty
# string that replaced the missing authors when nulls were filled with ''.
dataframe["author"].value_counts()

                                        1957
Pam Key                                  243
admin                                    193
Jerome Hudson                            166
Charlie Spiering                         141
                                        ... 
David E. Sanger and Rick Gladstone         1
Nathaniel Popper and Michael Corkery       1
SloMoe                                     1
Neil MacFarquhar and David E. Sanger       1
WB                                         1
Name: author, Length: 4202, dtype: int64

In [10]:
# Number of distinct values in the 'author' column (every value is counted, nulls included).
dataframe["author"].nunique(dropna=False)

4202

In [14]:
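# 'author' has 4202 distinct values, but one of them is the empty string standing in for the 1957 missing authors, so there are 4201 named authors; among the named authors, Pam Key has the most articles (243)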

In [11]:
# Independent variable (x): the article text. Dependent variable (y): the label.

x, y = dataframe['text'], dataframe['label']

In [13]:
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)


In [14]:
# Turn the raw text into TF-IDF features. English stop words are removed and terms that
# occur in more than 70% of the training documents are ignored. The vectorizer is fitted
# on the training split only and then reused, unchanged, to transform the test split.
tfvect = TfidfVectorizer(
    max_df=0.7,
    stop_words='english',
)
tfid_x_train = tfvect.fit_transform(x_train)
tfid_x_test = tfvect.transform(x_test)

In [16]:
# Logistic regression model with scikit-learn's default settings.
classifier = LogisticRegression()

# Train it on the TF-IDF features of the training split.
classifier.fit(X=tfid_x_train, y=y_train)

LogisticRegression()

In [17]:
# Accuracy on the held-out test split; y_pred is reused later for the confusion matrix.
y_pred = classifier.predict(tfid_x_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test Accuracy: {round(score*100,2)}%')

Test Accuracy: 94.47%


In [18]:
# Accuracy on the training split, for comparison with the test accuracy.
# Note that `score` is overwritten here and holds the train accuracy afterwards.
y_pred1 = classifier.predict(tfid_x_train)
score = accuracy_score(y_true=y_train, y_pred=y_pred1)
print(f'Train Accuracy: {round(score*100,2)}%')

Train Accuracy: 97.4%


In [20]:
# Confusion matrix on the test split. With labels=[1, 0], both the rows (true labels)
# and the columns (predicted labels) are ordered label 1 first, then label 0.
cf = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(cf)

[[2010  104]
 [ 126 1920]]


In [21]:
def fake_news_det(news):
    """Print the label predicted for a single news text.

    The text is wrapped in a one-element list, vectorized with the fitted
    module-level ``tfvect`` and scored with the module-level ``classifier``.
    The resulting prediction array is printed; nothing is returned.
    """
    features = tfvect.transform([news])
    print(classifier.predict(features))

In [25]:
# Ad-hoc check of the in-memory classifier on a single short sentence.
fake_news_det('I love all women, except for the fat ones, the ugly ones and the feminists!')

[1]


In [24]:
# Ad-hoc check of the in-memory classifier on a single headline-style sentence.
fake_news_det('Hillary Clinton spent a whole afternoon drunk and unresponsive while her campaign staff tried to reach her, a new WikiLeaks email reveals. ')

[1]


In [26]:
fake_news_det('Just days before the anniversary of Roe v. Wade, Pope Francis has sent a message of support to the March for Life taking place in Paris, France, on Sunday. [â€œThe Church must never tire of being an advocate for life and must not neglect to proclaim that human life is to be protected unconditionally from the moment of conception until natural death,â€ the Popeâ€™s message said.  Pope Francis has been a vocal critic of the abortion industry, comparing it to King Herodâ€™s slaughter of the innocents at the time of Jesus, and accusing the abortion lobby of working with a   mentality that seeks to eliminate those who get in oneâ€™s way. In his message to   marchers, Francis expressed his solidarity with their efforts and urged them to continue to proclaim the value of human life. â€œBeyond this legitimate manifestation in defense of human life, the Holy Father encourages participants in the March for Life to work tirelessly for the building of a civilization of love and a culture of life,â€ said the message, which was sent through the Apostolic Nuncio to France, Archbishop Luigi Ventura. On Sunday, President Trump is expected to sign an executive order cutting funding to the International Planned Parenthood Foundation, in a restoration of Ronald Reaganâ€™s â€œMexico City policyâ€ that banned U. S. government funding of abortion around the world. The executive order would reportedly be timed to coincide with the anniversary of the 1973 Roe v. Wade Supreme Court decision that legalized    on all 50 states in the Union. Follow Thomas D. Williams on Twitter Follow @tdwilliamsrome')

[0]


In [27]:
import pickle

# Persist the trained classifier. The `with` blocks close the file handles as soon as the
# dump/load finishes instead of leaving the bare open() handles for the garbage collector.
# NOTE: only the classifier is saved. Predictions also need the fitted `tfvect` vectorizer,
# which exists only in this kernel; persist it as well before using model.pkl elsewhere.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(classifier, model_file)

# load the model from disk
# NOTE: pickle.load can execute arbitrary code; only unpickle files you created yourself.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [28]:
def fake_news_det1(news):
    """Print the label predicted for a single news text by the reloaded model.

    The text is wrapped in a one-element list, vectorized with the fitted
    module-level ``tfvect`` and scored with ``loaded_model`` (the classifier
    read back from ``model.pkl``). The prediction array is printed; nothing
    is returned.
    """
    features = tfvect.transform([news])
    print(loaded_model.predict(features))

In [29]:
# Ad-hoc check that the classifier reloaded from model.pkl can score a text.
fake_news_det1('This article is part of a series aimed at helping you navigate lifeâ€™s opportunities and challenges. What else should we write about? Contact us:')

[1]
