In [1]:
import pandas as pd
import numpy as np

In [2]:
import re #useful to search text in a document
from nltk.corpus import stopwords # corpus means body/content of the text (nltk = natural language tool kit)
# stopwords means those words that dont add much value to text 
from nltk.stem.porter import PorterStemmer # stemming take a word removes prefix and suffix acting=>act and like that it extracts the root word
from sklearn.feature_extraction.text import TfidfVectorizer # to convert text to featureVectors (featureVectors are like numbers)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Aayush
[nltk_data]     Bhagat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Lets see these stopwords

In [4]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

The above words don't add much value and during stemming we will remove them

importing the dependencies

loading the dataset

In [5]:
data = pd.read_csv('./dataset/train.csv/train.csv')

In [6]:
data.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


label is the mark for news true or false 1 = fake 0 = real news

In [7]:
data.shape

(20800, 5)

In [8]:
data.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [9]:
data.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

We need to fix this null values in title author and text 

In [10]:
# we need to consider all possible combinations of these 
# either we can drop or fill it with a null string but there are too many misssing values so we can't just drop them

#replacing with a null values with a empty string ''
data = data.fillna('')

In [11]:
data.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

Now we will combine title and author together for processing cuz text has a really long data

In [12]:
#merging author and news title
data['content'] = data['author']+' '+data['title']
data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


Now lets seperate content and label columns

In [13]:
X = data.drop(columns='label',axis=1) # we want to remove the label col from data and store in one var and other var will have other datastuff
# to remove col axis=1 and if row then axis = 2
Y = data['label'] # label loaded here

In [14]:
# print(X)
X.head()

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...


In [15]:
# print(Y)
Y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

# Now we will do Stemming 


we need to find the root words like
acting , actor etc should be reduced to -> act

In [16]:
port_stem = PorterStemmer() #loading the imported function to a variable

In [17]:
# making a function
def stemming(content):
    # call the regular expression lib to search paragraph or text and here sub means substitute for certain values
    # extracting only alphabets[^a-zA-Z^] numbers , and other will be replaced by ' '(space)
    # and all this above process of text cleaning will be done for content
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower() #converting to lowercases
    stemmed_content = stemmed_content.split() #spliting them using delimiter (spliting to their respected lists)
    print(stemmed_content)
    # running the loop for main stemming and excluding or removing unnescessary words with the help of stopwords
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    # now we need to join the words
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

Apply the above function to the content column

In [18]:
# look at the content before
data.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [19]:
data['content'] = data['content'].apply(stemming)

['darrell', 'lucus', 'house', 'dem', 'aide', 'we', 'didn', 't', 'even', 'see', 'comey', 's', 'letter', 'until', 'jason', 'chaffetz', 'tweeted', 'it']
['daniel', 'j', 'flynn', 'flynn', 'hillary', 'clinton', 'big', 'woman', 'on', 'campus', 'breitbart']
['consortiumnews', 'com', 'why', 'the', 'truth', 'might', 'get', 'you', 'fired']
['jessica', 'purkiss', 'civilians', 'killed', 'in', 'single', 'us', 'airstrike', 'have', 'been', 'identified']
['howard', 'portnoy', 'iranian', 'woman', 'jailed', 'for', 'fictional', 'unpublished', 'story', 'about', 'woman', 'stoned', 'to', 'death', 'for', 'adultery']
['daniel', 'nussbaum', 'jackie', 'mason', 'hollywood', 'would', 'love', 'trump', 'if', 'he', 'bombed', 'north', 'korea', 'over', 'lack', 'of', 'trans', 'bathrooms', 'exclusive', 'video', 'breitbart']
['life', 'life', 'of', 'luxury', 'elton', 'john', 's', 'favorite', 'shark', 'pictures', 'to', 'stare', 'at', 'during', 'long', 'transcontinental', 'flights']
['alissa', 'j', 'rubin', 'beno', 't', 'ha

In [21]:
# print(data['content'])
data['content'].head()

0    darrel lucu hous dem aid even see comey letter...
1    daniel j flynn flynn hillari clinton big woman...
2               consortiumnew com truth might get fire
3    jessica purkiss civilian kill singl us airstri...
4    howard portnoy iranian woman jail fiction unpu...
Name: content, dtype: object