In [None]:
## About the Dataset:

#id: unique id for a news article
#title: the title of a news article
#author: author of the news article
#text: the text of the article; could be incomplete
#label: marks whether the news article is real or fake:
    #1: fake news
    #0: real news


In [32]:
## Importing necessary libraries
## pandas to import, clean and visualise data
import pandas as pd
## numpy for numerical operations
import numpy as np
## re module for working with regular expressions
import re
## nltk: Natural Language Toolkit, a Python library for natural language processing (NLP) and text analysis
import nltk
## Stopwords words that are very common and do not carry much meaningful information
from nltk.corpus import stopwords
## Porter Stemmer is used for stemming, the process of reducing a word to its base or root form by removing suffixes.
## lemmatization, the process of reducing a word to its base or dictionary form, considering the context and part of speech of the word to generate valid words.
from nltk.stem import PorterStemmer, WordNetLemmatizer
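## TfidfVectorizer turns raw text into a matrix of TF-IDF features: how frequently a term appears in a document, weighted down by how common the term is across all documents.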
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [16]:
## show every English stopword that ships with NLTK
english_stopword_list = stopwords.words('english')
print(english_stopword_list)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# 1. Data Gathering and analysis

In [17]:
## read the training split of the dataset into a DataFrame
TRAIN_CSV = "./data/train.csv"
df = pd.read_csv(TRAIN_CSV)

In [18]:
## printing the first 5 lines of data
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [19]:
## returns the number of rows and number of cols
df.shape

(20800, 5)

In [20]:
## gives a summary of the DataFrame, including the number of non-null (non-missing) values, the data types, and the memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [21]:
## counts the occurrences of each unique value in the label column
df['label'].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [22]:
## count the number of null/missing values
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [23]:
## remove every row that has a missing value in any column, then renumber the rows 0..n-1
## (later cells look titles up by integer label, e.g. df['title'][0] and df['title'][i] for
## i in range(len(df)); without the reset the labels of the dropped rows are missing and
## those lookups raise KeyError)
df = df.dropna().reset_index(drop=True)

In [41]:
## checking the updated number of rows and columns after removing the null values
df.shape

(18285, 3)

In [46]:
df.head()

Unnamed: 0,level_0,index,title,label
0,0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,2,2,Why the Truth Might Get You Fired,1
3,3,3,15 Civilians Killed In Single US Airstrike Hav...,1
4,4,4,Iranian woman jailed for fictional unpublished...,1


In [47]:
## show the title stored under row label 0
df.loc[0, 'title']

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [26]:
## keep only the columns used from here on by dropping id, text and author
## (`columns=` is the keyword form of passing the labels with axis=1)
unused_columns = ['id', 'text', 'author']
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,index,title,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0
2,2,Why the Truth Might Get You Fired,1
3,3,15 Civilians Killed In Single US Airstrike Hav...,1
4,4,Iranian woman jailed for fictional unpublished...,1


# 2. Data Preprocessing


In [48]:
## small worked example of text preprocessing: split a sentence into words on whitespace
sample_sentence = 'The quick brown fox jumps over the lazy dog'
sample_data = sample_sentence.split()
sample_data

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [28]:
## lowercase every word of the sample
sample_data = list(map(str.lower, sample_data))
sample_data

['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']

In [49]:
## build the list of English stopwords, then print the first 10 and the total count
## NOTE: the list is deliberately kept under the name `stopwords` because later cells test
## membership against it. It is fetched through `nltk.corpus` rather than through the bare
## name `stopwords`: after the first run that name refers to this list, not to the corpus
## reader, so `stopwords.words(...)` would fail when the cell is re-run.
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[0:10])
print(len(stopwords))


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
179


In [30]:
## drop the stopwords from the sample, then print what is left and show how many words remain
kept_tokens = []
for token in sample_data:
    if token in stopwords:
        continue
    kept_tokens.append(token)
sample_data = kept_tokens
print(sample_data)
len(sample_data)

['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']


6

In [33]:
## stemming strips suffixes to reduce a word to a root form;
## the result is not always a dictionary word (e.g. 'lazy' becomes 'lazi' below)
ps = PorterStemmer()
sample_data_stemming = list(map(ps.stem, sample_data))
print(sample_data_stemming)

['quick', 'brown', 'fox', 'jump', 'lazi', 'dog']


In [34]:
## lemmatizing is similar to stemming, but the result is a valid dictionary word
## WordNet is a large lexical database of English (meanings, relations between words, parts of speech);
## the WordNet lemmatizer looks the base form (lemma) of a word up in that database.
## No part of speech is passed here, so the lemmatizer's default is used for every word.
lm = WordNetLemmatizer()
sample_data_lemma = list(map(lm.lemmatize, sample_data))
print(sample_data_lemma)


['quick', 'brown', 'fox', 'jump', 'lazy', 'dog']


In [50]:
lm = WordNetLemmatizer()
## set membership is O(1) per token; `stopwords` is the list of English stopwords built in an earlier cell
stopword_set = set(stopwords)
## matches any single character that is NOT an ASCII letter or digit.
## The brackets matter: without them the pattern '^a-zA-Z0-9' only matches that literal
## text at the start of the string, so nothing is ever cleaned.
non_alphanumeric = re.compile('[^a-zA-Z0-9]')


def clean_title(title):
    """Return the cleaned, preprocessed form of one article title.

    Steps: replace every character that is not a letter or digit with a space,
    lowercase, split into words, drop stopwords, lemmatize the remaining words,
    and join them back into a single space-separated string.
    """
    words = non_alphanumeric.sub(' ', title).lower().split()
    return " ".join(lm.lemmatize(word) for word in words if word not in stopword_set)


## iterate over the title values themselves rather than df['title'][i]:
## a label lookup raises KeyError whenever the index is not exactly 0..n-1 (e.g. after dropna)
corpus = [clean_title(title) for title in df['title']]

In [52]:
len(corpus)

18285

In [53]:
## the raw (uncleaned) title stored under row label 0, for comparison with the corpus
df.loc[0, 'title']

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [54]:
## TF-IDF (Term Frequency-Inverse Document Frequency) scores how important a term is to one
## document: it grows with the term's frequency inside that document and shrinks the more
## documents of the corpus contain the term.
## Step 1: create the vectorizer.
## Step 2: fit it on the cleaned titles and transform them into a TF-IDF feature matrix.
## Step 3: convert that matrix into a regular array `x` (one row per title, one column per term).
tf = TfidfVectorizer()
tfidf_matrix = tf.fit_transform(corpus)
x = tfidf_matrix.toarray()
x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [55]:
## target vector: the label column (reminder: 0 = real news, 1 = fake news)
y = df.loc[:, 'label']
y.head(5)

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64