## Importing all the dependencies

## Explanation of each of the imports

### Pandas - used to read the given data set 
### Numpy - for matrix multiplications and other math related operations
### re (Regular Expression) - used for searching text in a paragraph
### nltk.stem.porter import PorterStemmer - part of NLTK (the Natural Language Toolkit); the Porter stemmer strips common suffixes from a word and gives us its root word (stem)
### from sklearn.feature_extraction.text import TfidfVectorizer - This is used for creating feature vectors from text (numerical)
### nltk.corpus import stopwords - used to remove non-useful words such as a, an, the, etc.
### The remaining scikit-learn imports are used to split the data, train a logistic regression model and measure its accuracy

In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics import accuracy_score
import nltk

## Getting all the stopwords (the words that won't add much value to our model)

In [2]:
# Make sure the NLTK stop-word corpus is available locally, then show the
# English list that the preprocessing step filters out.
nltk.download("stopwords")
english_stopwords = stopwords.words('english')
print(english_stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aayushmalaviya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Load the labelled training data and preview its first five rows.
train_df = pd.read_csv("train.csv")
train_df.head(n=5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Class balance check: number of rows for each value of the label column.
label_column = train_df["label"]
label_column.value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [5]:
## Count the missing (NaN) entries in every column of the dataset
train_df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
## The number of missing values is small compared to the available data, so
## instead of dropping those rows we replace every null with an empty string.
train_df = train_df.fillna(value='')
## Re-check: every column should now report zero missing values.
train_df.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [7]:
## Join author and title (separated by a space) into a single "content"
## column so both can be preprocessed in one pass.
author_and_title = train_df["author"] + " " + train_df["title"]
train_df["content"] = author_and_title
train_df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Howard Portnoy Iranian woman jailed for fictio...


In [8]:
## Separate the target column (Y) from the remaining feature columns (X)
Y = train_df["label"]
X = train_df.drop(columns="label")
X.head()

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss 15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy Iranian woman jailed for fictio...


## Stemming - the process of reducing every word to its root form (the Porter stemmer does this by stripping suffixes) so that only the root word is kept for processing

In [9]:
# Single shared Porter stemmer instance; the stemming() helper defined below
# calls port_stem.stem(word) on each word it keeps.
port_stem = PorterStemmer()

In [10]:
_STOPWORDS_CACHE = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return the English stop words as a frozenset, building it only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE


def stemming(content):
    """Normalise a piece of text into a space-separated string of word stems.

    Steps:
      1. replace every character that is not an ASCII letter with a space,
      2. lowercase the text and split it on whitespace,
      3. drop English stop words,
      4. stem each remaining word with the shared ``port_stem`` stemmer,
      5. join the stems back together with single spaces.

    Parameters
    ----------
    content : str
        Raw text to clean.

    Returns
    -------
    str
        The stemmed, stop-word-free text (empty string if nothing is left).
    """
    # Fetch the stop words once per call from the cache. The previous version
    # called stopwords.words('english') for every single word and scanned the
    # resulting list linearly; a frozenset gives the same membership answers.
    stop_words = _english_stopwords()

    letters_only = re.sub('[^a-zA-Z]', ' ', content)  # anything but a-z / A-Z becomes a space
    words = letters_only.lower().split()
    stems = [port_stem.stem(word) for word in words if word not in stop_words]
    return " ".join(stems)

In [11]:
## Run the stemming function over every entry of the content column
stemmed_content = train_df["content"].map(stemming)
train_df["content"] = stemmed_content

## vectorization of the content column so that it can be used for training the model


In [12]:
# Learn the vocabulary and TF-IDF weights from the cleaned content column and
# encode that column as a numeric feature matrix in one step.
# NOTE(review): the vectorizer is fitted on all rows of train_df before the
# train/test split performed later, so the held-out rows also influence the
# learned weights - confirm this is acceptable for the reported test accuracy.
vector1 = TfidfVectorizer()
X = vector1.fit_transform(train_df["content"])
X.shape

(20800, 17128)

In [13]:
# Hold out 20% of the rows for evaluation, keeping the label proportions the
# same in both parts (stratify) and fixing the seed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=1,
)

## Logistic regression is the classifier used for this task; fit() returns the
## fitted estimator, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, Y_train)

# Predictions on the rows the model was trained on (used for training accuracy).
pred = model.predict(X_train)


### Training dataset accuracy 

In [14]:
# accuracy_score expects (y_true, y_pred): pass the true labels first and the
# predictions second. Accuracy itself is symmetric, so the printed value is
# unchanged, but the documented order keeps the call correct by convention.
accuracy = accuracy_score(Y_train, pred)
print(accuracy)

0.9868389423076923


### Test dataset accuracy - useful for checking whether the model is overfitting or generalizes well

In [15]:
# Predict on the held-out split and score it. accuracy_score expects
# (y_true, y_pred), so the true labels go first; the value is the same either
# way because accuracy is symmetric.
pred1 = model.predict(X_test)
accuracy1 = accuracy_score(Y_test, pred1)
print(accuracy1)

0.9766826923076923


## Model accuracy is 97.67%

## Running this on our actual testing data 

In [161]:
# Load the unlabelled test file and replace every missing value with a single
# space, then confirm that no nulls are left in any column.
df1 = pd.read_csv('test.csv').fillna(" ")
df1.isna().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [162]:
# Build the same author + title text used for training and clean it with the
# same stemming function.
author_and_title = df1["author"] + ' ' + df1["title"]
df1["content"] = author_and_title.apply(stemming)


In [172]:
## Reuse the already-fitted vectorizer (transform only, no re-fit) so the test
## features line up with the columns the model was trained on.
XZ = vector1.transform(df1["content"])
pred3 = model.predict(XZ)

# Attach the predictions as the label column and discard the helper text column.
df1 = df1.assign(label=pred3).drop(columns="content")
df1.head()


Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,1
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",0
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1
