In [4]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhanu\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [6]:
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [12]:
news_data = pd.read_csv('news_data.csv')
news_data.head(190)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
...,...,...,...,...
185,185,,Cool,1
186,186,Trump's daughter Ivanka joined call with Argen...,"BUENOS AIRES (Reuters) - Ivanka Trump, daughte...",0
187,187,"Trump Sons Forge Ahead Without Father, Expandi...",President Trump’s old office on the 26th floor...,0
188,188,Jack Heart: LUCIFER in the Temple of the Dog,LUCIFER in the Temple of the Dog I By Jack Hea...,1


In [13]:
news_data.shape

(72134, 4)

In [16]:
news_data.isnull().sum()
news_data = news_data.dropna()

In [20]:
news_data.isnull().sum()
news_data.shape

(71537, 4)

In [19]:
news_data.drop_duplicates(inplace=True)
news_data.shape

(71537, 4)

In [22]:
news_data.drop('Unnamed: 0', axis=1, inplace=True)
news_data.head(10)

Unnamed: 0,title,text,label
0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
6,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...,1
7,HOUSE INTEL CHAIR On Trump-Russia Fake Story: ...,,1
8,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o...",1
9,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ...",1
10,GOP Senator Just Smacked Down The Most Puncha...,The most punchable Alt-Right Nazi on the inter...,1


In [30]:
print((news_data['text']==' ').sum())
print((news_data['title']==' ').sum())

738
0


In [32]:
news_data.drop(news_data[news_data['text']==' '].index, inplace=True)
news_data.shape


(70799, 3)

In [35]:
news_data.info()


<class 'pandas.core.frame.DataFrame'>
Index: 70799 entries, 0 to 72133
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   70799 non-null  object
 1   text    70799 non-null  object
 2   label   70799 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.2+ MB


In [36]:
X = news_data['title']
Y = news_data['label']

In [37]:
X.head(10)

0     LAW ENFORCEMENT ON HIGH ALERT Following Threat...
2     UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...
3     Bobby Jindal, raised Hindu, uses story of Chri...
4     SATAN 2: Russia unvelis an image of its terrif...
5     About Time! Christian Group Sues Amazon and SP...
6     DR BEN CARSON TARGETED BY THE IRS: “I never ha...
8     Sports Bar Owner Bans NFL Games…Will Show Only...
9     Latest Pipeline Leak Underscores Dangers Of Da...
10     GOP Senator Just Smacked Down The Most Puncha...
11    May Brexit offer would hurt, cost EU citizens ...
Name: title, dtype: object

In [38]:
Y.head(10)

0     1
2     1
3     0
4     1
5     1
6     1
8     1
9     1
10    1
11    0
Name: label, dtype: int64

In [None]:
# Stemming process

