# News Classification Using Natural Language Processing By Ashmit Sharma

In [1]:
import nltk
import pandas as pd

In [2]:
#punkt dataset is used by NLTK for Tokenization
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

In [3]:
fake = pd.read_csv("Fake.csv")
authentic = pd.read_csv("True.csv")

In [4]:
fake.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [5]:
authentic.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [6]:
#Adding a column 'Genuineness' to both fake and authentic dataframes
fake["Genuineness"] = 0
authentic["Genuineness"] = 1

In [7]:
#Concatenating the fake and authentic data along the rows i.e axis=0
news_df = pd.concat([fake, authentic], axis=0)

In [8]:
#Resetting the index for the news_df dataframe and dropping the old index
news_df = news_df.reset_index(drop=True)

In [9]:
#Dropping or Removing the title and subject column from the dataframe as they are not required for text classification task
news_df = news_df.drop(['title','subject'], axis=1)

In [10]:
#Importing the word_tokenize function from NLTK and tokenizing the text column to split the text into a list of words
from nltk.tokenize import word_tokenize
news_df['text'] = news_df['text'].apply(word_tokenize)

In [11]:
#Importing the Snowball stemmer from NLTK and 
#Create an instance of it for stemming words in the English language, while not ignoring stopwords.
from nltk.stem.snowball import SnowballStemmer
sb = SnowballStemmer("english", ignore_stopwords=False)

In [12]:
#Stem each word in text
def stem_it(text):
    return [sb.stem(word) for word in text]
news_df['text'] = news_df['text'].apply(stem_it)

In [13]:
#Remove words whose length is less than or equal to 2
def stopword_remover(text):
    return[word for word in text if len(word)>2]

In [14]:
#Converting the stemmed words to space separated sentences
news_df['text'] = news_df['text'].apply(' '.join)

In [15]:
from sklearn.model_selection import train_test_split
X = news_df['text']
Y = news_df['Genuineness']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=2)

In [16]:
X_train

30569    beirut ( reuter ) - extend u.s. sanction on ir...
10965    polic on tuesday arrest two peopl and were see...
16754    sinc this articl was written in 2006 , the ill...
43980    london ( reuter ) - two men arrest on suspicio...
23519    new york ( reuter ) - a feder judg in new york...
                               ...                        
44566    xiamen , china ( reuter ) - the nation of the ...
30280    baton roug , la./washington ( reuter ) - u.s. ...
6637     after the west wing wrap up it final season in...
35343    brussel ( reuter ) - the european union s exec...
23720    ( reuter ) - presid donald trump ’ s administr...
Name: text, Length: 33673, dtype: object

In [17]:
#Transforming the text data to feature vectors to use as input to our ML model
from sklearn.feature_extraction.text import TfidfVectorizer
feature_extraction = TfidfVectorizer(max_df=0.7)

X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

In [18]:
#Training the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model1 = LogisticRegression(max_iter=900)
model1.fit(X_train_features,Y_train)

In [19]:
#Accuracy score for training data in Model 1
training_data_prediction = model1.predict(X_train_features)
training_data_prediction_accuracy = accuracy_score(Y_train, training_data_prediction)
training_data_prediction_accuracy

0.9927241410031776

In [20]:
#Accuracy score for testing data in Model 1
testing_data_prediction = model1.predict(X_test_features)
testing_data_prediction_accuracy = accuracy_score(Y_test, testing_data_prediction)
testing_data_prediction_accuracy

0.9885968819599109

In [22]:
#Trying the same with Model 2 i.e Passive Aggressive Classifier
from sklearn.linear_model import PassiveAggressiveClassifier
model2 = PassiveAggressiveClassifier(max_iter=100)
model2.fit(X_train_features,Y_train)

In [23]:
#Accuracy score for training data in Model 2
training_data_prediction = model2.predict(X_train_features)
training_data_prediction_accuracy = accuracy_score(Y_train, training_data_prediction)
training_data_prediction_accuracy

0.9999703026163395

In [24]:
#Accuracy score for testing data in Model 2
testing_data_prediction = model2.predict(X_test_features)
testing_data_prediction_accuracy = accuracy_score(Y_test, testing_data_prediction)
testing_data_prediction_accuracy

0.996347438752784

In [34]:
def fake_or_not():
    input_news = input("Enter the news: ")
    news_df = feature_extraction.transform([input_news])
    news_prediction = model1.predict(news_df)

    print("\nThe Entered News is: ")
    if (news_prediction[0] == 0):
        print("Fake News")
    else:
        print("Authentic News")

In [37]:
fake_or_not()

Enter the news: Babar's Pakistan side is set to meet Jos Buttler's England in their final league fixture of the ICC World Cup on Saturday. A win by a narrow margin would have been enough for Pakistan to qualify if Sri Lanka had beaten New Zealand. If New Zealand had wrapped up the contest in 40 overs, Pakistan would have only gotten ahead of New Zealand on the basis of Net-Run-Rate (NRR) with a win over England by around 183 runs (Assuming Pakistan crossed the 300-run barrier).

The Entered News is: 
Authentic News
