In [1]:
import numpy as np
import pandas as pd
import re
import itertools
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import PassiveAggressiveClassifier
#from sklearn.metrics import accuracy_score, confusion_matrix

import nltk

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
from nltk import sent_tokenize, word_tokenize



[nltk_data] Downloading package punkt to /home/femme_js/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/femme_js/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/femme_js/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#Reading the data
df=pd.read_csv('/home/femme_js/news.csv')


#getting the shape and head
df.shape
df.head(10)



Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL


In [3]:
# checking if column have nan values

check_nan_in_df = df.isnull()
print (check_nan_in_df)




      Unnamed: 0  title   text  label
0          False  False  False  False
1          False  False  False  False
2          False  False  False  False
3          False  False  False  False
4          False  False  False  False
...          ...    ...    ...    ...
6330       False  False  False  False
6331       False  False  False  False
6332       False  False  False  False
6333       False  False  False  False
6334       False  False  False  False

[6335 rows x 4 columns]


In [4]:
# as data dont have any NaN value, we dont need to fill them

In [5]:
#Getting the Labels

labels=df.label
labels.head()

0    FAKE
1    FAKE
2    REAL
3    FAKE
4    REAL
Name: label, dtype: object

In [6]:
#Fill all null spaces with and make a new freature called total

# Firstly, fill all the null spaces with a space
df = df.fillna(' ')
df['total'] = df['title'] + ' ' + df['text']

df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,total
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,The Battle of New York: Why This Primary Matte...


In [7]:
#PRE-PROCESSING THE DATA


In [8]:

stop_words = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

for index, row in df.iterrows():
    filter_sentence = ''
    sentence = row['total']
    # Cleaning the sentence with regex
    sentence = re.sub(r'[^\w\s]', '', sentence)
    # Tokenization
    words = nltk.word_tokenize(sentence)
    # Stopwords removal
    words = [w for w in words if not w in stop_words]
    # Lemmatization
    for words in words:
        filter_sentence = filter_sentence  + ' ' + str(lemmatizer.lemmatize(words)).lower()
    
    df.loc[index, 'total'] = filter_sentence

df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,total
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,you can smell hillarys fear daniel greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,watch the exact moment paul ryan committed po...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,kerry go paris gesture sympathy us secretary ...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,bernie supporter twitter erupt anger dnc we t...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,the battle new york why this primary matters ...


In [15]:
df['total'].head()

0     you can smell hillarys fear daniel greenfield...
1     watch the exact moment paul ryan committed po...
2     kerry go paris gesture sympathy us secretary ...
3     bernie supporter twitter erupt anger dnc we t...
4     the battle new york why this primary matters ...
Name: total, dtype: object

In [16]:
#Spiliting the dataset into training and testing

x_train,x_test,y_train,y_test=train_test_split(df['total'], labels, test_size=0.2, random_state=7)

x_train.head(10)

6237     shock claim nibiru will hit in 2017 world gov...
3722     spirit thank veteran annual veterans day para...
5774     obese woman loses more than 100 kilograms aft...
336      im glad the surprising palestinian reaction n...
3622     john kasich wooing iowans mostly new hampshir...
4024     obama administration mandate ordering sex cha...
1537     tesla worlds safest car explodes like bomb te...
5200     why hillary clinton unlikely indicted private...
279      american express disowns pink floyd singer ro...
4973     comment why were all deficient in magnesium t...
Name: total, dtype: object

In [17]:
y_train.head()

6237    FAKE
3722    FAKE
5774    FAKE
336     REAL
3622    REAL
Name: label, dtype: object