In [125]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Reading True and Fake News files and merging them

In [126]:
# Load both corpora (true news first, then fake news).
df_true, df_fake = (pd.read_csv(path) for path in ('TrueNews.csv', 'FakeNews.csv'))

# Tag each frame with its class label: 1 for the true-news file, 0 for the fake-news file.
df_true['label'], df_fake['label'] = 1, 0

# Stack the two frames and renumber the rows 0..n-1.
df = pd.concat((df_true, df_fake), ignore_index=True)

Randomizing the data

In [127]:
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible,
# then replace the shuffled index with a clean 0..n-1 one.
df = (
    df.sample(frac=1, random_state=0)
      .reset_index(drop=True)
)

# Preview the first and last five rows.
print(df.head(), df.tail(), sep='\n')

                                               title  \
0  RUBIO Sides With Democrats On Giving A Whoppin...   
1   Ted Cruz Says Endorsing Trump Is A ‘Grave Mis...   
2   ‘Responsible Gun Owner’ Road Rages, Threatens...   
3  TEACHER’S UNION PROTEST Gets Ugly As Protester...   
4  Etihad advises checks with U.S. missions after...   

                                                text          subject  \
0  Democrats want to spend a whopping $2 billion ...         politics   
1  Republican presidential candidate and Texas Se...             News   
2  The National Rifle Association tells us that a...             News   
3  In case you don t live in the Midwest, you mig...  Government News   
4  DUBAI (Reuters) - Etihad Airways is advising s...     politicsNews   

               date  label  
0      May 16, 2016      0  
1     March 1, 2016      0  
2  February 2, 2016      0  
3      Jan 20, 2016      0  
4    March 7, 2017       1  
                                                  

Dropping rows with null values and the date column (the date is not a determinant of whether a news item is fake or real)

In [128]:
# Remove every row that has at least one missing value.
df = df.dropna(axis=0, how='any')
print(df.shape)

# The date column is not used as a feature, so drop it.
df = df.drop(columns=['date'])
print(df.shape)

(44898, 5)
(44898, 4)


### Pre Processing of Data

Lower-casing the text data:

In [129]:
# Lower-case only the free-text columns. Running `astype(str).str.lower()`
# over the whole frame would also convert the integer `label` column into the
# strings '0'/'1', and those strings would end up in the label array that is
# built from `df['label']` further down.
text_columns = ['title', 'text', 'subject']
df[text_columns] = df[text_columns].apply(lambda col: col.astype(str).str.lower())
print(df.head())

                                               title  \
0  rubio sides with democrats on giving a whoppin...   
1   ted cruz says endorsing trump is a ‘grave mis...   
2   ‘responsible gun owner’ road rages, threatens...   
3  teacher’s union protest gets ugly as protester...   
4  etihad advises checks with u.s. missions after...   

                                                text          subject label  
0  democrats want to spend a whopping $2 billion ...         politics     0  
1  republican presidential candidate and texas se...             news     0  
2  the national rifle association tells us that a...             news     0  
3  in case you don t live in the midwest, you mig...  government news     0  
4  dubai (reuters) - etihad airways is advising s...     politicsnews     1  


Removing links, punctuation and any other non-letter characters from the title and text

In [130]:
# Clean the titles in three passes:
#   1. delete punctuation (anything that is neither a word character nor whitespace),
#   2. delete link remnants (tokens starting with "http" or "www"),
#   3. turn every remaining non-letter into a space.
# `regex=True` is passed explicitly because the patterns are regular
# expressions; pandas >= 2.0 treats `pat` as a literal string by default, in
# which case none of these patterns would match as intended.
# Raw strings keep `\w`, `\s` and `\S` from being read as string escapes.
# The "." in the link pattern is deliberately left unescaped: pass 1 has
# already removed literal dots, so "www.site.com" arrives here as "wwwsitecom".
df['title'] = (
    df['title']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'http\S+|www.\S+', '', case=False, regex=True)
    .str.replace(r'[^a-zA-Z]', ' ', regex=True)
)

  df['title'] = df['title'].str.replace('[^\w\s]','')
  df['title'] = df['title'].str.replace('http\S+|www.\S+', '', case=False)
  df['title'] = df['title'].str.replace('[^a-zA-Z]', ' ')


In [131]:
# Clean the article bodies in three passes:
#   1. delete punctuation (anything that is neither a word character nor whitespace),
#   2. delete link remnants (tokens starting with "http" or "www"),
#   3. turn every remaining non-letter into a space.
# `regex=True` is passed explicitly because the patterns are regular
# expressions; pandas >= 2.0 treats `pat` as a literal string by default, in
# which case none of these patterns would match as intended.
# Raw strings keep `\w`, `\s` and `\S` from being read as string escapes.
# The "." in the link pattern is deliberately left unescaped: pass 1 has
# already removed literal dots, so "www.site.com" arrives here as "wwwsitecom".
df['text'] = (
    df['text']
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.replace(r'http\S+|www.\S+', '', case=False, regex=True)
    .str.replace(r'[^a-zA-Z]', ' ', regex=True)
)

  df['text'] = df['text'].str.replace('[^\w\s]','')
  df['text'] = df['text'].str.replace('http\S+|www.\S+', '', case=False)
  df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ')


In [132]:
# Show the first five rows after cleaning.
print(df.head(5))

                                               title  \
0  rubio sides with democrats on giving a whoppin...   
1   ted cruz says endorsing trump is a grave mist...   
2   responsible gun owner road rages threatens to...   
3  teachers union protest gets ugly as protesters...   
4  etihad advises checks with us missions after n...   

                                                text          subject label  
0  democrats want to spend a whopping   billion o...         politics     0  
1  republican presidential candidate and texas se...             news     0  
2  the national rifle association tells us that a...             news     0  
3  in case you don t live in the midwest you migh...  government news     0  
4  dubai reuters  etihad airways is advising some...     politicsnews     1  


Tokenizing the data

In [133]:
from nltk.tokenize import RegexpTokenizer

# Each token is a maximal run of word characters.
tokenizer = RegexpTokenizer(r'\w+')

# Replace each cleaned string with its list of tokens.
df['title'] = df['title'].apply(tokenizer.tokenize)
df['text'] = df['text'].apply(tokenizer.tokenize)

print(df.head())

                                               title  \
0  [rubio, sides, with, democrats, on, giving, a,...   
1  [ted, cruz, says, endorsing, trump, is, a, gra...   
2  [responsible, gun, owner, road, rages, threate...   
3  [teachers, union, protest, gets, ugly, as, pro...   
4  [etihad, advises, checks, with, us, missions, ...   

                                                text          subject label  
0  [democrats, want, to, spend, a, whopping, bill...         politics     0  
1  [republican, presidential, candidate, and, tex...             news     0  
2  [the, national, rifle, association, tells, us,...             news     0  
3  [in, case, you, don, t, live, in, the, midwest...  government news     0  
4  [dubai, reuters, etihad, airways, is, advising...     politicsnews     1  


Lemmatization of data

In [134]:
import nltk
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def _lemmatize_tokens(tokens):
    """Return a new list with every token passed through `lemmatizer.lemmatize`."""
    return [lemmatizer.lemmatize(token) for token in tokens]


# NOTE(review): `lemmatize` is called without a part-of-speech argument; the
# printed preview shows side effects such as 'us' -> 'u' and 'as' -> 'a'.
df['title'] = df['title'].apply(_lemmatize_tokens)
df['text'] = df['text'].apply(_lemmatize_tokens)
print(df.head())

                                               title  \
0  [rubio, side, with, democrat, on, giving, a, w...   
1  [ted, cruz, say, endorsing, trump, is, a, grav...   
2  [responsible, gun, owner, road, rage, threaten...   
3  [teacher, union, protest, get, ugly, a, protes...   
4  [etihad, advises, check, with, u, mission, aft...   

                                                text          subject label  
0  [democrat, want, to, spend, a, whopping, billi...         politics     0  
1  [republican, presidential, candidate, and, tex...             news     0  
2  [the, national, rifle, association, tell, u, t...             news     0  
3  [in, case, you, don, t, live, in, the, midwest...  government news     0  
4  [dubai, reuters, etihad, airway, is, advising,...     politicsnews     1  


Removal of Stop words

In [135]:
from nltk.corpus import stopwords

# `stop` is kept as the original list so any other cell that uses it still
# sees the same object type. Membership tests go through a set, which avoids
# scanning the whole stop-word list once per token of every document.
stop = stopwords.words('english')
stop_lookup = set(stop)

# Keep only the tokens that are not English stop words (order preserved).
df['title'] = df['title'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_lookup])
df['text'] = df['text'].apply(lambda tokens: [tok for tok in tokens if tok not in stop_lookup])
print(df.head())

                                               title  \
0  [rubio, side, democrat, giving, whopping, bill...   
1  [ted, cruz, say, endorsing, trump, grave, mist...   
2  [responsible, gun, owner, road, rage, threaten...   
3  [teacher, union, protest, get, ugly, protester...   
4  [etihad, advises, check, u, mission, new, trum...   

                                                text          subject label  
0  [democrat, want, spend, whopping, billion, zik...         politics     0  
1  [republican, presidential, candidate, texas, s...             news     0  
2  [national, rifle, association, tell, u, need, ...             news     0  
3  [case, live, midwest, might, noticed, fight, c...  government news     0  
4  [dubai, reuters, etihad, airway, advising, pas...     politicsnews     1  


Making another data frame in which the token lists are joined back into strings and the title and text columns are merged into one

In [136]:
# Work on a copy so the columns of `df` keep holding token lists.
df2 = df.copy()

# Collapse each token list back into a single space-separated string.
df2['title'] = df2['title'].apply(' '.join)
df2['text'] = df2['text'].apply(' '.join)

# Combined feature: the title, one space, then the body text.
df2['title_text'] = df2['title'] + ' ' + df2['text']

print(df2.head())

                                               title  \
0  rubio side democrat giving whopping billion zi...   
1  ted cruz say endorsing trump grave mistake pro...   
2  responsible gun owner road rage threatens shoo...   
3  teacher union protest get ugly protester cop d...   
4     etihad advises check u mission new trump order   

                                                text          subject label  \
0  democrat want spend whopping billion zika viru...         politics     0   
1  republican presidential candidate texas senato...             news     0   
2  national rifle association tell u need ensure ...             news     0   
3  case live midwest might noticed fight continue...  government news     0   
4  dubai reuters etihad airway advising passenger...     politicsnews     1   

                                          title_text  
0  rubio side democrat giving whopping billion zi...  
1  ted cruz say endorsing trump grave mistake pro...  
2  responsible gun owne

Functions to convert text into numeric feature vectors using TF-IDF and bag-of-words (BoW)

In [137]:
def to_vector_Tfidf(df, col, max_features=2000):
    """Vectorise one text column with TF-IDF.

    Parameters
    ----------
    df : object indexable by ``col`` (the notebook passes a pandas DataFrame)
        ``df[col]`` must be an iterable of strings, one per document.
    col : column label
        Name of the column to vectorise.
    max_features : int or None, default 2000
        Forwarded to ``TfidfVectorizer(max_features=...)``. The default keeps
        the value that was previously hard-coded.

    Returns
    -------
    The document-term matrix returned by ``TfidfVectorizer.fit_transform``.

    Notes
    -----
    The fitted vectorizer is local to this call and is not returned, so the
    learned vocabulary cannot be re-applied to other data afterwards.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(max_features=max_features)
    # fit_transform learns the vocabulary and vectorises in one pass instead
    # of tokenising the whole column twice (fit, then transform).
    return vectorizer.fit_transform(df[col])

def to_vector_bow(df, col, max_features=2000):
    """Vectorise one text column as bag-of-words term counts.

    Parameters
    ----------
    df : object indexable by ``col`` (the notebook passes a pandas DataFrame)
        ``df[col]`` must be an iterable of strings, one per document.
    col : column label
        Name of the column to vectorise.
    max_features : int or None, default 2000
        Forwarded to ``CountVectorizer(max_features=...)``. The default keeps
        the value that was previously hard-coded.

    Returns
    -------
    The document-term matrix returned by ``CountVectorizer.fit_transform``.

    Notes
    -----
    The fitted vectorizer is local to this call and is not returned, so the
    learned vocabulary cannot be re-applied to other data afterwards.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(max_features=max_features)
    # fit_transform learns the vocabulary and vectorises in one pass instead
    # of tokenising the whole column twice (fit, then transform).
    return vectorizer.fit_transform(df[col])

In [152]:
# TF-IDF features for the title, the body text, and the combined column,
# each converted from the vectorizer's output to a dense array via .toarray().
title_vector_tfidf = to_vector_Tfidf(df2, 'title').toarray()
text_vector_tfidf = to_vector_Tfidf(df2, 'text').toarray()
title_text_vector_tfidf = to_vector_Tfidf(df2, 'title_text').toarray()

  (0, 1996)	0.4629419088058745
  (0, 1634)	0.4478063918308697
  (0, 1537)	0.409062487450288
  (0, 747)	0.44298578614975054
  (0, 483)	0.28177452318181667
  (0, 185)	0.37707932484096945
  (1, 1893)	0.22660073174650244
  (1, 1841)	0.1838503253174762
  (1, 1769)	0.5136887993459774
  (1, 1558)	0.2887695769171622
  (1, 1391)	0.5760965322164531
  (1, 431)	0.4854893414837819
  (2, 1893)	0.14335066558909626
  (2, 1798)	0.33777492844498125
  (2, 1620)	0.3743912245160185
  (2, 1527)	0.40172570833334936
  (2, 1499)	0.4051582246210042
  (2, 1429)	0.42945116825303636
  (2, 1281)	0.3582278769821827
  (2, 782)	0.2947840665970809
  (3, 1893)	0.15761364184178936
  (3, 1864)	0.3998080945150127
  (3, 1858)	0.47218031631323404
  (3, 1765)	0.4110980882863733
  (3, 1404)	0.34403542120714553
  :	:
  (44893, 1242)	0.24875230474013843
  (44893, 440)	0.3738612740911522
  (44893, 142)	0.3216312163792396
  (44894, 1861)	0.3394236290600913
  (44894, 1814)	0.34550222731762537
  (44894, 1634)	0.4758590376607832
  (4

In [147]:
# Bag-of-words count features for the title, the body text, and the combined
# column, each converted from the vectorizer's output to a dense array via .toarray().
title_vector_bow = to_vector_bow(df2, 'title').toarray()
text_vector_bow = to_vector_bow(df2, 'text').toarray()
title_text_vector_bow = to_vector_bow(df2, 'title_text').toarray()

In [148]:
# Report the (rows, columns) shape of every feature matrix, TF-IDF first, then BoW.
print(
    "Shapes of title_vector_tfidf, text_vector_tfidf, title_text_vector_tfidf: ",
    *(matrix.shape for matrix in (title_vector_tfidf, text_vector_tfidf, title_text_vector_tfidf)),
)
print(
    "Shapes of title_vector_bow, text_vector_bow, title_text_vector_bow: ",
    *(matrix.shape for matrix in (title_vector_bow, text_vector_bow, title_text_vector_bow)),
)

Shapes of title_vector_tfidf, text_vector_tfidf, title_text_vector_tfidf:  (44898, 2000) (44898, 2000) (44898, 2000)
Shapes of title_vector_bow, text_vector_bow, title_text_vector_bow:  (44898, 2000) (44898, 2000) (44898, 2000)


In [159]:
# Pull the subject column out as an (n_rows, 1) column vector.
subject = df['subject'].to_numpy().reshape(-1, 1)
print(subject.shape)

# Same for the class labels.
labels = df['label'].to_numpy().reshape(-1, 1)
print(labels.shape)

(44898, 1)
(44898, 1)
