# Fake News Classification

In [35]:
import tensorflow as tf
from tensorflow import keras
import numpy as np 
import pandas as pd
from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings("ignore")

In [36]:
fake_df = pd.read_csv("datasets/train.csv")
fake_df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [37]:
fake_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [7]:
fake_df.shape

(20800, 5)

In [8]:
fake_df["label"].value_counts()

1    10413
0    10387
Name: label, dtype: int64

In [24]:
# Which author names have fake news label
fake_df[fake_df["label"] ==1].groupby("author").sum().head()

Unnamed: 0_level_0,id,label
author,Unnamed: 1_level_1,Unnamed: 2_level_1
# 1 NWO Hatr,195315,17
-NO AUTHOR-,603031,54
10 Habits That Will Make Your Life Easier &amp; More Peaceful - Wellness Solutions,2933,1
"10 More Beautiful Images That Remind You We Still Live In A Beautiful World, With Beautiful People - Upside Down Media",18384,1
10 Movies That Could Change Your Understanding Of Life - Upside Down Media,4064,1


In [25]:
# Which types of headlines are fake
fake_df[fake_df["label"] ==1].groupby("title").sum().head()

Unnamed: 0_level_0,id,label
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""Allahu Akbar, the Russians are here!"": Aleppo terrorists flee Russian airstrike (VIDEO)",13212,1
"""Authoritarianism"": How the West demonizes strong, popular leaders",590,1
"""Blue Alerts"" to be used to keep the 'War on Cops' lie, alive",1602,1
"""Democracy Is Coming to the USA"" - Russia News Now",14703,1
"""Donald, willst du mich heiraten?"": Clinton hat Kampf ums Weiße Haus noch nicht aufgegeben",5541,1


In [26]:
y = fake_df["label"]

In [49]:
# Clean up the author, text and title the same way. Link author first and second names with an underscore
fake_df['title_lower'] = fake_df["title"].str.lower()
fake_df['title_no_punctuation'] = fake_df['title_lower'].str.replace('[^\w\s]','')
fake_df['title_no_punctuation'] = fake_df["title_no_punctuation"].fillna("fillna")

fake_df['text_lower'] = fake_df["text"].str.lower()
fake_df['text_no_punctuation'] = fake_df['text_lower'].str.replace('[^\w\s]','')
fake_df['text_no_punctuation'] = fake_df["text_no_punctuation"].fillna("fillna")

fake_df['author_lower'] = fake_df["author"].str.lower()
fake_df['author_no_spaces'] = fake_df['author_lower'].str.replace(' ','_')

In [43]:
# Get mean length of text etc
fake_df["text"].str.len().describe()

count     20761.000000
mean       4552.715380
std        5130.563491
min           1.000000
25%        1628.000000
50%        3361.000000
75%        6275.000000
max      142961.000000
Name: text, dtype: float64

In [50]:
fake_df.columns

Index(['id', 'title', 'author', 'text', 'label', 'title_lower',
       'title_no_punctuation', 'text_lower', 'text_no_punctuation',
       'author_lower', 'author_no_spaces'],
      dtype='object')

In [51]:
tok = preprocessing.text.Tokenizer(num_words=max_features)
tok.fit_on_texts(list(fake_df['text_no_punctuation'])+list(fake_df['title_no_punctuation'])+list(fake_df['author_no_spaces'].astype(str)))

In [52]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1

216068


In [53]:
# Set up the padded sequences for each dataset (author, title and text)
text_df = tok.texts_to_sequences(list(fake_df['text_no_punctuation'])) 
text_df = preprocessing.sequence.pad_sequences(text_df, maxlen=maxlen) 

title_df = tok.texts_to_sequences(list(fake_df['title_no_punctuation']))
title_df = preprocessing.sequence.pad_sequences(title_df, maxlen=maxlen)

author_df = tok.texts_to_sequences(list(fake_df['author_no_spaces'].astype(str))) 
author_df = preprocessing.sequence.pad_sequences(author_df, maxlen=maxlen)

In [60]:
# Set up training sets and uncomment to try each
train_df = author_df 

In [61]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)

In [62]:
# Set up model
embedding_dim = 100
max_features=6000 
maxlen=1000 
vocab_size = vocab_size

model = Sequential()
model.add(Embedding(input_dim = vocab_size, output_dim = embedding_dim, input_length = maxlen))
model.add(Flatten())
model.add(Dense(1, activation = "sigmoid"))

model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics = ['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 100)         21606900  
_________________________________________________________________
flatten_3 (Flatten)          (None, 100000)            0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 100001    
Total params: 21,706,901
Trainable params: 21,706,901
Non-trainable params: 0
_________________________________________________________________


In [68]:
# Fit the model with just the author name
model.fit(X_train, y_train, epochs=1, verbose = 2,validation_data=(X_test, y_test))

Train on 18720 samples, validate on 2080 samples
Epoch 1/1
 - 118s - loss: 0.3141 - accuracy: 0.8486 - val_loss: 0.3196 - val_accuracy: 0.8418


<keras.callbacks.callbacks.History at 0x28a4c47a2e8>

In [73]:
# Fit the model with the author plus title
train_df = title_df+ author_df  

In [74]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)

In [75]:
model.fit(X_train, y_train, epochs=1, verbose = 2,validation_data=(X_test, y_test))

Train on 18720 samples, validate on 2080 samples
Epoch 1/1
 - 116s - loss: 0.2228 - accuracy: 0.9091 - val_loss: 0.2175 - val_accuracy: 0.9159


<keras.callbacks.callbacks.History at 0x28a05335080>

In [77]:
# Fit the model with the author plus title and text
train_df = title_df+author_df+text_df

In [78]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)

In [79]:
model.fit(X_train, y_train, epochs=1, verbose = 2,validation_data=(X_test, y_test))

Train on 18720 samples, validate on 2080 samples
Epoch 1/1
 - 116s - loss: 0.2852 - accuracy: 0.8940 - val_loss: 0.1814 - val_accuracy: 0.9298


<keras.callbacks.callbacks.History at 0x28a053352e8>

In [81]:
# Fit the model with title and text only
train_df = title_df+text_df

In [82]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)

In [83]:
model.fit(X_train, y_train, epochs=1, verbose = 2,validation_data=(X_test, y_test))

Train on 18720 samples, validate on 2080 samples
Epoch 1/1
 - 116s - loss: 0.0398 - accuracy: 0.9871 - val_loss: 0.1713 - val_accuracy: 0.9361


<keras.callbacks.callbacks.History at 0x28a2b60de10>

In [84]:
# Author and text
train_df = author_df+text_df

In [85]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.1, random_state=42)
model.fit(X_train, y_train, epochs=1, verbose = 2,validation_data=(X_test, y_test))

Train on 18720 samples, validate on 2080 samples
Epoch 1/1
 - 116s - loss: 0.0239 - accuracy: 0.9940 - val_loss: 0.1406 - val_accuracy: 0.9490


<keras.callbacks.callbacks.History at 0x28a05335dd8>

The text and title contain information about the author like the text style, topic and length of text so the addition of the author does not add much value to just the title and text.