In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive
from google.colab import files
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error

Basic Data Exploration

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Data Science/Practice/FAKE_NEWS_LSTM/train.csv")
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.shape

(20800, 5)

In [5]:
df = df.dropna()

One Hot Encoding

In [6]:
df1=df.copy()
df1['nlp'] = df['title'] + df['text'] + df['author']
df1 = df1[['nlp','label']]
df1.columns =['nlp','label']
df1['text_length'] = df1['nlp'].apply(lambda x: len(x.split()))
df1.reset_index(inplace=True)
df1.head()

Unnamed: 0,index,nlp,label,text_length
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,834
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0,720
2,2,Why the Truth Might Get You FiredWhy the Truth...,1,1272
3,3,15 Civilians Killed In Single US Airstrike Hav...,1,567
4,4,Iranian woman jailed for fictional unpublished...,1,168


In [7]:
#mean sentence length
np.mean(df1['text_length'])

816.2422203992344

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
from nltk.tokenize import word_tokenize

In [10]:
import re
def tokenize(text):
    temp = re.sub('[^a-zA-Z]', ' ', text)
    temp = temp.lower()
    return [token for token in word_tokenize(temp)]

In [11]:
#count number of occurences of each word
counts = Counter()
for index, row in df1.iterrows():
    counts.update(tokenize(row['nlp']))

In [12]:
#deleting infrequent words
print("num_words before:",len(counts.keys()))
for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))

num_words before: 150236
num_words after: 88315


In [13]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [43]:
import pickle
# open a file, where you ant to store the data
file = open('/content/drive/MyDrive/Data Science/Practice/FAKE_NEWS_LSTM/words.pkl', 'wb')

# dump information to that file
pickle.dump(words, file)

In [41]:
import pickle
# open a file, where you ant to store the data
file = open('/content/drive/MyDrive/Data Science/Practice/FAKE_NEWS_LSTM/vocab2index.pkl', 'wb')

# dump information to that file
pickle.dump(vocab2index, file)

In [14]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [15]:
df1['encoded'] = df1['nlp'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df1.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,index,nlp,label,text_length,encoded
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,1,834,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",0,720,"[[358, 83, 84, 359, 360, 21, 361, 1, 362, 24, ..."
2,2,Why the Truth Might Get You FiredWhy the Truth...,1,1272,"[[546, 24, 543, 630, 362, 345, 1, 24, 543, 630..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,1,567,"[[1046, 1047, 26, 1048, 425, 1049, 188, 189, 1..."
4,4,Iranian woman jailed for fictional unpublished...,1,168,"[[1188, 360, 1189, 175, 1190, 1191, 756, 92, 3..."


In [16]:
X = list(df1['encoded'])
y = list(df1['label'])
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx]#, self.X[idx][1]

In [18]:
train_ds = ReviewsDataset(X_train, y_train)
valid_ds = ReviewsDataset(X_val, y_val)

In [19]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [20]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [21]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)

In [22]:
train_model(model_fixed, epochs=30, lr=0.01)

train loss 0.994, val loss 0.717, val accuracy 0.569, and val rmse 0.656
train loss 0.671, val loss 0.672, val accuracy 0.571, and val rmse 0.655
train loss 0.443, val loss 0.431, val accuracy 0.830, and val rmse 0.412
train loss 0.219, val loss 0.321, val accuracy 0.866, and val rmse 0.366
train loss 0.111, val loss 0.307, val accuracy 0.908, and val rmse 0.304
train loss 0.058, val loss 0.287, val accuracy 0.919, and val rmse 0.285


PREDICTION

In [23]:
# Test the model
y_pred_list = []
y_test_list = []
with torch.no_grad():
  model_fixed.eval()  # eval mode 
  correct = 0
  total = 0
  for x, y in val_dl:
    x = x.long()
    y = y.long()
    y_hat = model_fixed(x)
    pred = torch.max(y_hat, 1)[1]
    y_pred_list.append(pred)
    y_test_list.append(y)
    total += y.size(0)
    correct += (pred == y).sum().item()

y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
#y_test_list = [a.squeeze().tolist() for a in y_test_list]

print('Accuracy : {} %'.format(100 * correct / total))

# flatten prediction and true lists
flat_pred = []
flat_true = []
for i in range(len(y_pred_list)):
    for j in range(len(y_pred_list[i])):
        flat_pred.append(y_pred_list[i][j])
        flat_true.append(y_test_list[i][j])

from sklearn.metrics import classification_report
target_names = ['REAL', 'FAKE']
print("\n")
print("\t\t CLASSIFICATION REPORT\n")
print(classification_report(flat_true, flat_pred,target_names=target_names))

Accuracy : 91.98796828001093 %


		 CLASSIFICATION REPORT

              precision    recall  f1-score   support

        REAL       0.94      0.92      0.93      2082
        FAKE       0.89      0.92      0.91      1575

    accuracy                           0.92      3657
   macro avg       0.92      0.92      0.92      3657
weighted avg       0.92      0.92      0.92      3657



In [40]:
import pickle
# open a file, where you ant to store the data
file = open('/content/drive/MyDrive/Data Science/Practice/FAKE_NEWS_LSTM/LSTM_model.pkl', 'wb')

# dump information to that file
pickle.dump(model_fixed, file)

In [39]:
word1 = pickle.load(open('/content/word.pkl','rb'))
type(word1)

list

In [38]:
dict1 = pickle.load(open('/content/vocab2index.pkl','rb'))
type(dict1)

dict