In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os


In [71]:
train = pd.read_csv("./datasets/train.csv")
test  = pd.read_csv("./datasets/test.csv")

In [72]:
train.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [73]:
print(f" number of data points in train set {train.__len__()}")
print(f" number of data points in test set {test.__len__()}")

 number of data points in train set 20800
 number of data points in test set 5200


In [74]:
train.info(),test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


(None, None)

In [75]:
print(f" total number of fake news {train[train['label']==0].__len__()} in train data set")
print(f" total number of real news {train[train['label']==1].__len__()} in train data set")

 total number of fake news 10387 in train data set
 total number of real news 10413 in train data set


In [76]:
# drop all null values
train= train.dropna()

In [77]:
print(f"number of data points in train set {train.__len__()}")


number of data points in train set 18285


In [78]:
## cleaning text 
import nltk ,re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string 
from nltk.stem import WordNetLemmatizer

In [79]:
#defining the object for Lemmatization
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\Abhishek
[nltk_data]     pandir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Abhishek
[nltk_data]     pandir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [80]:
wordnet_lemmatizer = WordNetLemmatizer()

In [81]:
stopwords=stopwords.words('english')

In [82]:
stemmer=PorterStemmer()
# clean unwanted text like stopwords, @(Mention), https(url), #(Hashtag), punctuations
def removeUnwantedText(text):
    #remove urls
    if text == np.NaN or type(text) != str:
      text = " "
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'@\w+',' ',text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub('r<.*?>',' ', text)
    # html tags
    text = text.lower()
    text = text.split()
    text = " ".join([word for word in text if not word in stopwords])
    for punctuation in string.punctuation:
        text = text.replace(punctuation, "")
    return text

In [83]:
removeUnwantedText("curruntly on gpu")

'curruntly gpu'

In [84]:

# curruntly on gpu
# import multiprocessing as mp
# p = mp.Pool(mp.cpu_count()) # Data parallelism Object
# print(mp.cpu_count())

In [85]:
#train['title']= p.map(removeUnwantedText,train['title'])

In [86]:
train['title']=train['title'].apply(removeUnwantedText)

In [87]:
max(train['title'].apply( lambda x : len(x.split(" "))))

51

In [88]:
train['title'][1]

'flynn hillary clinton big woman campus  breitbart'

In [89]:
# calculating number of unique words 

In [90]:
x = set()
for i in train['title']:
    for j in i.split(" "):
        x.add(j)

In [91]:
print(f" total number of unique words in  the all titles are {len(x)}")

 total number of unique words in  the all titles are 26788


In [104]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from tensorflow.keras import Sequential

In [93]:
tokenizer = text.Tokenizer(num_words=24000)

In [94]:
tokenizer.fit_on_texts(train["title"])


In [95]:
def prep_text(texts, tokenizer, max_sequence_length):
    # Turns text into into padded sequences.
    text_sequences = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)

In [96]:
# max sequence length 64
MAX_SEQ_LENGTH=64

In [97]:
x= prep_text(train['title'],tokenizer,MAX_SEQ_LENGTH)

In [116]:
x= np.array(x)

In [117]:
y =np.array(train['label'])

In [118]:
from sklearn.model_selection import train_test_split


In [119]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=4)

In [120]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(24000+1, 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])

In [121]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 64)          1536064   
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 128)         66048     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 1,647,553
Trainable params: 1,647,553
Non-trainable params: 0
____________________________________________

In [122]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])


In [123]:
history = model.fit(X_train,y_train,epochs=2,validation_data=(X_test,y_test),batch_size=64)

Epoch 1/2
Epoch 2/2
