In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

In [3]:
train = pd.read_csv("./datasets/train.csv")
test  = pd.read_csv("./datasets/test.csv")

In [4]:
train.head(5)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
print(f" number of data points in train set {train.__len__()}")
print(f" number of data points in test set {test.__len__()}")

 number of data points in train set 20800
 number of data points in test set 5200


In [6]:
train.info(),test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5200 entries, 0 to 5199
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5200 non-null   int64 
 1   title   5078 non-null   object
 2   author  4697 non-null   object
 3   text    5193 non-null   object
dtypes: int64(1), object(3)
memory usage: 162.6+ KB


(None, None)

In [7]:
print(f" total number of fake news {train[train['label']==0].__len__()} in train data set")
print(f" total number of real news {train[train['label']==1].__len__()} in train data set")

 total number of fake news 10387 in train data set
 total number of real news 10413 in train data set


In [8]:
# drop all null values
train= train.dropna()

In [9]:
print(f"number of data points in train set {train.__len__()}")

number of data points in train set 18285


In [10]:
## cleaning text 
import nltk ,re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import string 
from nltk.stem import WordNetLemmatizer

In [11]:
#defining the object for Lemmatization
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\Abhishek
[nltk_data]     pandir\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Abhishek
[nltk_data]     pandir\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
wordnet_lemmatizer = WordNetLemmatizer()

In [13]:
stopwords=stopwords.words('english')

In [14]:
stemmer=PorterStemmer()
# clean unwanted text like stopwords, @(Mention), https(url), #(Hashtag), punctuations
def removeUnwantedText(text):
    #remove urls
    if text == np.NaN or type(text) != str:
      text = " "
    text = re.sub(r'http\S+', " ", text)
    text = re.sub(r'@\w+',' ',text)
    text = re.sub(r'#\w+', ' ', text)
    text = re.sub('r<.*?>',' ', text)
    # html tags
    text = text.lower()
    text = text.split()
    text = " ".join([word for word in text if not word in stopwords])
    for punctuation in string.punctuation:
        text = text.replace(punctuation, "")
    return text

In [16]:

# curruntly on gpu
import multiprocessing as mp
p = mp.Pool(mp.cpu_count()) # Data parallelism Object
print(mp.cpu_count())

8


In [17]:
#train['title']= p.map(removeUnwantedText,train['title'])

In [18]:
train['title']=train['title'].apply(removeUnwantedText)
max(train['title'].apply( lambda x : len(x.split(" "))))

51

In [19]:
# calculating number of unique words

In [20]:
def lemmatize_text(text):
    text= text.lower()
    text= text.split(" ")
    text = " ".join([wordnet_lemmatizer.lemmatize(word) for word in text])
    return text

In [21]:
lemmatize_text(train['title'][1])

'flynn hillary clinton big woman campus  breitbart'

In [22]:
train['title']=train['title'].apply(lemmatize_text)

In [23]:
max(train['title'].apply( lambda x : len(x.split(" "))))

51

In [24]:
x = set()
for i in train['title']:
    for j in i.split(" "):
        x.add(wordnet_lemmatizer.lemmatize(j))
print(f" total number of unique words in  the all titles are {len(x)}")

 total number of unique words in  the all titles are 24578


In [25]:
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing import text
from tensorflow.keras import Sequential
from sklearn.model_selection import train_test_split

In [26]:
tokenizer = text.Tokenizer(num_words=25000)
tokenizer.fit_on_texts(train["title"])
def prep_text(texts, tokenizer, max_sequence_length):
    # Turns text into into padded sequences.
    text_sequences = tokenizer.texts_to_sequences(texts)
    return sequence.pad_sequences(text_sequences, maxlen=max_sequence_length)

In [27]:
# max sequence length 64
MAX_SEQ_LENGTH=64
## coverting to matrix 
x= prep_text(train['title'],tokenizer,MAX_SEQ_LENGTH)
x= np.array(x)
y =np.array(train['label'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=4)

In [28]:
x[10]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,    48,
        2209,  2420,   189,  1201,   199,   921, 10790, 10791,  3580,
         250])

In [29]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(25000+2, 64, mask_zero=True,input_length=64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu',input_shape=(64,)),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1,activation='sigmoid')
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 64, 64)            1600128   
_________________________________________________________________
bidirectional (Bidirectional (None, 64, 128)           66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6

In [30]:
from IPython.display import clear_output
class DisplayCallback(tf.keras.callbacks.Callback):
  def on_epoch_end(self, epoch, logs=None):
    clear_output(wait=True)
    
model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [32]:
history = model.fit(X_train,y_train,epochs=5,validation_data=(X_test,y_test),batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
model.save("./models/linear")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


In [None]:
train['text']=train['text'].apply(removeUnwantedText)
print(max(train['text'].apply( lambda x : len(x.split(" ")))))

In [None]:
train['text'].apply( lambda x : len(x.split(" "))).quantile(0.95)

In [None]:
cuttoff=train['text'].apply( lambda x : len(x.split(" "))).quantile(0.95)
text_data=train[train['text'].apply( lambda x : len(x.split(" ")))<cuttoff]

In [None]:
x = set()
for i in text_data['text']:
    for j in i.split(" "):
        x.add(wordnet_lemmatizer.lemmatize(j))
print(f" total number of unique words in text the all corpus are {len(x)}")

In [None]:
max(text_data['text'].apply( lambda x : len(x.split(" "))))

In [None]:
tokenizer = text.Tokenizer(num_words=25000)
tokenizer.fit_on_texts(text_data["text"])

In [None]:
# max sequence length 64
MAX_SEQ_LENGTH=1152

## coverting to matrix 
x= prep_text(train['title'],tokenizer,MAX_SEQ_LENGTH)
x= np.array(x)
y =np.array(train['label'])
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=4)

In [None]:
X_train.shape

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(25000+2, 64, mask_zero=True,input_length=MAX_SEQ_LENGTH),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(1152,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
#     tf.keras.layers.LSTM(256,return_sequences=True),
#     tf.keras.layers.LSTM(256,return_sequences=False),
    tf.keras.layers.Dense(64,activation='relu'),
    tf.keras.layers.Dense(16,activation='relu'),
    tf.keras.layers.Dense(1,activation='sigmoid')
])
model.summary()

In [None]:
num_epochs = 5
lr=0.25
decay=0.25

In [None]:
for epoch in range(1,num_epochs):
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
              optimizer=tf.keras.optimizers.Adam(lr*decay),
              metrics=['accuracy']) 
    model.fit(X_train,y_train,epochs=1,validation_data=(X_test,y_test),batch_size=128)
    model.save(f"model_{epoch}.h5")
    lr=lr*decay

In [None]:
 model.evaluate(X_train,y_train)