In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled training set from train.csv (path relative to the working directory)
df=pd.read_csv('train.csv')

In [3]:
# Preview the first rows of the training frame
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
# Separate the target column from the remaining feature columns
y = df['label']
X = df.drop(columns='label')

In [5]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

In [6]:
# (rows, columns) of the training frame
df.shape

(20800, 5)

In [7]:
# Keep an independent copy of the training frame as loaded; df itself is modified in later cells
df_train=df.copy()

In [8]:
# Count missing values per column of the training frame
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
# Titles and authors still contain NaN at this point, and NaN + str stays NaN,
# so blank the missing values first to get a usable string for every row.
title_author = df['title'].fillna('') + ' ' + df['author'].fillna('')

In [10]:
# Spot-check the combined title + author string for the first row
title_author[0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It Darrell Lucus'

In [11]:
# Load the test set from test.csv (path relative to the working directory)
test=pd.read_csv('test.csv')

In [12]:
# Keep an independent copy of the test frame as loaded; test itself is modified in later cells
df_test=test.copy()

In [13]:
# Count missing values per column of the test frame
test.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [14]:
# Blank out missing values in both frames so that later string
# concatenation never produces NaN
df, test = df.fillna(''), test.fillna('')


In [15]:
# Only the title and the author name are used for prediction:
# join them into a single new 'total' text column on both frames.
test['total'] = test['title'] + ' ' + test['author']
df['total'] = df['title'] + ' ' + df['author']


In [16]:
# Rebuild features / target now that the 'total' column exists and report their sizes
y = df['label']
X = df.drop(columns='label')
print(X.shape, y.shape, sep='\n')


(20800, 5)
(20800,)


In [18]:
#Downloading the NLTK corpora used by the cleaning step
import nltk
import re
from nltk.corpus import stopwords
# WordNetLemmatizer reads the 'wordnet' corpus; without it the first
# lemmatize() call raises LookupError on a machine that never fetched it.
# NOTE(review): some NLTK releases additionally ask for 'omw-1.4' - confirm
# against the installed version.
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91999\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Text-normalisation tools: the WordNet lemmatizer and English stop-word list
# are used by clean_text; the Porter stemmer is instantiated but not used in
# the cells shown here.
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
wordnet = WordNetLemmatizer()
ps = PorterStemmer()

In [20]:
#Normalising and lemmatizing the text (no stemming is applied)
def clean_text(text, lemmatizer=None, stopword_set=None):
    """Normalise a raw string into space-separated, lemmatized tokens.

    Steps: lowercase, drop non-ASCII characters, turn every non-word
    character and digit into a space, split on whitespace, discard stop
    words and single-letter tokens, lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw text. Anything that is not a ``str`` (e.g. ``None`` or a NaN
        left by a missing CSV field) is treated as empty.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``wordnet`` lemmatizer.
    stopword_set : collection of str, optional
        Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces ('' when nothing survives).
    """
    if not isinstance(text, str):
        return ''
    if lemmatizer is None:
        lemmatizer = wordnet
    # A set gives constant-time membership tests; the notebook-level value is a list.
    stopword_set = frozenset(stop_words if stopword_set is None else stopword_set)

    text = text.lower()
    text = text.encode("ascii", "ignore").decode()  # drop non-ASCII characters
    text = re.sub(r'\W', ' ', text)  # punctuation and newlines -> space
    text = re.sub(r'[0-9]', ' ', text)  # digits -> space

    # Single letters are filtered per token instead of with regexes: the old
    # r'\^[a-zA-Z]\s+' looked for a literal '^' (already removed by \W) and
    # r'\s+[a-zA-Z]\s+' skipped every second letter in runs such as "x y z"
    # because each match consumed the whitespace the next one needed.
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stopword_set and not (len(token) == 1 and token.isalpha())
    )

In [21]:
# Clean the combined title + author text of the training frame (overwrites 'total')
df['total']=df['total'].apply(clean_text)

In [22]:
# Apply the same cleaning to the test frame (overwrites 'total')
test['total']=test['total'].apply(clean_text)

In [23]:
# Inspect one cleaned sample ...
df['total'][1]

'flynn hillary clinton big woman campus breitbart daniel flynn'

In [24]:
# ... and the raw title of the same row for comparison
df['title'][1]

'FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart'

In [25]:
# Model input is the cleaned 'total' text; the target is the 'label' column
X, y = df['total'], df['label']

In [30]:
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot


In [31]:
# Vocabulary size for the hashing step. It is defined here, before its first
# use, so this cell also runs on a fresh kernel (Restart & Run All).
voc_size = 5000

# Converting each cleaned text to a list of integer word indices
# NOTE(review): one_hot relies on Python's built-in hash, which is salted per
# process unless PYTHONHASHSEED is fixed, so the indices may differ between
# kernel sessions - confirm before reusing a saved model.
onehot_rep = [one_hot(words, voc_size) for words in X]


In [32]:
# Pad every index sequence at the front so that all of them share length 25
embedded_docs = pad_sequences(onehot_rep, maxlen=25, padding='pre')


In [33]:
#Choosing vocabulary size to be 5000
# (passed to one_hot as its size argument and to the Embedding layer as its first argument)
voc_size=5000

In [34]:
# Embedding -> LSTM -> Dense classifier with dropout between the stages.
# Input: sequences of 25 word indices; output: one sigmoid unit.
model = Sequential([
    Embedding(voc_size, 40, input_length=25),
    Dropout(0.3),
    LSTM(100),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only appends a stray "None" line.
model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 25, 40)            200000    
                                                                 
 dropout (Dropout)           (None, 25, 40)            0         
                                                                 
 lstm (LSTM)                 (None, 100)               56400     
                                                                 
 dropout_1 (Dropout)         (None, 100)               0         
                                                                 
 dense (Dense)               (None, 64)                6464      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 6

In [35]:
# Materialise the padded sequences and the labels as NumPy arrays
X_final, y_final = np.array(embedded_docs), np.array(y)


In [37]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.33,
    random_state=42,
)

In [40]:
### Finally Training
# 10 epochs, mini-batches of 64 samples.
# NOTE(review): the held-out split is also passed as validation_data, so the
# metrics computed on X_test afterwards are not from data unseen during
# monitoring - consider carving out a separate validation split.
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1fae0c59488>

In [51]:
# Sigmoid outputs of the model for the held-out split
y_pred=model.predict(X_test)

In [53]:
# Round the sigmoid outputs to 0/1 class labels
y_pred=np.round(y_pred)

In [56]:
# astype returns a new array, so the result has to be assigned; otherwise
# y_pred keeps its float dtype for the metric cells below.
y_pred = y_pred.astype('int32')
y_pred

array([[1],
       [1],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [60]:
from sklearn.metrics import confusion_matrix,classification_report

In [61]:
# Confusion matrix of true vs. predicted labels on the held-out split
confusion_matrix(y_test,y_pred)

array([[3448,    1],
       [   5, 3410]], dtype=int64)

In [62]:
# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3449
           1       1.00      1.00      1.00      3415

    accuracy                           1.00      6864
   macro avg       1.00      1.00      1.00      6864
weighted avg       1.00      1.00      1.00      6864

