# Importing Libraries

In [56]:
import lightgbm as lgb
from xgboost import XGBClassifier
import sklearn
import keras
import tensorflow as tf
from scipy.sparse import hstack
import gensim
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, GlobalMaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import f1_score,classification_report,accuracy_score

# Data Loding and Manipulation

In [4]:
fake = pd.read_csv("fake_data.csv")
real = pd.read_csv("real_data.csv")
fake["label"] = 0
real["label"] = 1

In [5]:
posts = pd.concat([fake,real])

In [6]:
posts = posts.drop(["Unnamed: 0"],axis=1)
posts.head()

Unnamed: 0,news_url,title,label
0,dailymail.co.uk,did miley cyrus and liam hemsworth secretly ge...,0
1,hollywoodlife.com,paris jackson cara delevingne enjoy night out ...,0
2,variety.com,celebrities join tax march in protest of donal...,0
3,dailymail.co.uk,cindy crawford s daughter kaia gerber wears a ...,0
4,variety.com,full list of oscar nominations variety,0


In [7]:
posts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22140 entries, 0 to 16816
Data columns (total 3 columns):
news_url    21871 non-null object
title       22140 non-null object
label       22140 non-null int64
dtypes: int64(1), object(2)
memory usage: 691.9+ KB


In [8]:
posts = posts.fillna("unknown")

In [9]:
posts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22140 entries, 0 to 16816
Data columns (total 3 columns):
news_url    22140 non-null object
title       22140 non-null object
label       22140 non-null int64
dtypes: int64(1), object(2)
memory usage: 691.9+ KB


In [11]:
posts['merged']= posts.news_url + " " + posts.title

In [70]:
posts = posts.sample(frac=1)

### Generating completing posts datset

In [68]:
posts.to_csv("fake_real_dataset.csv")

# Vectorizing the words

In [12]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000)
word_vectorizer.fit(posts.merged)
train_word_features = word_vectorizer.transform(posts.merged)

In [13]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=10000)
char_vectorizer.fit(posts.merged)
train_char_features = char_vectorizer.transform(posts.merged)

In [19]:
#making sparce features for better prediction
y = posts.label
train_features = hstack([train_char_features, train_word_features])

### Splitting the data into train and test set

In [21]:
X_train, X_test, y_train, y_test = train_test_split(train_features,y,test_size=0.20,random_state=42)

# Applying Machine Learning

## Applying Random Forest Algorithm

In [48]:
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train,y_train)
preds=classifier.predict(X_test)


0.8622402890695574


In [51]:
print(accuracy_score(y_test,preds))

0.8622402890695574


## Applying Logistic Regression Alogrithm

In [53]:
classifier = LogisticRegression()
classifier.fit(X_train,y_train)
preds=classifier.predict(X_test)



In [54]:
print(accuracy_score(y_test,preds))

0.8832429990966576


## Applying XGBoost Algorithm

In [45]:
xgb=XGBClassifier()
xgb.fit(X_train,y_train)
preds2=xgb.predict(X_test)


0.8468834688346883


In [50]:
print(accuracy_score(y_test,preds2))

0.8468834688346883


## Applying LightGBM Algorithm

In [41]:
params = {
          "objective" : "binary",
          "num_leaves" : 60,
          "max_depth": 10,
          "learning_rate" : 0.01,
          "bagging_fraction" : 0.9,  # subsample
          "feature_fraction" : 0.9,  # colsample_bytree
          "bagging_freq" : 5,        # subsample_freq
          "bagging_seed" : 2018,
          "verbosity" : -1 }
train_data=lgb.Dataset(X_train,label=y_train)
lgbm=lgb.train(params, train_data,50)
preds=lgbm.predict(X_test)


In [49]:
y_pred = []
for ii in range(len(preds)):
    if(preds[ii]>0.5):
        y_pred.append(1)
    else:
        y_pred.append(0)
print(accuracy_score(y_test,y_pred))

0.8622402890695574


## Applying Ensamble of Machine Learning Algorithms

In [57]:
def predict_one(x, y, xt):
    clf1 = LogisticRegression()
    clf2 = RandomForestClassifier(n_estimators=100)
    c = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2)],voting='soft')
    c.fit(x, y)
    y_pred = c.predict(xt)
    return y_pred
uu = predict_one(X_train,y_train,X_test)
print(accuracy_score(y_test,uu))

0.8775971093044264


# Applying Deep Learning

In [58]:
num_labels = 8
vocab_size = 5000
batch_size = 64

In [59]:
train_X = posts.news_url+" "+posts.title
train_X = train_X.values
y = posts.label

In [60]:
seq = train_X

In [61]:
tokn = Tokenizer(num_words=vocab_size)
tokn.fit_on_texts(seq)

In [62]:
max_len = 20
cnn_texts_seq = tokn.texts_to_sequences(train_X)
print(cnn_texts_seq[0])
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq,maxlen=max_len)
print(cnn_texts_mat[0])
print(cnn_texts_mat.shape)

[24, 15, 16, 145, 286, 245, 4, 339, 500, 889, 104, 188]
[  0   0   0   0   0   0   0   0  24  15  16 145 286 245   4 339 500 889
 104 188]
(22140, 20)


### Conv1d Neural Network

In [67]:
def cnn_model():
    model = Sequential()
    model.add(Embedding(5000,50,input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Conv1D(256,3,padding='valid',activation='relu',strides=1))
    #model.add(Conv1D(128,3,padding='valid',activation='relu',strides=1))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128))
    model.add(Dropout(0.5))
    model.add(Activation('relu'))
    #model.add(Dense(64))
    #model.add(Dropout(0.5))
    #model.add(Activation('relu'))
    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.summary()
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

def check_model(model,x,y):
    #es = EarlyStopping(monitor='val_loss', mode='min', verbose=1,restore_best_weights=True,patience=10)
    model.fit(x,y,batch_size=32,epochs=20,verbose=1,validation_split=0.2)


m = cnn_model()
check_model(m,cnn_texts_mat,y)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 50)            250000    
_________________________________________________________________
dropout_9 (Dropout)          (None, 20, 50)            0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 18, 256)           38656     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 256)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
activation_9 (Activation)    (None, 128)               0         
__________

### MLP Netork

### LSTM Network

### RNN Network

### GRU Network

### Other Sequential Counter Parts