In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Loading essential libraries
import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices raised by the libraries used in later cells.
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'retina'
%config Completer.use_jedi = False # this to force autocompletion 

In [None]:
# Load the reviews dataset from the Kaggle input directory and preview the first rows.
df = pd.read_csv(
    '/kaggle/input/20-review-fake/20 mayfake reviews dataset.csv'
)
df.head()

In [None]:
# Drop the two columns this notebook does not use, in a single call,
# then show the remaining columns and their dtypes.
df = df.drop(columns=['category', 'rating'])
df.info()


In [None]:
# (rows, columns) of the frame.
df.shape

In [None]:
# Encode the string labels numerically: "OR" -> 0.0 and "CG" -> 1.0.
# Each assignment goes through a single df.loc[row_mask, column] call so that it
# writes into df itself. Chained indexing (df["label"].loc[mask] = value) can end
# up writing to a temporary copy and does nothing under pandas copy-on-write.
df.loc[df["label"] == "OR", "label"] = 0.0
df.loc[df["label"] == "CG", "label"] = 1.0
df.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set()
# Class balance: number of reviews per label.
# The column is passed by keyword because recent seaborn releases treat the first
# positional argument of countplot as `data`, not as the variable to count.
sns.countplot(x="label", data=df)
plt.title("Number of reviews per label")
plt.show()

In [None]:
# Number of whitespace-separated tokens in each review, stored as a new column.
df['word_length'] = df['text_'].map(lambda review: len(review.split()))

In [None]:
# Compare the word-count distributions of the two classes on one set of axes.
fig, ax = plt.subplots(figsize=(12, 8))

# Both classes share the same bin edges so their bar heights are directly
# comparable; with different bin counts the two histograms cannot be compared.
shared_bins = np.linspace(0, df.word_length.max(), 36)  # 36 edges -> 35 bins

df[df.label == 0].word_length.plot(bins=shared_bins, kind='hist', color='blue',
                                   label='Original Review', alpha=0.6, ax=ax)
df[df.label == 1].word_length.plot(bins=shared_bins, kind='hist', color='red',
                                   label='Fake Review', alpha=0.6, ax=ax)
ax.legend()
ax.set_xlabel("Message Length")
ax.set_title("Review length by class")
plt.show()

In [None]:
# Per-class averages of the numeric columns.
# numeric_only=True keeps the free-text column out of the aggregation: pandas 2.x
# no longer drops non-numeric columns silently and raises when asked to average strings.
df.groupby('label').mean(numeric_only=True)

In [None]:
# Summary statistics for the reviews labelled 0 (original reviews)
df[df.label == 0].describe()

In [None]:
# Summary statistics for the reviews labelled 1 (fake reviews)
df[df.label == 1].describe()

**From these summaries we can say that longer texts are more likely to be fake reviews.**

### Now let's do some text preprocessing

In [None]:
pip install text-hammer

In [None]:
import text_hammer as th

In [None]:
%%time

# tqdm.notebook is the public home of the notebook progress bar; the private
# tqdm._tqdm_notebook module is deprecated.
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()  # registers progress_apply on pandas Series / DataFrame

def text_preprocessing(df, col_name):
    """Return a copy of ``df`` whose ``col_name`` column holds cleaned text.

    The cleaning steps run in this order, one ``progress_apply`` pass (and so
    one progress bar) per step:

    1. cast to ``str`` and lower-case
    2. expand contractions (you're -> you are; i'm -> i am)
    3. remove e-mail addresses
    4. remove HTML tags
    5. remove stopwords
    6. remove special characters
    7. remove accented characters
    8. reduce words to their base form (ran -> run)

    Steps 2-8 are delegated to the ``text_hammer`` helpers named in the code;
    their exact behaviour is defined by that library.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the text column. It is left unmodified.
    col_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the cleaned text in ``col_name``; all other
        columns are carried over unchanged.

    Notes
    -----
    ``progress_apply`` must have been registered with tqdm's ``.pandas()``
    before this function is called.
    """
    # Work on a copy so the caller's frame keeps its raw text and re-running the
    # cleaning cell always starts from the same input.
    cleaned = df.copy()

    steps = [
        lambda text: str(text).lower(),
        th.cont_exp,               # you're -> you are; i'm -> i am
        th.remove_emails,
        th.remove_html_tags,
        th.remove_stopwords,
        # th.spelling_correction,  # disabled in this pipeline
        th.remove_special_chars,
        th.remove_accented_chars,
        th.make_base,              # ran -> run
    ]
    for step in steps:
        cleaned[col_name] = cleaned[col_name].progress_apply(step)
    return cleaned

In [None]:
# Run the cleaning pipeline on the review text column.
cleaned_df = text_preprocessing(df, "text_")

In [None]:
# Inspect the cleaned review text.
cleaned_df.text_

## Now let's do some EDA

In [None]:
import nltk



### Word frequencies computed with nltk
# Tokenize every cleaned review and pool all tokens into one flat list.
words_list = [
    token
    for sentence in cleaned_df.text_
    for token in nltk.word_tokenize(sentence)
]
# Count the occurrences of each token and show the 20 most frequent ones.
freq_dist = nltk.FreqDist(words_list)
freq_dist.most_common(20)

In [None]:
# Bar chart of the 30 most frequent tokens.
top_words = freq_dist.most_common(30)
temp = pd.DataFrame(top_words, columns=['word', 'count'])

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=temp, x='word', y='count', ax=ax)
ax.set_title("Top words")
plt.xticks(rotation='vertical');

In [None]:
### Word cloud of the token frequencies
from wordcloud import WordCloud
import wordcloud

# Configure the cloud first, then render it from the token frequency table.
# NOTE(review): the WordCloud documentation says `stopwords` is ignored by
# generate_from_frequencies, so the stopword set here likely has no effect - confirm.
cloud_builder = WordCloud(
    stopwords=set(wordcloud.STOPWORDS),
    colormap='viridis',
    width=300,
    height=200,
)
wcloud_fig = cloud_builder.generate_from_frequencies(freq_dist)

# Draw the rendered cloud with bilinear interpolation.
plt.figure(figsize=(10, 7), frameon=True)
plt.imshow(wcloud_fig, interpolation='bilinear')
plt.show()

### Module 2 
#### So far we have done the text cleaning and plotting

#### Now let's split our data for training and testing

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the cleaned reviews for testing. The split is stratified on the
# label and uses a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    cleaned_df.text_,
    cleaned_df.label,
    test_size=0.2,
    stratify=cleaned_df.label,
    random_state=42,
)

### Using the Tokenizer class to convert the sentences into integer sequences


In [None]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer.
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, lower=True)

# Build the word index from every review, train and test together.
# NOTE(review): fitting on the test split lets test-set vocabulary influence the
# word index; consider fitting on X_train only.
df_total = pd.concat([X_train, X_test])
tokenizer.fit_on_texts(df_total)


In [None]:
len(tokenizer.word_index) # number of distinct words the tokenizer saw in the whole corpus
# NOTE(review): word_index is not truncated to num_words (5000); per the Keras docs the cap
# is applied when texts are converted to sequences - confirm for the installed version.

In [None]:
# Largest value of the word_length column.
# NOTE(review): word_length is computed before text_preprocessing runs, so this is the
# word count of the raw text, not of the cleaned text. The padding length of 171 used
# in later cells is presumably taken from this value - confirm.
cleaned_df.word_length.max()

In [None]:
# Convert each review into a list of integer word indices using the fitted tokenizer.
X_train_ =tokenizer.texts_to_sequences(X_train)
X_test_ = tokenizer.texts_to_sequences(X_test)




In [None]:
# Bring every sequence to a fixed length of 171 tokens; padding='post' appends the
# padding after the tokens. Both splits are padded with identical settings.
X_train_pad, X_test_pad = (
    pad_sequences(sequences, maxlen=171, padding='post')
    for sequences in (X_train_, X_test_)
)


In [None]:
# Cast the padded inputs to float32 and turn the labels into int32 NumPy arrays.
X_train_pad, X_test_pad = (
    padded.astype(np.float32) for padded in (X_train_pad, X_test_pad)
)
y_train, y_test = (
    np.array(labels).astype(np.int32) for labels in (y_train, y_test)
)


In [None]:
print(X_train_pad.shape,X_test_pad.shape) # shapes of the padded 2D train and test matrices used as model input

### Now let's design our deep learning model and train it on our data

### Method1: by using solely embedding layers

In [None]:
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, LSTM, Embedding,Bidirectional
import tensorflow
from tensorflow.compat.v1.keras.layers import CuDNNLSTM,CuDNNGRU
from tensorflow.keras.layers import Dropout

In [None]:
# Method 1 model: an embedding layer learned from scratch, three stacked
# bidirectional LSTM layers with dropout in between, and a sigmoid output unit
# for the binary label.
EMBEDDING_DIM = 100  # size of each learned word vector
model = Sequential()
model.add(Embedding(input_dim=num_words,                # vocabulary size
                    output_dim=EMBEDDING_DIM,           # vector space dimension
                    input_length=X_train_pad.shape[1]   # padded sequence length
                    ))
model.add(Dropout(0.2))
# LSTM is used instead of the TF1-compat CuDNNLSTM: with its default arguments the
# standard layer selects the cuDNN kernel on a GPU and still runs on CPU-only machines.
model.add(Bidirectional(LSTM(100, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(200, return_sequences=True)))
model.add(Dropout(0.2))
# The last recurrent layer returns only its final output, which feeds the classifier.
model.add(Bidirectional(LSTM(100, return_sequences=False)))
model.add(Dense(1, activation='sigmoid'))
# metrics is given as a list, the form documented for Model.compile.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# EarlyStopping and ModelCheckpoint

# Imported from tensorflow.keras so the callbacks come from the same Keras
# implementation as the model layers, which are built from tensorflow.keras.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once validation loss has not improved for 10 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
# Write ./model.h5 each time validation accuracy reaches a new maximum.
mc = ModelCheckpoint('./model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# Train for at most 35 epochs in batches of 120; the callbacks may stop training
# early and checkpoint the model.
# NOTE(review): the test split doubles as validation data here, so early stopping
# and checkpoint selection are driven by the test set.
history_embedding = model.fit(
    X_train_pad,
    y_train,
    epochs=35,
    batch_size=120,
    validation_data=(X_test_pad, y_test),
    verbose=1,
    callbacks=[es, mc],
)


In [None]:
# Learning curve: training vs. validation accuracy per epoch.
for metric, colour, legend_label in (
    ('accuracy', 'b', 'train accuracy'),
    ('val_accuracy', 'r', 'validation accuracy'),
):
    plt.plot(history_embedding.history[metric], c=colour, label=legend_label)
plt.legend(loc='lower right')
plt.show()


#### The maximum accuracy we got with the learned word embedding is 98%, with some overfitting; now we will try pretrained word vectors

### Method 2: Using pretrained GloVe word vectors, loaded with **gensim**

In [None]:
import gensim.downloader as api
# Load the pretrained 'glove-wiki-gigaword-100' vectors through gensim's downloader
# (100-dimensional, going by the model name).
glove_gensim  = api.load('glove-wiki-gigaword-100')

In [None]:
glove_gensim['cat'].shape[0] # this is the dimension of the vectors

In [None]:
# Build the embedding matrix: row i holds the pretrained vector of the word whose
# tokenizer index is i. Rows stay all-zero for words that are missing from the
# pretrained vocabulary and for index 0 (Keras word_index values start at 1).
vector_size = 100
gensim_weight_matrix = np.zeros((num_words, vector_size))

for word, index in tokenizer.word_index.items():
    # word_index lists every word seen; only indices below num_words fit in the matrix.
    # Membership is tested on the vectors object itself, which works across gensim
    # releases (the .wv.vocab attribute chain is not available in gensim 4).
    if index < num_words and word in glove_gensim:
        gensim_weight_matrix[index] = glove_gensim[word]

In [None]:
# Shape of the embedding weight matrix: (num_words, vector_size).
gensim_weight_matrix.shape

In [None]:
# Method 2 model: same architecture as the first model, but the embedding layer is
# initialised with the pretrained weight matrix and frozen (trainable=False).
EMBEDDING_DIM = 100  # must match the width of gensim_weight_matrix
model_gensim = Sequential()
model_gensim.add(Embedding(input_dim=num_words,                 # vocabulary size
                           output_dim=EMBEDDING_DIM,            # vector space dimension
                           input_length=X_train_pad.shape[1],   # padded sequence length
                           weights=[gensim_weight_matrix], trainable=False))
model_gensim.add(Dropout(0.2))
# LSTM is used instead of the TF1-compat CuDNNLSTM: with its default arguments the
# standard layer selects the cuDNN kernel on a GPU and still runs on CPU-only machines.
model_gensim.add(Bidirectional(LSTM(100, return_sequences=True)))
model_gensim.add(Dropout(0.2))
model_gensim.add(Bidirectional(LSTM(200, return_sequences=True)))
model_gensim.add(Dropout(0.2))
# The last recurrent layer returns only its final output, which feeds the classifier.
model_gensim.add(Bidirectional(LSTM(100, return_sequences=False)))
model_gensim.add(Dense(1, activation='sigmoid'))
# metrics is given as a list, the form documented for Model.compile.
model_gensim.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the pretrained-embedding model.
model_gensim.summary()

In [None]:
# Imported from tensorflow.keras so the callbacks come from the same Keras
# implementation as the model layers, which are built from tensorflow.keras.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Stop training once validation loss has not improved for 5 epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
# Write ./model_gensim.h5 each time validation accuracy reaches a new maximum.
mc = ModelCheckpoint('./model_gensim.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)

In [None]:
# Train for at most 25 epochs in batches of 120; the callbacks may stop training
# early and checkpoint the model.
# NOTE(review): the test split doubles as validation data here, so early stopping
# and checkpoint selection are driven by the test set.
history_gensim = model_gensim.fit(
    X_train_pad,
    y_train,
    epochs=25,
    batch_size=120,
    validation_data=(X_test_pad, y_test),
    verbose=1,
    callbacks=[es, mc],
)


In [None]:
# Learning curve: training vs. validation accuracy per epoch.
for metric, colour, legend_label in (
    ('accuracy', 'b', 'train accuracy'),
    ('val_accuracy', 'r', 'validation accuracy'),
):
    plt.plot(history_gensim.history[metric], c=colour, label=legend_label)
plt.legend(loc='lower right')
plt.show()


### **The model trained with the pretrained GloVe vectors generalises better, so we will choose it as our final model**

In [None]:
# Loss and accuracy of the pretrained-embedding model on the test split.
model_gensim.evaluate(X_test_pad, y_test) 


In [None]:
# Loss and accuracy of the learned-embedding model on the test split.
model.evaluate(X_test_pad, y_test)

### Model Evaluation

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 class labels.
embedding_probabilities = model.predict(X_test_pad)
y_pred = np.where(embedding_probabilities > .5, 1, 0)

In [None]:
from sklearn import metrics
# classification_report expects (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(metrics.classification_report(y_test, y_pred))

In [None]:
# Threshold the predicted probabilities at 0.5 to get hard 0/1 class labels.
y_pred_gensim = np.where(model_gensim.predict(X_test_pad) > 0.5, 1, 0)
# classification_report expects (y_true, y_pred). With the arguments reversed the
# precision and recall columns of the report are swapped.
print(metrics.classification_report(y_test, y_pred_gensim))

### Now let's test our model with real data

In [None]:
def predict(sentence, maxlen=171):
    """Classify a single review with ``model_gensim`` and print the verdict.

    The sentence goes through the same steps as the training data: the
    ``text_preprocessing`` cleaning pipeline, the fitted ``tokenizer`` and
    post-padding to ``maxlen`` tokens. The model output is read as the
    probability that the review is fake; above 0.5 the review is reported as
    fake, otherwise as original, together with the confidence in that verdict.

    Parameters
    ----------
    sentence : str
        Review text to classify.
    maxlen : int, optional
        Sequence length the input is padded to. Defaults to 171, the length
        used to pad the training sequences.

    Returns
    -------
    None
        The verdict is printed, not returned.
    """
    # Clean the sentence the same way as the training text: the model was fitted on
    # cleaned reviews, so raw input would not match what it was trained on.
    single_review = pd.DataFrame({"text_": [str(sentence)]})
    cleaned_sentence = text_preprocessing(single_review, "text_")["text_"].tolist()

    sentence_seq = tokenizer.texts_to_sequences(cleaned_sentence)
    sentence_padded = pad_sequences(sentence_seq, maxlen=maxlen, padding='post')
    probability = float(model_gensim.predict(sentence_padded)[0][0])

    # Percentages are computed first and formatted last, so the printed value never
    # shows floating-point noise from subtracting an already rounded number.
    if probability > 0.5:
        print(f'Fake Review | {probability * 100:.2f}%')
    else:
        print(f'Original Review | {(1 - probability) * 100:.2f}%')


In [None]:
# Prompt for a review and classify it; input() already returns a str.
predict(input('Enter the Sentence:'))

In [None]:
# Prompt for another review and classify it; input() already returns a str.
predict(input('Enter the Sentence:'))