In [None]:
!pip install textstat
from zipfile import ZipFile
import pandas as pd
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import tensorflow as tf
import seaborn as sns

import os
import json

import cv2
from tqdm import tqdm

from sklearn.model_selection import train_test_split

import keras
from keras import layers
from keras.applications import DenseNet121
from keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.optimizers import Adam, Nadam
from keras import backend as K
from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.layers.merge import concatenate
from keras import models, optimizers, metrics, regularizers, initializers

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import joblib
from keras.preprocessing.text import Tokenizer
import gensim
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.utils import shuffle

# Data processing

In [None]:
# Work from the Kaggle input directory so the relative data paths used below
# (e.g. './test.csv') resolve against it.
os.chdir('/kaggle/input')

In [None]:
# Load the raw tweet CSV. The file is read without a header row, so the six
# column names are supplied directly to the reader.
train_raw = pd.read_csv(
    './training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
)

In [None]:
# Keep only the label and the tweet text, and make the labels human-readable.
# The result is assigned back (rather than calling `replace(..., inplace=True)` on
# an attribute access) so the frame is updated regardless of pandas copy semantics.
train = train_raw[['target', 'text']].copy()
train['target'] = train['target'].replace({0: 'negative', 2: 'neutral', 4: 'positive'})

# Draw a balanced subsample of 100,000 tweets per class.
# Rows are selected by label instead of by hard-coded position: the previous
# slices `iloc[:800000]` / `iloc[800001:]` left row 800000 in neither pool and
# relied on the file being sorted by class.
posi_train = train[train['target'] == 'positive'].sample(n=100000, replace=False)
nega_train = train[train['target'] == 'negative'].sample(n=100000, replace=False)

# Merge the two pools, shuffle, and rebuild a clean 0..N-1 index.
train = pd.concat([posi_train, nega_train], axis=0)
train = shuffle(train).reset_index(drop=True)
train

In [None]:
# Stop words are very common words (such as "the", "a", "an", "in"); they are
# filtered out of the tweets during cleaning below.
stop_words = set(stopwords.words('english'))
# "not" is deliberately kept in the text — presumably because negation carries
# sentiment; TODO confirm intent.
stop_words.remove('not')

In [None]:
# Normalise every tweet: replace @mentions, URLs and runs of non-alphanumeric
# characters with a space, lower-case, split on whitespace and drop stop words.
# The pattern is written as a raw string: in a plain string literal `\S` is an
# invalid escape sequence, which newer Python versions warn about.
noise_pattern = re.compile(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+')

# Iterate over the column values directly (positional) instead of looking each
# row up by label with `train["text"][i]`.
corpus = [
    ' '.join(
        word
        for word in noise_pattern.sub(' ', text).lower().split()
        if word not in stop_words
    )
    for text in train["text"]
]

train["text"] = corpus

# Word2Vec

In [None]:
# Split each cleaned tweet on whitespace; the resulting token lists are what the
# Word2Vec model is built and trained on.
documents = list(map(str.split, train["text"]))

In [None]:
import logging
from gensim.models import Word2Vec

# Untrained Word2Vec model producing 300-dimensional vectors.
# NOTE(review): the `size` keyword is the gensim 3.x spelling (renamed
# `vector_size` in gensim 4) — confirm the installed gensim version.
w2v_model = Word2Vec(
    size=300,
    window=7,
    min_count=10,
    workers=8,
)

In [None]:
# Build the Word2Vec vocabulary from the tokenised tweets.
w2v_model.build_vocab(documents)
words = w2v_model.wv.vocab.keys()
# Note: `vocab_size` is reassigned further down from the Keras tokenizer's
# word index; this value only describes the Word2Vec vocabulary.
vocab_size = len(words)
print("Vocab size", vocab_size)

In [None]:
# Configure logging at INFO level with a timestamped format before training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Train the Word2Vec model on the tokenised tweets for 30 epochs.
w2v_model.train(
    documents,
    total_examples=len(documents),
    epochs=30,)


In [None]:
w2v_model.wv.most_similar("gd")

In [None]:
# Fit a Keras tokenizer on the cleaned training text; its `word_index` is used
# below to turn text into integer sequences and to size the embedding matrix.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train["text"])

In [None]:
tokenizer.word_index

In [None]:
import pickle

# Persist the fitted tokenizer to the Kaggle working directory.
# NOTE(review): presumably so the same word-to-index mapping can be reloaded
# outside this notebook — confirm.
with open('/kaggle/working/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# +1 because the Keras Tokenizer never assigns index 0 to a word (it is a reserved
# index, and it is the value `pad_sequences` pads with by default), so valid
# indices run from 0 to len(word_index) inclusive.
vocab_size=len(tokenizer.word_index)+1
vocab_size

In [None]:
# Convert each cleaned tweet to a sequence of word indices and bring every
# sequence to length 300 (the `input_length` the embedding layer is built with).
x_train = pad_sequences(tokenizer.texts_to_sequences(train["text"]), maxlen=300)

print(x_train)

In [None]:
# Binary targets for the classifier: negative -> 0, positive -> 1.
# `map` builds a new Series instead of mutating `train["target"]` in place, so the
# cell is safe to re-run, and the explicit cast guarantees a numeric array (it also
# fails loudly if any label other than the two mapped ones is present).
y_train = train["target"].map({'negative': 0, 'positive': 1}).astype('int64').to_numpy()
y_train

In [None]:
test = pd.read_csv('./test.csv')

In [None]:
# Clean the test text with the same steps applied to the training text (strip
# @mentions/URLs/non-alphanumerics, lower-case, drop stop words) before
# tokenising. Feeding raw text here while the model was trained on cleaned text
# would present the network with inputs it never saw during training.
test_text_clean = [
    ' '.join(
        word
        for word in re.sub(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+', ' ', text).lower().split()
        if word not in stop_words
    )
    for text in test["text"]
]
x_test = pad_sequences(tokenizer.texts_to_sequences(test_text_clean), maxlen=300)
x_test

# Embedding

In [None]:
# Initial embedding weights: row `idx` holds the Word2Vec vector of the word whose
# tokenizer index is `idx`; words missing from Word2Vec keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
known_words = (
    (idx, token)
    for token, idx in tokenizer.word_index.items()
    if token in w2v_model.wv
)
for idx, token in known_words:
    embedding_matrix[idx] = w2v_model.wv[token]
print(embedding_matrix.shape)

In [None]:
# Embedding layer initialised from the Word2Vec-derived matrix and frozen
# (`trainable=False`); `input_length=300` matches the `maxlen` used when padding.
embedding_layer = Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False)

# Custom Functions

In [None]:
from keras import backend as K

def jaccard_distance_loss(y_true, y_pred, smooth=100):
    """Smoothed Jaccard distance between targets and predictions.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values, same shape as ``y_true``.
        smooth: constant added to numerator and denominator; the result is
            also scaled by it.

    Returns:
        ``(1 - jaccard) * smooth`` with the sums taken over the last axis.
    """
    overlap = K.sum(K.abs(y_true * y_pred), axis=-1)
    total = K.sum(K.abs(y_true) + K.abs(y_pred), axis=-1)
    jaccard = (overlap + smooth) / (total - overlap + smooth)
    distance = 1 - jaccard
    return distance * smooth

In [None]:
smooth = 1e-12

def jaccard_coef(y_true, y_pred, smooth=1e-12):
    """Mean smoothed Jaccard coefficient, used as a training metric.

    Args:
        y_true: tensor of target values.
        y_pred: tensor of predicted values, same shape as ``y_true``.
        smooth: small constant added to numerator and denominator.

    Returns:
        The mean of ``(intersection + smooth) / (union + smooth)`` after summing
        over axes ``[0, -1, -2]``.
    """
    # NOTE(review): the models in this notebook end in Dense(1), i.e. rank-2
    # outputs, for which axes 0 and -2 name the same dimension — confirm the
    # backend in use accepts the repeated axis.
    reduce_axes = [0, -1, -2]
    overlap = K.sum(y_true * y_pred, axis=reduce_axes)
    total = K.sum(y_true + y_pred, axis=reduce_axes)
    ratio = (overlap + smooth) / (total - overlap + smooth)
    return K.mean(ratio)

# Model LSTM

In [None]:
def build_model_LSTM():
    """Build and compile the LSTM sentiment classifier.

    The network stacks the shared (frozen) ``embedding_layer``, dropout, a
    256-unit LSTM, batch normalisation and a single sigmoid output unit. It is
    compiled with ``jaccard_distance_loss``, Adagrad (gradient values clipped at
    0.5) and the ``jaccard_coef`` metric.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    recurrent = LSTM(
        256,
        dropout=0.2,
        recurrent_dropout=0.2,
        kernel_regularizer=regularizers.l1(0.00001),
        kernel_initializer=initializers.RandomNormal(stddev=0.02),
    )

    model = Sequential([
        embedding_layer,
        Dropout(0.5),
        recurrent,
        BatchNormalization(),
        Dense(1, activation='sigmoid'),
    ])

    model.compile(
        loss=jaccard_distance_loss,
        optimizer=optimizers.Adagrad(clipvalue=0.5),
        metrics=[jaccard_coef],
    )
    return model

In [None]:
model = build_model_LSTM()
model.summary()

In [None]:
# Per-sample weights that balance the classes in `y_train`.
# NOTE(review): `sample_weights` is computed here but is not passed to any of the
# `fit` calls visible in this notebook.

from sklearn.utils import class_weight
sample_weights = class_weight.compute_sample_weight('balanced', y_train)

In [None]:
# Save the full model whenever the validation Jaccard coefficient improves.
# `mode='max'` is set explicitly: with `mode='auto'` Keras only infers "higher is
# better" for accuracy-like metric names, so the custom `val_jaccard_coef` would
# be treated as a quantity to minimise and the checkpoint would keep the worst
# epoch instead of the best one.
checkpoint = ModelCheckpoint(
    '/kaggle/working/train_model_LSTM.h5',
    monitor='val_jaccard_coef',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='max'
)

# Lower the learning rate when the validation loss stops improving for 5 epochs.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=5,
    verbose=1,
    min_lr=5e-5
)

# Train the LSTM, holding out the last 15% of the training data for validation.
model_history = model.fit(
    x_train,
    y_train,
    batch_size=1024,
    epochs=15,
    validation_split=0.15,
    verbose=1,
    callbacks=[checkpoint, reduce_lr])

In [None]:
# Pull the per-epoch training/validation curves out of the Keras History object.
acc, val_acc = (model_history.history[key] for key in ('jaccard_coef', 'val_jaccard_coef'))
loss, val_loss = (model_history.history[key] for key in ('loss', 'val_loss'))
epochs = range(len(acc))

In [None]:
# Training vs. validation metric per epoch. Note that `acc` / `val_acc` hold the
# `jaccard_coef` metric of the LSTM run, not classification accuracy.
plt.figure(facecolor='white')
plt.plot(epochs,acc,label='Training_acc',color='blue')
plt.plot(epochs,val_acc,label='Validation_acc',color='red')
plt.legend()
plt.title("Training and Validation Accuracy")

In [None]:
# Training vs. validation loss (jaccard_distance_loss) per epoch for the LSTM run.
plt.figure(facecolor='white')
plt.plot(epochs,loss,label='Training_loss',color='blue')
plt.plot(epochs,val_loss,label='Validation_loss',color='red')
plt.legend()
plt.title("Training and Validation loss")

In [None]:
# Reload the checkpoint written by ModelCheckpoint. The absolute path is needed
# because the working directory was changed to /kaggle/input earlier, while the
# checkpoint is saved under /kaggle/working; the custom loss and metric must be
# supplied so Keras can deserialise the compiled model.
model = load_model(
    '/kaggle/working/train_model_LSTM.h5',
    custom_objects={
        'jaccard_distance_loss': jaccard_distance_loss,
        'jaccard_coef': jaccard_coef,
    },
)

In [None]:
def preprocess(text):
    """Clean one raw text and convert it to a padded index sequence.

    Applies the same steps as the training-corpus cleaning: replace @mentions,
    URLs and non-alphanumeric characters with spaces, lower-case, split on
    whitespace and drop stop words.

    Args:
        text: raw input string.

    Returns:
        A tuple ``(tokens, padded)`` where ``tokens`` is the list of kept words
        and ``padded`` is the output of ``pad_sequences`` (``maxlen=300``) for the
        tokenizer's index sequence of those words.
    """
    # Raw string: `\S` in a plain string literal is an invalid escape sequence,
    # which newer Python versions warn about.
    cleaned = re.sub(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+', ' ', text)
    review = [word for word in cleaned.lower().split() if word not in stop_words]
    review_tokenized = pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=300)
    return review, review_tokenized

In [None]:
preprocess("the food is not good")

In [None]:
model.predict(preprocess("the food is not good")[1])

In [None]:
def prediction(review):
    """Score one raw text with the loaded model and print its sentiment label.

    Thresholds match the batch labelling used for the test set: scores below
    0.4 are "Negative", scores from 0.4 to 0.6 inclusive are "Neutral", and
    scores above 0.6 are "Positive".

    Args:
        review: raw input string.

    Returns:
        The model's score for the text (first row of ``model.predict``).
    """
    review = preprocess(review)[1]
    score = model.predict(review)
    score = score[0]
    if score < 0.4:
        print("Negative")
    # Previously `score > 0.4 and score < 0.6`, which let a score of exactly 0.4
    # fall through to "Positive".
    elif score <= 0.6:
        print("Neutral")
    else:
        print("Positive")
    return score

In [None]:
prediction("the food is not good")

In [None]:
x_test

In [None]:
scores = model.predict(x_test, verbose=1, batch_size=1024)

In [None]:
print(scores.max())
print(scores.min())

In [None]:
# Min-max rescale the raw model scores so they span [0, 1] before thresholding.
# NOTE(review): if every score is identical the denominator is zero and the
# result is NaN — confirm this cannot happen for the test set.
scores_scaled = (scores - scores.min(axis=0)) / (scores.max(axis=0) - scores.min(axis=0))

In [None]:
# Turn the rescaled scores into labels: > 0.6 positive, 0.4..0.6 (inclusive)
# neutral, everything else negative.
flat_scores = np.asarray(scores_scaled).reshape(-1)
y_pred = np.full(len(scores_scaled), "negative", dtype="<U32")
y_pred[(flat_scores >= 0.4) & (flat_scores <= 0.6)] = "neutral"
y_pred[flat_scores > 0.6] = "positive"
y_pred

In [None]:
def get_results(y_true, y_pred):
  """Keep only the positions where both the answer and the prediction are decided.

  A position is kept when both ``y_true`` and ``y_pred`` hold either
  ``'positive'`` or ``'negative'`` there; anything else (e.g. ``'neutral'``) is
  dropped from both outputs.

  Args:
    y_true: iterable of true labels.
    y_pred: iterable of predicted labels, aligned position-by-position.

  Returns:
    A tuple ``(y_ans_final, predictions_final)`` of two equally long lists, in
    the original order of the inputs.
  """
  decided = ('positive', 'negative')
  y_ans_final = []
  predictions_final = []
  # Walk both sequences positionally in one pass. This keeps the original order
  # (iterating a set of indices does not) and avoids label-based `y_true[i]`
  # lookups when `y_true` is a pandas Series.
  for truth, guess in zip(y_true, y_pred):
    if truth in decided and guess in decided:
      y_ans_final.append(truth)
      predictions_final.append(guess)
  return y_ans_final, predictions_final

In [None]:
results = np.array(get_results(test['sentiment'], y_pred))

In [None]:
print(classification_report(results[0], results[1]))

# Model FNN

In [None]:
from keras import models, layers, regularizers

def build_model_FNN():
    """Build and compile the fully connected baseline classifier.

    The network is a stack of Dense/ReLU layers (512-256-128-64-32-32), batch
    normalisation and a single sigmoid output unit, compiled with Adam,
    ``jaccard_distance_loss`` and the ``accuracy`` metric.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    # NOTE(review): the input is the padded sequence of word indices itself (no
    # embedding layer) — confirm this is the intended baseline.
    model = models.Sequential()
    model.add(Dense(512, activation="relu", input_shape=(len(x_train[0]),)))
    # Only the first layer of a Sequential model defines the input shape; the
    # redundant `input_shape` that used to be on this layer has been removed.
    model.add(Dense(256, activation="relu"))
    model.add(Dense(128, activation="relu"))
    model.add(Dense(64, activation="relu"))
    model.add(Dense(32, activation="relu"))
    model.add(Dense(32, activation="relu"))
    model.add(BatchNormalization())
    model.add(Dense(1, activation="sigmoid"))

    model.compile(optimizer=optimizers.Adam(),
                  loss=jaccard_distance_loss,
                  metrics=['accuracy'])

    return model

In [None]:
fnn_model = build_model_FNN()

In [None]:
# Callbacks and training run for the FNN. Note that this cell rebinds
# `checkpoint`, `reduce_lr` and `model_history`, replacing the objects from the
# LSTM run.

# Save the full model whenever validation accuracy improves.
checkpoint = ModelCheckpoint(
    '/kaggle/working/train_model_FNN.h5',
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
    mode='auto'
)

# Lower the learning rate when the validation loss stops improving for 10 epochs.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=10,
    verbose=1,
    min_lr=5e-5
)

# Train the FNN with 15% of the training data held out for validation.
model_history = fnn_model.fit(
    x_train,
    y_train,
    batch_size=1024,
    epochs=30,
    validation_split=0.15,
    verbose=1,
    callbacks=[checkpoint, reduce_lr])

In [None]:
# The FNN is compiled with metrics=['accuracy'] (and its checkpoint monitors
# 'val_accuracy'), so its history holds 'accuracy' / 'val_accuracy'. Reading the
# LSTM's 'jaccard_coef' keys here would raise a KeyError.
acc = model_history.history['accuracy']
val_acc = model_history.history['val_accuracy']
loss = model_history.history['loss']
val_loss = model_history.history['val_loss']
epochs = range(len(acc))

In [None]:
# Training vs. validation metric per epoch for the FNN run.
# Legend label fixed ('Trainin_acc' -> 'Training_acc') to match the LSTM plot.
plt.plot(epochs,acc,label='Training_acc',color='blue')
plt.plot(epochs,val_acc,label='Validation_acc',color='red')
plt.legend()
plt.title("Training and Validation Accuracy")

In [None]:
# Training vs. validation loss per epoch for the FNN run.
plt.plot(epochs,loss,label='Training_loss',color='blue')
plt.plot(epochs,val_loss,label='Validation_loss',color='red')
plt.legend()
plt.title("Training and Validation loss")

In [None]:
# fnn_model = load_model('/content/train_model_FNN.h5')
# Score the test set with the in-memory FNN and min-max rescale the scores.
fnn_scores = fnn_model.predict(x_test, verbose=1, batch_size=1024)
fnn_scores_scaled = (fnn_scores - fnn_scores.min(axis=0)) / (fnn_scores.max(axis=0) - fnn_scores.min(axis=0))

# Label the rescaled scores: > 0.6 positive, 0.4..0.6 (inclusive) neutral,
# everything else negative.
fnn_flat_scores = np.asarray(fnn_scores_scaled).reshape(-1)
fnn_y_pred = np.full(len(fnn_scores_scaled), "negative", dtype="<U32")
fnn_y_pred[(fnn_flat_scores >= 0.4) & (fnn_flat_scores <= 0.6)] = "neutral"
fnn_y_pred[fnn_flat_scores > 0.6] = "positive"
fnn_y_pred

In [None]:
results = np.array(get_results(test['sentiment'], fnn_y_pred))

In [None]:
print(classification_report(results[0], results[1]))