In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Location of the training split, relative to this notebook.
TRAIN_CSV_PATH = "../dataset/train.csv"

# Load the raw training data into a DataFrame.
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Dataset Description

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Show 10 random rows. The fixed seed keeps the displayed sample identical
# across "Restart Kernel -> Run All" executions.
df.sample(10, random_state=42)

In [None]:
# Per-column summary: names, non-null counts and dtypes.
df.info()

In [None]:
# Report how many values are missing in each column. The counts must be
# printed explicitly: a cell only auto-displays its last expression, and the
# last statement here is an assignment.
print(df.isnull().sum())

# Drop every row that contains at least one missing value.
df = df.dropna()

In [None]:
# Lighter dataset: uncomment the next line to work on a random 40,000-row sample
#df = df.sample(40000)

In [None]:
# Keep only the two question-text columns. The explicit .copy() makes this an
# independent frame, so the column assignments performed on it later in the
# notebook do not write into a slice of `df` (avoids SettingWithCopyWarning).
questions_df = df[['question1', 'question2']].copy()

In [None]:
!pip install contractions
!pip install session_info

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
import re
import contractions

def tokenize(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def remove_stopwords(text):
    """Return ``text`` with English stopwords removed.

    The text is tokenized with ``tokenize``; every token whose case-folded
    form is in NLTK's English stopword list is dropped, and the remaining
    tokens are joined back together with single spaces.
    """
    # Build the stopword set once and keep it on the function object: this
    # function is applied to every question, so rebuilding the set on each
    # call is wasted work.
    stopword_set = getattr(remove_stopwords, "_stopword_set", None)
    if stopword_set is None:
        stopword_set = frozenset(stopwords.words("english"))
        remove_stopwords._stopword_set = stopword_set

    kept_words = [word for word in tokenize(text) if word.casefold() not in stopword_set]
    return ' '.join(kept_words)

def lemmatize(text):
    """Lemmatize each token of ``text`` as a verb and re-join with spaces."""
    wordnet = WordNetLemmatizer()
    # pos='v' asks the lemmatizer to treat every token as a verb.
    return ' '.join(wordnet.lemmatize(token, pos='v') for token in tokenize(text))

def preprocessing(text):
    """Normalize one question for vectorization and return the cleaned string.

    Steps, in order: coerce the input to ``str`` and replace every decimal
    number (digits, a dot, digits) with "1"; expand contractions; lemmatize;
    remove stopwords; strip punctuation characters; lowercase.
    """
    # Raw string literal: '\.' inside a normal string is an invalid escape
    # sequence and raises a warning on recent Python versions.
    text = re.sub(r'[0-9]+\.[0-9]+', "1", str(text))  # collapse decimals to the placeholder "1"
    expanded_txt = contractions.fix(text)  # expand shortened forms such as "we'd"
    lem_txt = lemmatize(expanded_txt)  # reduce words to their base form
    free_stopwords_txt = remove_stopwords(lem_txt)  # drop stopwords
    # Punctuation is removed character by character, then the text is lowercased.
    return ''.join([c for c in free_stopwords_txt if c not in punctuation]).lower()

#preprocessing("youssef she's got 5.6 new dog this ?")
# Clean both question columns with the same preprocessing pipeline.
for _question_col in ('question1', 'question2'):
    questions_df[_question_col] = questions_df[_question_col].apply(preprocessing)

In [None]:
# Display the session report produced by session_info
# (presumably package/version details, for reproducibility -- confirm).
import session_info
session_info.show()

In [None]:
#print(questions_df['question1'])
#print(questions_df['question2'])
# ddf = pd.concat([questions_df['question1'],questions_df['question2']])
# print(ddf['question1'])
# print(ddf['question2'])


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import scipy
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import losses 
from tensorflow.keras import optimizers
from tensorflow.keras import metrics


# --- Bag-of-words features ---------------------------------------------------
# One vectorizer is fitted on both question columns together, so the two
# halves of every feature vector share the same word -> column vocabulary.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
new_df = pd.concat((questions_df['question1'], questions_df['question2']))
cv = count_vect.fit_transform(new_df)
trainq1_trans = count_vect.transform(questions_df['question1'].values.astype(str))
trainq2_trans = count_vect.transform(questions_df['question2'].values.astype(str))

# Each sample is [counts(question1) | counts(question2)] placed side by side.
labels = df['is_duplicate'].values
x = scipy.sparse.hstack((trainq1_trans, trainq2_trans))
y = labels
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

# --- Model -------------------------------------------------------------------
# Small fully connected network ending in one sigmoid unit.
keras_model = Sequential()
keras_model.add(Dense(12, input_dim=x.shape[1], activation='relu'))
keras_model.add(Dense(8, activation='relu'))
keras_model.add(Dense(1, activation='sigmoid'))

# A single sigmoid output is a binary classifier:
#  * binary_crossentropy is the matching loss for it;
#  * binary_accuracy thresholds the output at 0.5. categorical_accuracy takes
#    an argmax over the last axis, which has length 1 here, so it would report
#    the same value regardless of what the model predicts.
keras_model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=[metrics.binary_accuracy])

keras_model.fit(x_train, y_train, epochs=50, batch_size=10)

# Model outputs for the held-out split (one value per test row).
y_predict = keras_model.predict(x_test)

In [None]:
from sklearn.metrics import f1_score, classification_report, accuracy_score
# Persist the trained network to disk.
keras_model.save("keras_model.h5")

# Show the model outputs currently held in y_predict as one flat array.
y_predict = np.array(y_predict).flatten()
print(y_predict)

# Turn model outputs into hard 0/1 labels using a 0.5 cut-off.
y_train_predict = (keras_model.predict(x_train) >= 0.5).astype(int)
y_predict = (keras_model.predict(x_test) >= 0.5).astype(int)

# Macro-averaged F1 on both splits, plus test accuracy as a percentage.
train_f1 = f1_score(y_train, y_train_predict, average='macro')
test_f1 = f1_score(y_test, y_predict, average='macro')
test_accuracy_pct = round(accuracy_score(y_test, y_predict)*100, 2)

print('training F1 score:', train_f1)
print('testing F1 score:', test_f1)

print("\ntesting Accuracy: ", str(test_accuracy_pct),'%')

print('\ntesting report: ')
print(classification_report(y_test, y_predict))

In [None]:
import xgboost as xgb
# Gradient-boosted tree classifier trained on the same train split.
# `verbosity=0` is the current way to silence XGBoost; the old `silent=1`
# flag is no longer an XGBClassifier parameter in current releases.
xgb_model = xgb.XGBClassifier(
    max_depth=100,
    n_estimators=90,
    learning_rate=0.3,
    colsample_bytree=.7,
    gamma=0,
    reg_alpha=3,
    objective='binary:logistic',
    verbosity=0,
    subsample=0.8,
)
xgb_model.fit(x_train, y_train)

# Hard class predictions for the held-out split.
xgb_prediction = xgb_model.predict(x_test)

In [None]:
from sklearn.metrics import f1_score, classification_report, accuracy_score
# Macro-averaged F1 on both splits, plus test accuracy as a percentage.
xgb_train_f1 = f1_score(y_train, xgb_model.predict(x_train), average='macro')
xgb_test_f1 = f1_score(y_test, xgb_prediction, average='macro')
xgb_test_accuracy_pct = round(accuracy_score(y_test, xgb_prediction)*100, 2)

print('training F1 score:', xgb_train_f1)
print('testing F1 score:', xgb_test_f1)

print("\ntesting Accuracy: ", str(xgb_test_accuracy_pct),'%')

print('\ntesting report: ')
print(classification_report(y_test, xgb_prediction))