In [35]:
import pandas as pd
import cudf
import tensorflow as tf
import warnings
import matplotlib.pyplot as plt
import numpy as np
from joblib import Parallel, delayed
import keras
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import log_loss

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings; consider
# narrowing the filter to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures with the interactive widget backend.
%matplotlib widget

In [2]:
# Load the precomputed feature table from the working directory.
# dtype=np.float32 makes pandas read every column as a 32-bit float.
data = pd.read_csv('final_features.csv', dtype=np.float32)

In [3]:
data.shape

(404290, 629)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 404290 entries, 0 to 404289
Columns: 629 entries, Unnamed: 0 to 299_y
dtypes: float32(629)
memory usage: 970.1 MB


In [5]:
data.head(2)

Unnamed: 0.1,Unnamed: 0,id,is_duplicate,cwc_min,cwc_max,csc_min,csc_max,ctc_min,ctc_max,last_word_eq,...,290_y,291_y,292_y,293_y,294_y,295_y,296_y,297_y,298_y,299_y
0,0.0,0.0,0.0,0.99998,0.833319,0.999983,0.999983,0.916659,0.785709,0.0,...,-17.810438,7.231024,1.531186,-7.528822,0.473802,-11.864658,-11.293788,1.866265,3.616046,11.971096
1,1.0,1.0,0.0,0.799984,0.399996,0.749981,0.599988,0.699993,0.466664,0.0,...,23.015827,3.435464,-5.1696,7.102491,34.51688,6.177687,-27.770857,12.926435,-4.564559,33.919834


In [6]:
# Feature matrix: drop 'Unnamed: 0' and 'id' (presumably a saved row index and
# the pair id - verify) together with 'is_duplicate', which is the target.
# NOTE: despite its name this still covers the full dataset; the train/test
# split happens further down.
train_data = data.drop(columns=['Unnamed: 0', 'id', 'is_duplicate'])

In [7]:
# Target vector: the 'is_duplicate' column.
# NOTE: despite its name this holds the labels of the full dataset; the
# train/test split happens further down.
train_labels = data['is_duplicate']

## Splitting the dataset

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 30% of the rows for testing.
# random_state pins the shuffle so the same rows land in each part on every
# kernel restart; stratify keeps the duplicate / non-duplicate ratio equal in
# the training and test parts.
x_train, x_test, y_train, y_test = train_test_split(
    train_data,
    train_labels,
    test_size=0.3,
    random_state=42,
    stratify=train_labels,
)

## Building the model

In [10]:
y_train.shape

(283003,)

In [11]:
x_train.shape

(283003, 626)

In [12]:
# Feed-forward binary classifier: two 128-unit ReLU layers followed by a single
# sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded,
# so the model definition stays valid if feature columns are added or removed.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, input_shape=(x_train.shape[1],), activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

In [13]:
# Configure training: Adam optimizer, binary cross-entropy loss, and accuracy
# reported as an additional metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
# Train on the training split for 10 epochs.
# The returned History object is not assigned, so the cell output ends with
# its repr.
model.fit(x_train, y_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f18dc3aac90>

In [15]:
# Score the model on the held-out split; the result is the loss followed by
# the metrics passed to compile() (here: accuracy).
model.evaluate(x_test, y_test)



[0.36293184757232666, 0.8248039484024048]

In [31]:
def plot_confusion_matrix(test_y, predict_y, labels=None):
    """Draw the confusion, precision and recall matrices as three heatmaps.

    Parameters
    ----------
    test_y : array-like
        Ground-truth class labels.
    predict_y : array-like
        Predicted class labels, aligned element-wise with ``test_y``.
    labels : sequence, optional
        Tick labels for the classes, given in the row/column order of the
        confusion matrix (sorted label order). Defaults to
        ``0 .. n_classes - 1``, which equals the label values for a 0/1 target.

    Returns
    -------
    None
        A new figure with the three heatmaps is created and shown.
    """
    # C[i, j] = number of samples whose true class is i and predicted class is j.
    C = confusion_matrix(test_y, predict_y)

    # Worked example for the two normalisations below:
    #   C = [[1, 2],
    #        [3, 4]]
    #   row sums    (axis=1) = [3, 7]  ->  A = [[1/3, 2/3],
    #                                          [3/7, 4/7]]   (each row sums to 1)
    #   column sums (axis=0) = [4, 6]  ->  B = [[1/4, 2/6],
    #                                          [3/4, 4/6]]   (each column sums to 1)
    row_totals = C.sum(axis=1, keepdims=True)  # samples per true class
    col_totals = C.sum(axis=0, keepdims=True)  # samples per predicted class

    # Recall matrix: every entry divided by the total of its ROW (true class).
    # A class with no samples gets zeros instead of NaN from a 0/0 division.
    A = np.divide(C, row_totals, out=np.zeros(C.shape, dtype=float),
                  where=row_totals != 0)

    # Precision matrix: every entry divided by the total of its COLUMN
    # (predicted class). A class that is never predicted gets zeros, not NaN.
    B = np.divide(C, col_totals, out=np.zeros(C.shape, dtype=float),
                  where=col_totals != 0)

    if labels is None:
        # confusion_matrix orders classes by sorted label value, so for a 0/1
        # target the class index is the label itself.
        labels = list(range(C.shape[0]))

    cmap = sns.light_palette("blue")
    fig, axes = plt.subplots(1, 3, figsize=(20, 4))

    # (matrix, annotation format, title) for each panel, left to right.
    # Raw counts are integers, the normalised matrices are fractions.
    panels = [
        (C, "d", "Confusion matrix"),
        (B, ".3f", "Precision matrix"),
        (A, ".3f", "Recall matrix"),
    ]
    for ax, (matrix, fmt, title) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, cmap=cmap, fmt=fmt,
                    xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_xlabel('Predicted Class')
        ax.set_ylabel('Original Class')
        ax.set_title(title)

    plt.show()

In [27]:
# Turn the model's outputs for the test split into hard class decisions.
# reshape(-1) flattens the prediction array without hard-coding the number of
# test rows, so the cell keeps working when the split size changes; values
# above the 0.5 cut-off become True (duplicate), the rest False.
y_pred = model.predict(x_test).reshape(-1) > 0.5

In [28]:
# Hard class decisions for the training split: flatten the model's outputs to
# one dimension and flag every value above the 0.5 cut-off.
y_train_pred = np.ravel(model.predict(x_train)) > 0.5

In [32]:
# Heatmaps for the training split. plot_confusion_matrix expects the true
# labels first and the predictions second; passing them the other way round
# transposes the confusion matrix and swaps the precision and recall panels.
plot_confusion_matrix(y_train, y_train_pred)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [33]:
# Confusion / precision / recall heatmaps for the held-out test split
# (true labels first, predictions second).
plot_confusion_matrix(y_test, y_pred)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [34]:
# Report the evaluation result (loss, then accuracy) for the training split
# followed by the test split, on a single line.
train_scores = model.evaluate(x_train, y_train)
test_scores = model.evaluate(x_test, y_test)
print(train_scores, test_scores)

[0.32286784052848816, 0.842372715473175] [0.36293184757232666, 0.8248039484024048]
