In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding, MultiHeadAttention, Dense, InputLayer, Input, Dot, GlobalAveragePooling1D
from keras.losses import BinaryCrossentropy
from keras.activations import sigmoid, relu
from keras.models import Model
from keras.utils import plot_model, model_to_dot
import pydot
import graphviz
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




In [None]:
# Load the question-pair dataset from a CSV file in the working directory.
df = pd.read_csv("questions.csv")



In [None]:
# Preview the first rows, then summarise column dtypes and non-null counts.
print(df.head())
print("\n-------------------------------\n")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()


In [None]:
# Report missing values per column before and after dropping incomplete rows.
# Only the last expression of a cell is rendered, so the "before" counts have
# to be printed explicitly; as a bare expression they were silently discarded.
print(df.isnull().sum())
df.dropna(inplace=True)
df.isnull().sum()


In [None]:
# Count the rows whose question1 text already appeared in an earlier row.
df["question1"].duplicated().sum()


In [None]:
# Show how many rows carry each value of the is_duplicate label.
df["is_duplicate"].value_counts()


In [None]:
# Pool both question columns into a single Series (all question1 texts first,
# then all question2 texts) and report how many distinct texts it holds.
all_question_texts = df["question1"].to_list()
all_question_texts.extend(df["question2"].to_list())
ddf = pd.Series(all_question_texts)
np.unique(ddf).shape


In [None]:
# Flag every question text that occurs more than once across both columns,
# then report how many such texts there are.
question_frequency = ddf.value_counts()
x = question_frequency.gt(1)
x[x].shape


In [None]:
# df.drop(columns=["id", "qid1","qid2"], inplace = True)

# Work on the first 30,000 rows only. Take an explicit copy: a bare slice may
# be a view of `df`, so adding a column to it later triggers pandas'
# SettingWithCopyWarning and the write is not guaranteed to behave as intended.
df_small = df[:30000].copy()
df_small.shape


In [None]:
# Re-check the (rows, columns) shape of the working subset.
df_small.shape


In [None]:
# Flat list of every question text in the subset: the question1 texts come
# first, followed by the question2 texts in the same row order.
first_questions = df_small["question1"].to_list()
second_questions = df_small["question2"].to_list()
questions_list = first_questions + second_questions
len(questions_list)

In [None]:
# Occurrence count of each distinct question text, as a plain NumPy array.
question_occurrences = pd.Series(questions_list).value_counts()
q_counts = question_occurrences.to_numpy()

In [None]:
# Histogram of how often each distinct question text occurs. The y-axis is
# logarithmic because most questions appear only once or twice.
plt.hist(q_counts, bins = 160)
plt.yscale('log')
# Label the figure so it can be read on its own when skimming the notebook.
plt.xlabel("Occurrences of a question text")
plt.ylabel("Number of distinct questions (log scale)")
plt.title("Question repetition in the 30k-pair subset")
plt.show()


In [None]:
# Tokenizer settings: vocabulary cap and the placeholder used for
# out-of-vocabulary words.
vocab_size = 20000
oov = "oov"

# Build the tokenizer and learn its word index from every question text.
tokenizer = Tokenizer(oov_token=oov, num_words=vocab_size)
tokenizer.fit_on_texts(questions_list)


In [None]:
# Convert every question text into its sequence of integer token ids.
tokenized_questions = tokenizer.texts_to_sequences(questions_list)
len(tokenized_questions)


In [None]:
# Number of tokenized question sequences.
len(tokenized_questions)

In [None]:
# Token count of every tokenized question.
text_lens = list(map(len, tokenized_questions))

# Positions of the questions that are empty or longer than 20 tokens.
oversized = [
    position
    for position, n_tokens in enumerate(text_lens)
    if not 1 <= n_tokens <= 20
]

len(oversized)




In [None]:
# Recompute the per-question token counts and show the shortest one.
text_lens = list(map(len, tokenized_questions))
min(text_lens)


In [None]:

# Distribution of question lengths, measured in tokens.
fig, ax = plt.subplots()
ax.hist(text_lens, bins=100)
plt.show()

In [None]:
# Bring every sequence to exactly 40 token ids, padding at the end.
padded_sequences = pad_sequences(
    tokenized_questions,
    padding="post",
    maxlen=40,
)
padded_sequences.shape

In [None]:
# Two-input duplicate-question model. Both questions go through the SAME
# embedding, self-attention and pooling layers (shared weights); the cosine
# similarity of the two pooled vectors is then mapped to a probability.
q1_input = Input(shape=(40,), name="q1")
q2_input = Input(shape=(40,), name="q2")

# The tokenizer was created with num_words=vocab_size, so texts_to_sequences
# never emits an id >= vocab_size; embedding rows beyond that would never be
# looked up. `len(word_counts) + 2` covers the padding id 0 and the OOV id when
# the fitted vocabulary is smaller than the cap.
embedding_rows = min(vocab_size, len(tokenizer.word_counts) + 2)

# `input_length` is deprecated in recent Keras releases and is not needed:
# the sequence length is already fixed by the Input layers above.
embedding = Embedding(embedding_rows, 512, name="Embedding")

q1_embedding = embedding(q1_input)
q2_embedding = embedding(q2_input)

# NOTE(review): padding id 0 is not masked, so padded positions take part in
# both the attention and the average pooling below.
attention = MultiHeadAttention(num_heads=8, key_dim=64, dropout=0.2)

# Self-attention: each question attends only to its own tokens.
q1_attended = attention(query=q1_embedding, key=q1_embedding, value=q1_embedding)
q2_attended = attention(query=q2_embedding, key=q2_embedding, value=q2_embedding)

# Collapse the sequence axis into one fixed-size vector per question.
pooling = GlobalAveragePooling1D()

q1_pooled = pooling(q1_attended)
q2_pooled = pooling(q2_attended)

# normalize=True L2-normalises both vectors, making the dot product a cosine
# similarity.
similarity = Dot(axes=1, normalize=True)([q1_pooled, q2_pooled])

# A single sigmoid unit rescales the similarity into a duplicate probability.
fc_layer = Dense(units=1, activation=sigmoid)(similarity)

model = Model(inputs=[q1_input, q2_input], outputs=fc_layer, name="embedder_model")

In [None]:

# Print the layer-by-layer summary of the two-input model.
model.summary()

In [None]:
# Render the model graph with tensor shapes and layer names.
plot_model(model, show_shapes=True, show_layer_names=True)

In [None]:
from sklearn.preprocessing import StandardScaler

# padded_sequences stacks every question1 row above every question2 row, so
# cutting it into two equal halves recovers the two row-aligned inputs.
new_data = np.vsplit(padded_sequences, 2)
q1, q2 = new_data


# q1_df = pd.DataFrame(q1)
# q2_df = pd.DataFrame(q2)





In [None]:
scaler = StandardScaler()


def _scale_each_row(rows):
    """Standardise every row on its own: the scaler is refit per row."""
    scaled = []
    for row in rows:
        as_column = row.reshape(-1, 1)
        scaled.append(scaler.fit_transform(as_column).flatten())
    return np.array(scaled)


# NOTE(review): the values being scaled are token ids; the scaled copies are
# not the arrays passed to model.fit in the cells visible here.
q1_sc = _scale_each_row(q1)
q2_sc = _scale_each_row(q2)

# Sanity check: standard deviation of the first scaled row.
q1_sc[0].std()

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
# res = model.predict([q1[:100], q2[:100]])

# print(res)

In [None]:
# Duplicate labels of the subset as a standalone NumPy array.
y = df_small["is_duplicate"].to_numpy(copy=True)
y.shape

In [None]:
# Shape of the stacked (question1 rows, then question2 rows) padded id matrix.
padded_sequences.shape

In [None]:
# Train the two-input model; the last 10% of the samples are held out for
# validation by Keras.
history = model.fit(
    x=[q1, q2],
    y=y,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
)


In [None]:
# Join both questions of a pair into one text, separated by a marker token.
# .assign() returns a new DataFrame instead of writing a column into what may
# be a slice of `df`, which avoids pandas' SettingWithCopyWarning.
df_small = df_small.assign(
    q_pair=df_small["question1"] + " EndOfQuestion1 " + df_small["question2"]
)

# Positional access: the first row is shown even if index label 0 was removed
# by an earlier dropna(), and chained label indexing is avoided.
df_small["q_pair"].iloc[0]

# New Strategy

In [None]:
# Second tokenizer (no vocabulary cap, no OOV token), fitted on the joined
# question-pair texts.
tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(df_small["q_pair"])


In [None]:
# Convert every joined question pair into its sequence of integer token ids.
q_pair_tokenized = tokenizer2.texts_to_sequences(df_small["q_pair"])
# Preview a handful of sequences only: echoing the whole list writes one entry
# per pair into the cell output and bloats the notebook.
q_pair_tokenized[:5]

In [None]:
# Bring every pair sequence to exactly 80 token ids, padding at the end.
padded_q_pairs = pad_sequences(
    q_pair_tokenized,
    padding="post",
    maxlen=80,
)
padded_q_pairs

In [None]:
# Standardise each padded pair independently (the scaler is refit per row).
# NOTE(review): the inputs are integer token ids; the scaled output is
# floating point and no longer usable as ids for an Embedding lookup.
scaler = StandardScaler()
scaled_rows = []
for padded_q_pair in padded_q_pairs:
    as_column = padded_q_pair.reshape(-1, 1)
    scaled_rows.append(scaler.fit_transform(as_column).flatten())
scaled_q_pairs = np.array(scaled_rows)
# q1 = np.array([scaler.fit_transform(row.reshape(-1, 1)).flatten() for row in q1])


In [None]:
# Distribution of the UNPADDED pair lengths, in tokens. Measuring the padded
# array is uninformative: pad_sequences(maxlen=80) makes every row exactly 80
# long, so the histogram collapsed into a single bar.
q_pair_lens = [len(qp) for qp in q_pair_tokenized]
plt.hist(q_pair_lens, bins=100)
plt.xlabel("Tokens per question pair")
plt.ylabel("Number of pairs")
# plt.xlim((0,100))
plt.show()

In [None]:
from keras.layers import Flatten

# Single-input model for the joined question pair:
# embedding -> self-attention -> flatten -> dense(64, relu) -> sigmoid output.
qp_vocab_rows = len(tokenizer2.word_counts) + 2

qp_ip = Input(shape=(80,), name="qp_ip_layer")
qp_emb = Embedding(
    input_dim=qp_vocab_rows,
    output_dim=256,
    name="qp_emb_layer",
)(qp_ip)

# Self-attention over the tokens of the joined pair.
qp_self_attention = MultiHeadAttention(num_heads=8, key_dim=64, name="qp_mha_layer")
qp_att = qp_self_attention(query=qp_emb, key=qp_emb, value=qp_emb)

# Flatten the (sequence, features) output into one vector per sample.
qp_flatten = Flatten()(qp_att)

qp_dense1 = Dense(64, activation=relu)(qp_flatten)
qp_dense2 = Dense(1, activation=sigmoid)(qp_dense1)

qp_model = Model(inputs=[qp_ip], outputs=[qp_dense2], name="qp_model")

qp_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
qp_model.summary()



In [None]:
# Render the pair-model graph with tensor shapes.
plot_model(qp_model, show_shapes=True)

In [None]:
# qp_model starts with an Embedding layer, which looks rows up by integer token
# id, so it must be fed the raw padded ids. The standardised copy
# (scaled_q_pairs) holds negative, fractional values that are not valid ids.
X = padded_q_pairs
y = df_small["is_duplicate"]

# Fixed seed so the split (and therefore the reported metrics) is reproducible,
# matching the seeded split used for the bag-of-words models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:

# Train the pair model; the last 10% of the training samples are held out for
# validation by Keras.
qp_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
)

In [None]:
# Bag-of-words baseline: count vectors limited to 4000 features, fitted on
# every question text (question1 texts first, then question2 texts).
cv = CountVectorizer(max_features=4000)

vectorized_samples = cv.fit_transform(questions_list)

In [None]:
# Densify the count matrix and cut it into the question1 half and the
# question2 half. This rebinds q1/q2, which previously held padded token ids.
q1, q2 = np.vsplit(vectorized_samples.toarray(),2)

In [None]:
# Inspect the question1 count matrix.
q1

In [None]:
# q1/q2 were vectorized from df_small's questions, so they hold one row per
# df_small row and must carry df_small's index. Using df.index (the full
# dataset) raises a length-mismatch ValueError whenever df has more rows than
# the subset.
temp1 = pd.DataFrame(q1, index=df_small.index)
temp2 = pd.DataFrame(q2, index=df_small.index)

In [None]:
# Place the question1 and question2 count vectors side by side, one row per
# pair. Both halves carry the same default integer column labels, so the
# labels appear twice in final_df.
final_df = pd.concat([temp1, temp2], axis=1)

In [None]:
# Inspect the combined feature frame.
final_df

In [None]:
# Attach the duplicate label; pandas aligns the assigned Series to final_df's
# rows by index label.
final_df["is_duplicate"] = df["is_duplicate"]
final_df


In [None]:
# Separate the label from the features, then hold out 20% of the pairs for
# testing with a fixed seed.
y = final_df["is_duplicate"]
X = final_df.drop(columns=["is_duplicate"])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random-forest baseline on the bag-of-words features. The forest is seeded so
# the reported accuracy is reproducible, consistent with the seeded
# train/test split it is trained on.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Score the random forest on the held-out pairs.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy

In [None]:
# Gradient-boosted baseline with default hyper-parameters.
xgb = XGBClassifier()


In [None]:
# Train on plain NumPy arrays (.values) rather than on the labelled DataFrame.
xgb.fit(X_train.values, y_train.values)

In [None]:
# Score the boosted model on the held-out pairs.
y_pred2 = xgb.predict(X_test.values)
accuracy2 = accuracy_score(y_true=y_test, y_pred=y_pred2)
accuracy2