In [39]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import json
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords  
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.linear_model import SGDClassifier
from collections import Counter
import nlpaug.augmenter.word as naw
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score

In [2]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

## Data Loading & Preprocessing

In [2]:
faq_file = open("./faq.txt").read()
data = json.loads(faq_file)
df = pd.json_normalize(data)

In [3]:
df.head()

Unnamed: 0,question,answer,found_duplicate
0,How do I change my password?,"After you have logged in, you can change your ...",False
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...,False
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ...",False
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...,False
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us...",False


In [4]:
df.drop(["found_duplicate"],axis=1,inplace=True)

In [5]:
df.isnull().sum()

question    0
answer      0
dtype: int64

In [6]:
df.head()

Unnamed: 0,question,answer
0,How do I change my password?,"After you have logged in, you can change your ..."
1,When will I receive my changed ATM PIN?,You will receive your new ATM PIN by post with...
2,Can I get my newly generated PIN online?,"No, for security reasons we send you your ATM ..."
3,How can I register for Autopay?,To register for Autopay: Step 1: Click on the ...
4,Can Chip Credit cards be used anywhere?,"Yes, your HDFC Bank Chip Credit card can be us..."


In [7]:
stop_words=stopwords.words('english')

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_data(text):
    text=text.lower() #lower the text
    text = re.sub(r'[^\w\s]', '', text) #remove irrelevant characters    
    text = text.split() #convert sentence to tokens
    text = [lemmatizer.lemmatize(word) for word in text] #lemmatization
    text = " ".join(text) #converting tokens to sentence
    return text

In [12]:
df["question"] = df["question"].apply(clean_data)

In [12]:
# model_type: word2vec, glove or fasttext
aug = naw.WordEmbsAug(
    model_type='glove', model_path="./glove.6B.100d.txt",
    action="substitute")

In [26]:
aug_data = {}
for ques,ans in tqdm(zip(df['question'],df['answer'])):
    for i in range(4):
        aug_data.update({aug.augment(ques):ans})

2236it [14:43,  2.53it/s]


In [57]:
aug_df = pd.DataFrame(aug_data.items(),columns=['question','answer'])
aug_df.head()

Unnamed: 0,question,answer
0,how cannot i change my password,"After you have logged in, you can change your ..."
1,how do got change my password,"After you have logged in, you can change your ..."
2,how do i change kids password,"After you have logged in, you can change your ..."
3,how things i change my password,"After you have logged in, you can change your ..."
4,when will i receive my changed pcmcia stripe,You will receive your new ATM PIN by post with...


In [61]:
final_df = pd.concat([df,aug_df])

In [94]:
final_df.to_csv("augmented.csv",index=False)

## Modelling

In [14]:
final_df = pd.read_csv("augmented.csv")

In [15]:
X = final_df['question']
y = final_df['answer']

In [16]:
le = LabelEncoder()

In [17]:
y = le.fit_transform(y)

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=100,test_size=0.2,stratify=y)

In [19]:
tf = TfidfVectorizer(ngram_range=(1, 3),min_df=0,stop_words='english')
X_train_tf = tf.fit_transform(X_train)

In [20]:
X_test_tf = tf.transform(X_test)

In [21]:
model = SGDClassifier(n_jobs=-1,random_state=100,loss='modified_huber',alpha=0.0005)
model.fit(X_train_tf,y_train)

SGDClassifier(alpha=0.0005, loss='modified_huber', n_jobs=-1, random_state=100)

In [23]:
y_pred = model.predict(X_test_tf)

In [40]:
labels = np.unique(y_test)
ytest_prob = label_binarize(y_test, classes=labels)
ypred_prob = label_binarize(y_pred, classes=labels)

In [53]:
print("Accuracy Score:",accuracy_score(y_test,y_pred))
print("Precision Score:",precision_score(y_test,y_pred,average='micro'))
print("Recall Score:",recall_score(y_test,y_pred,average='micro'))
print("ROC-AUC Score:",roc_auc_score(ytest_prob,ypred_prob,multi_class='ovo',average='micro'))

Accuracy Score: 0.8125279642058165
Precision Score: 0.8125279642058165
Recall Score: 0.8125279642058165
ROC-AUC Score: 0.906220545579323


## Testing

In [38]:
idx = 7
print(f"Question: {X_test.iloc[idx]}")
print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(X_test_tf[idx]))[0]}")
print(f"\nActual Answer:\n{le.inverse_transform([y_test[idx]])[0]}")

Question: do i need a coapplicant for the loan education loan for indian education

Predicted Answer:
Yes a co applicant is required for all full time courses. Co-applicant could be Parent/ Guardian or Spouse/ Parent-in-law (if married)

Actual Answer:
Yes a co applicant is required for all full time courses. Co-applicant could be Parent/ Guardian or Spouse/ Parent-in-law (if married)


In [88]:
questn = "how to open new savings account"
clean_ques = clean_data(questn)
clean_ques = tf.transform([clean_ques])
print(f"Question: {questn}")
if np.amax(model.predict_proba(clean_ques))>0.1:
    print(f"\nPredicted Answer:\n{le.inverse_transform(model.predict(clean_ques))[0]}")
else:
    print(f"\nPredicted Answer:\n(Not sure about your question, This might help you):\n\n{le.inverse_transform(model.predict(clean_ques))[0]}")

Question: how to open new savings account

Predicted Answer:
In order to open a new Savings Account, simply walk into the nearest HDFC Bank and speak to a customer service executive. Remember to carry the following documents (original for verification and self-attested copies for submission):Identity ProofAddress ProofLatest passport size photographsClick here to see the List of valid identity/addres proof.
