## Dataset :
The dataset contains questions and their corresponding answers.

## Step 1: Import libraries and files

* __Pandas__ : To get dataset in the form of dataframe
* __nltk__ : An NLP library containing packages that help machines understand human language and reply to it with an appropriate response.

In [1]:
import pandas as pd
import numpy as np
import pickle
import operator
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split as tts
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder as LE
from sklearn.metrics.pairwise import cosine_similarity
import random
import nltk
from nltk.stem.lancaster import LancasterStemmer
from tfidfvectorgenerator import TfidfVectorGenerator
from doc2vecgenerator import Doc2VecGenerator
from sent2vecgenerator import Sent2VecGenerator
from bertgenerator import BertGenerator

## Step 2: Train the model

* First we'll tokenize each word from the dataset.
* After we tokenize, we will start cleaning up the tokens by Lemmatizing, removing the stopwords and removing the punctuations. Lemmatizing is the process of converting a word into its root form. 
* For example, words, like run, ran and running all convey the same meaning and hence don’t need to be considered as different words, lemmatizing will reduce all the words to run. 
* Stopwords are the most frequent words used in natural language, such as ‘a’, ‘is’ and ‘what’; they do not add any value to the capability of the text classifier, so we remove them as well.

In [None]:
class FaqEngine:
    """Answer FAQ-style queries from a set of CSV files.

    The engine reads every CSV in ``faqslist`` (each must provide the
    ``Question``, ``Answer`` and ``Class`` columns), trains a linear SVC that
    maps a vectorized question to its ``Class``, and at query time returns the
    ``Answer`` of the question in the predicted class that is most similar
    (cosine similarity) to the user's text.
    """

    def __init__(self, faqslist, type):
        """Set up the preprocessing helpers and train the model.

        Args:
            faqslist: Iterable of CSV paths to load with ``pandas.read_csv``.
            type: Key selecting the vectorizer: ``"tfidf"``, ``"doc2vec"``,
                ``"bert"`` or ``"sent2vec"``. (The name shadows the builtin
                but is kept so existing keyword callers keep working.)

        Raises:
            KeyError: If ``type`` is not one of the keys above.
        """
        self.faqslist = faqslist
        self.stemmer = LancasterStemmer()
        self.le = LE()
        self.vectorizers = {"tfidf": TfidfVectorGenerator(),
                            "doc2vec": Doc2VecGenerator(),
                            "bert": BertGenerator(),
                            "sent2vec": Sent2VecGenerator()}
        self.build_model(type)

    def cleanup(self, sentence):
        """Tokenize ``sentence`` and return its stemmed tokens joined by spaces."""
        print('cleanup')
        tokens = nltk.word_tokenize(sentence)
        return ' '.join(self.stemmer.stem(token) for token in tokens)

    def build_model(self, type):
        """Load the FAQ data and fit the question classifier.

        Populates ``self.vectorizer``, ``self.data``, ``self.questions`` and
        ``self.model``, and prints the SVC accuracy on a 25% hold-out split.

        Args:
            type: Key into ``self.vectorizers`` choosing the vectorizer.
        """
        print('build_model')

        self.vectorizer = self.vectorizers[type]

        # Read every CSV, drop incomplete rows, and stack them with a fresh
        # 0..n-1 index so row labels are unique across files.
        frames = [pd.read_csv(csvfile).dropna() for csvfile in self.faqslist]
        self.data = pd.concat(frames, ignore_index=True)

        # Keep the whole column. Indexing it with [0] would leave only the
        # first question string, the loop below would then clean its
        # individual characters, and X would no longer line up with y.
        self.questions = self.data['Question'].values

        questions_cleaned = [self.cleanup(question) for question in self.questions]
        X = self.vectorizer.vectorize(questions_cleaned)

        # Encode the class names as integer labels for the classifier.
        y = self.le.fit_transform(self.data['Class'].values.tolist())

        # Fixed random_state keeps the train/test split reproducible.
        trainx, testx, trainy, testy = tts(X, y, test_size=.25, random_state=42)

        self.model = SVC(kernel='linear')
        self.model.fit(trainx, trainy)
        print("SVC:", self.model.score(testx, testy))

    def query(self, usr):
        """Return the stored answer that best matches the user's text.

        Args:
            usr: The user's question.

        Returns:
            The ``Answer`` of the most similar question in the predicted
            class, or a "Could not follow your question" message when
            anything fails or the predicted class has no questions.
        """
        fallback = f"Could not follow your question [{usr}], Try again"
        try:
            cleaned_usr = self.cleanup(usr)
            t_usr_array = self.vectorizer.query(cleaned_usr)
            prediction = self.model.predict(t_usr_array)[0]
            class_ = self.le.inverse_transform([prediction])[0]
            questionset = self.data[self.data['Class'] == class_]

            scores = []
            for question in questionset['Question']:
                question_arr = self.vectorizer.query(self.cleanup(question))
                sims = cosine_similarity(question_arr, t_usr_array)
                # cosine_similarity returns a 2-D array; reduce it to a plain
                # float so ranking does not rely on numpy array truthiness.
                scores.append(float(np.max(sims)))

            print("scores " + str(scores))
            if not scores:
                # No candidate questions: answer explicitly instead of
                # falling off the end and returning None.
                return fallback

            # First index holding the highest score.
            ind = max(range(len(scores)), key=scores.__getitem__)
            print(ind)
            print(questionset.index[ind])
            # questionset keeps self.data's row labels, so the label at
            # position ind addresses the matching answer directly.
            return self.data['Answer'][questionset.index[ind]]
        except Exception as e:
            # Boundary of the chat loop: report the problem and degrade to a
            # friendly message rather than crashing the caller.
            print(e)
            return fallback
    
if __name__ == "__main__":
    # Smoke run: train on the bundled FAQ files with the BERT vectorizer and
    # answer one greeting. The '1'/'2' prints mark progress around the
    # engine's construction.
    print('1')
    engine = FaqEngine(["faqs/Greetings.csv", "faqs/GSTFAQs.csv"], 'bert')
    print('2')
    print(engine.query("Hi"))


1
