In [None]:

import nltk
import pandas as pd
import numpy as np
from scipy import spatial
from sklearn import metrics
from nltk.stem import PorterStemmer
import gensim.models.keyedvectors as word2vec
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


def load_faq_csv(file_name):
    """Read the CSV at ``file_name`` and return it as a pandas DataFrame.

    ``file_name`` is handed straight to ``pandas.read_csv``, so anything that
    function accepts (path, URL, file-like object) works here too.
    """
    return pd.read_csv(file_name)

# Load pretrained word2vec vectors (binary format) using the gensim library
def load_word2vec(path):
    """Load word vectors stored in the binary word2vec format at ``path``.

    Returns whatever gensim's ``KeyedVectors.load_word2vec_format`` returns
    for that file.
    """
    loader = word2vec.KeyedVectors.load_word2vec_format
    return loader(path, binary=True)

# Build the set of words in the model's vocabulary (used for fast membership tests)
def convertVector_to_word(model):
    """Return the model's vocabulary as a set of words.

    Works with both gensim API generations:

    * gensim < 4: the vocabulary list is ``index2word`` and is reached
      through ``model.wv``.
    * gensim >= 4: the list was renamed ``index_to_key`` and a bare
      ``KeyedVectors`` object no longer has a ``.wv`` attribute.

    Parameters:
        model: a gensim ``KeyedVectors`` instance, or a full model exposing
            its vectors as ``model.wv``.

    Returns:
        set: every word known to the model.

    Raises:
        AttributeError: if neither vocabulary attribute is present.
    """
    # Full models wrap their vectors in ``.wv``; plain KeyedVectors (gensim 4)
    # are the vectors themselves.
    vectors = getattr(model, "wv", model)
    vocabulary = getattr(vectors, "index_to_key", None)
    if vocabulary is None:
        # Pre-4.0 attribute name.
        vocabulary = vectors.index2word
    return set(vocabulary)

# Remove stop words like "I,this,that,these,and,the etc."
def remove_stopwords(words):
    """Return the items of ``words`` that are not NLTK English stop words.

    The comparison is exact (case-sensitive), so callers should lowercase
    first. Needs the NLTK ``stopwords`` corpus to be available locally
    (``nltk.download('stopwords')``).
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in words:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

def lowercase(words):
    """Return a new list holding the lowercased form of every item in ``words``."""
    lowered = []
    for token in words:
        lowered.append(token.lower())
    return lowered

# stemming of word
def stem(words):
    """Return a list with the Porter stem of every item in ``words``."""
    stemmer = PorterStemmer()
    stemmed = []
    for token in words:
        stemmed.append(stemmer.stem(token))
    return stemmed

def clean_text(sentence):
    """Tokenise ``sentence`` and return its lowercased, stop-word-free tokens.

    The input is coerced with ``str()`` and split on whitespace, then each
    token is lowercased and English stop words are dropped. Stemming is
    deliberately not applied here.
    """
    tokens = str(sentence).split()
    return remove_stopwords(lowercase(tokens))

# Compute the average word2vec vector for each sentence in a list of sentences
def average_word2vec_sentence(sentences, model, num_features, index2word_set):
    """Return one averaged word vector per sentence.

    Each sentence is cleaned with ``clean_text``; the vectors of the tokens
    found in ``index2word_set`` are summed and divided by the number of such
    tokens.

    Parameters:
        sentences: iterable of sentences (each is passed to ``clean_text``).
        model: mapping-like object where ``model[word]`` yields the word's
            vector of length ``num_features``.
        num_features: dimensionality of the word vectors.
        index2word_set: container of words for which ``model`` has a vector.

    Returns:
        list: one array of length ``num_features`` per input sentence, in
        input order. A sentence with no in-vocabulary token yields the
        all-zero float32 vector.
    """
    averaged = []
    for sent in sentences:
        feature_vector = np.zeros((num_features,), dtype='float32')
        n_words = 0
        for word in clean_text(sent):
            if word in index2word_set:
                n_words += 1
                feature_vector = np.add(feature_vector, model[word])
        # Divide once, after the whole sentence has been summed. Dividing
        # inside the loop (as before) rescales the running sum on every
        # token and does not produce the mean.
        if n_words > 0:
            feature_vector = np.divide(feature_vector, n_words)
        averaged.append(feature_vector)
    return averaged
# Compute the average word2vec vector for a single sentence
def average_word2vec_sentence1(sentence, model, num_features, index2word_set):
    """Return the averaged word vector of a single sentence.

    The sentence is cleaned with ``clean_text``; the vectors of the tokens
    found in ``index2word_set`` are summed and divided by the number of such
    tokens.

    Parameters:
        sentence: the sentence to embed (passed to ``clean_text``).
        model: mapping-like object where ``model[word]`` yields the word's
            vector of length ``num_features``.
        num_features: dimensionality of the word vectors.
        index2word_set: container of words for which ``model`` has a vector.

    Returns:
        An array of length ``num_features``; the all-zero float32 vector when
        no token of the sentence is in the vocabulary.
    """
    feature_vector = np.zeros((num_features,), dtype='float32')
    n_words = 0
    for word in clean_text(sentence):
        if word in index2word_set:
            n_words += 1
            feature_vector = np.add(feature_vector, model[word])
    # Divide once, after the whole sentence has been summed. Dividing inside
    # the loop (as before) rescales the running sum on every token and does
    # not produce the mean.
    if n_words > 0:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

#split the data into train and test
def split_data(data, *, test_size=0.5, random_state=42):
    """Split the ``Query`` / ``Label`` columns of ``data`` into train and test parts.

    Parameters:
        data: table-like object indexable by the column names ``'Query'``
            (inputs) and ``'Label'`` (targets).
        test_size: forwarded to ``train_test_split``; defaults to the
            original hard-coded 0.5.
        random_state: seed forwarded to ``train_test_split`` so the shuffle
            is reproducible; defaults to the original hard-coded 42.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data['Query'],
        data['Label'],
        random_state=random_state,
        test_size=test_size,
        shuffle=True,
    )
    return X_train, X_test, y_train, y_test

def Logistic_Regression(X_train_tvf ,X_test_tvf,y_train,y_test):
    """Grid-search a logistic-regression classifier and score it on the test set.

    The search covers the regularisation strength ``C`` and the penalty type
    (``'l1'`` / ``'l2'``) using 10-fold cross-validation on the training data.

    Parameters:
        X_train_tvf: training feature matrix.
        X_test_tvf: test feature matrix.
        y_train: training labels.
        y_test: test labels.

    Returns:
        tuple: ``(accuracy, optimizer)`` where ``accuracy`` is the accuracy of
        the best estimator on the test set and ``optimizer`` is the fitted
        ``GridSearchCV`` object.
    """
    # The grid includes penalty='l1', which scikit-learn's default 'lbfgs'
    # solver rejects, so every L1 candidate would fail to fit. 'saga' supports
    # both penalties and also handles multi-class targets. It usually needs
    # more than the default 100 iterations to converge, hence max_iter.
    estimator = LogisticRegression(solver='saga', max_iter=1000)
    param_grid = {
        'C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
        'penalty': ['l1', 'l2'],
    }
    optimizer = GridSearchCV(estimator, param_grid, cv=10)
    optimizer.fit(X_train_tvf, y_train)
    predict = optimizer.best_estimator_.predict(X_test_tvf)
    return metrics.accuracy_score(y_test, predict), optimizer

