# Deep Learning for NLP

In [None]:
import gensim
from gensim.models import Word2Vec
import numpy as np 
import nltk
import itertools
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy 
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
# Module-level helpers read by remove_stopwords():
# a Toktok tokenizer instance and NLTK's English stop-word list.
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')   


In [None]:
# Four sample documents taken from the internet. Each DocN is a one-element
# list holding the document text, so the lists can be concatenated below.

Doc1 = ["With the Union cabinet approving the amendments to the Motor Vehicles Act, 2016, those caught for drunken driving will have to have really deep pockets, as the fine payable in court has been enhanced to Rs 10,000 for first-time offenders." ] 
     
Doc2 = ["Natural language processing (NLP) is an area of computer science and artificial intelligence concerned with the interactions between computers and human (natural) languages, in particular how to program computers to process and analyze large amounts of natural language data."]

Doc3 = ["He points out that public transport is very good in Mumbai and New Delhi, where there is a good network of suburban and metro rail systems."]

Doc4 = ["But the man behind the wickets at the other end was watching just as keenly. With an affirmative nod from Dhoni, India captain Rohit Sharma promptly asked for a review. Sure enough, the ball would have clipped the top of middle and leg."]

# Put all the documents in one list (fin holds the four document strings)

fin= Doc1+Doc2+Doc3+Doc4


In [None]:
# Download link for the vector file loaded below:
#https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit

# Load the vectors from the binary word2vec-format file as gensim KeyedVectors.
# NOTE(review): the path starts with '/', i.e. the file is looked up at the
# filesystem root - adjust it to wherever the download was saved.

model = gensim.models.KeyedVectors.load_word2vec_format('/GoogleNews-vectors-negative300.bin', binary=True)

#Preprocessing 

def remove_stopwords(text, is_lower_case=False):
    """Strip special characters and English stop words from ``text``.

    Args:
        text: A string, or an iterable of strings that is concatenated
            without a separator before processing.
        is_lower_case: Set to True when ``text`` is already lower-cased, so
            tokens are compared with the stop-word list as they are.
            Otherwise each token is lower-cased for the comparison only.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # ``A-Z`` rather than ``A-z``: the latter range also covers the ASCII
    # characters between 'Z' and 'a' (brackets, backslash, caret, underscore,
    # backtick), so those symbols were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', ''.join(text))
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    # Set membership instead of scanning the stop-word list for every token.
    stop_lookup = set(stopword_list)
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stop_lookup]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stop_lookup]
    return ' '.join(filtered_tokens)

# Function to get the embedding vector for n dimension, we have used "300"

def get_embedding(word, vectors=None):
    """Return the embedding of ``word``, or a zero vector if it is unknown.

    Args:
        word: Token to look up.
        vectors: Optional mapping-like object supporting ``word in vectors``
            and ``vectors[word]``. Defaults to the module-level ``model``.

    Returns:
        The stored vector for ``word``; otherwise ``np.zeros(dim)`` where
        ``dim`` is ``vectors.vector_size`` when available, else 300.
    """
    if vectors is None:
        vectors = model
    # Look the word up on the vectors object itself (the old ``.wv.vocab``
    # attribute chain is not available in current gensim) and index with
    # ``word`` - the original indexed with an unrelated name ``x``.
    if word in vectors:
        return vectors[word]
    return np.zeros(getattr(vectors, "vector_size", 300))


In [None]:
# Getting average vector for each document
# out_dict maps each document string to the mean of the embeddings of its
# stop-word-filtered tokens.
out_dict =  {}
for sen in fin:
    token_vectors = [get_embedding(x) for x in nltk.word_tokenize(remove_stopwords(sen))]
    average_vector = np.mean(np.array(token_vectors), axis=0)
    # Assign the entry directly; the previous temporary was named ``dict``
    # and replaced the builtin for the rest of the session.
    out_dict[sen] = average_vector

# Function to calculate the similarity between the query vector and document vector

def get_sim(query_embedding, average_vector_doc):
    """Return the cosine similarity of the two vectors as a one-element list."""
    cosine_distance = scipy.spatial.distance.cosine(query_embedding, average_vector_doc)
    similarity = 1 - cosine_distance
    return [similarity]

# Rank all the documents based on the similarity to get relevant docs

def Ranked_documents(query):
    """Rank the documents in ``out_dict`` by similarity to ``query``.

    The query is lower-cased, tokenized and averaged into a single vector,
    which is compared with every stored document vector via get_sim().

    Returns:
        A list of ``(document, [similarity])`` tuples, most similar first.
    """
    query_tokens = nltk.word_tokenize(query.lower())
    token_vectors = np.array([get_embedding(tok) for tok in query_tokens], dtype=float)
    query_words = np.mean(token_vectors, axis=0)
    rank = [(doc, get_sim(query_words, doc_vector)) for doc, doc_vector in out_dict.items()]
    rank.sort(key=lambda pair: pair[1], reverse=True)
    print('Ranked Documents :')
    return rank


In [None]:
# Call the IR function with a query; the result is the list of
# (document, [similarity]) tuples, most similar document first.

Ranked_documents("cricket")


In [None]:
# Take one more example, this time with the query "driving".

Ranked_documents("driving")

In [None]:
#read file
# pandas is imported here as well: in file order this cell comes before the
# cell that imports it.
import pandas as pd

file_content = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

#check sample content in the email
file_content['v2'][1]

#Import library
from nltk.corpus import stopwords
from nltk import *
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Remove stop words
stop = stopwords.words('english')
# Compare lower-cased words against a set: the text is only lower-cased
# further down, so a case-sensitive test kept capitalised stop words
# such as "The" or "I".
stop_lookup = set(stop)
file_content['v2'] = file_content['v2'].apply(
    lambda x: " ".join(w for w in x.split() if w.lower() not in stop_lookup))

# Keep only the label column (v1) and the message text column (v2)
Email_Data = file_content[['v1', 'v2']]

# Rename column names
Email_Data = Email_Data.rename(columns={"v1":"Target", "v2":"Email"})
Email_Data.head()

#Delete punctuations, convert text in lower case and delete the double space 

Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub('[!@#$:).;,?&]', '', x.lower()))
# Collapse runs of spaces into one; the previous pattern replaced a single
# space with a single space and therefore did nothing.
Email_Data['Email'] = Email_Data['Email'].apply(lambda x: re.sub(' +', ' ', x))
Email_Data['Email'].head(5)


In [None]:
#Separating text(input) and target classes

# Message texts as an array; missing values are replaced by the "_na_" marker
list_sentences_rawdata = Email_Data["Email"].fillna("_na_").values
list_classes = ["Target"]
# Labels as a 2-D array (one column per entry of list_classes)
target = Email_Data[list_classes].values


# Frame that is split into train and test sets in the next cell
To_Process=Email_Data[['Email', 'Target']]


In [None]:
#Train and test split with 80:20 ratio
# Imported here as well: in file order this cell comes before the cell that
# holds the Keras imports.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

train, test = train_test_split(To_Process, test_size=0.2)

# Define the sequence lengths, max number of words and embedding dimensions
# Sequence length of each sentence. If more, truncate. If less, pad with zeros

MAX_SEQUENCE_LENGTH = 300

# Top 20000 frequently occurring words
MAX_NB_WORDS = 20000

# Fit the vocabulary on the training texts only, then encode both splits.
# NOTE: this rebinds the global ``tokenizer`` (previously the ToktokTokenizer
# that remove_stopwords() reads).
# (The original line carried a stray leading space -> IndentationError.)
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train.Email)
train_sequences = tokenizer.texts_to_sequences(train.Email)
test_sequences = tokenizer.texts_to_sequences(test.Email)

# dictionary containing words and their index
word_index = tokenizer.word_index
# print(tokenizer.word_index)
# total words in the corpus
print('Found %s unique tokens.' % len(word_index))

# pad / truncate every training sequence to MAX_SEQUENCE_LENGTH
train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)

# pad / truncate every test sequence to MAX_SEQUENCE_LENGTH
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print(train_data.shape)
print(test_data.shape)


In [None]:
# Label columns of the two splits
train_labels = train['Target']
test_labels = test['Target']

#import library

from sklearn.preprocessing import LabelEncoder
# converts the character array to numeric array. Assigns levels to unique labels.

# The encoder is fitted on the training labels only and then applied to both splits
le = LabelEncoder() 
le.fit(train_labels)
train_labels = le.transform(train_labels)
test_labels = le.transform(test_labels)

# Show the learned classes and the per-class counts in each split
print(le.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))


In [None]:
# changing data types
# Imported here as well: in file order this cell comes before the cell that
# holds the Keras imports.
from keras.utils import to_categorical

# One-hot encode the integer labels. The class count is fixed from the fitted
# encoder so both splits get the same number of columns even if one class is
# absent from a split.
num_classes = len(le.classes_)
labels_train = to_categorical(np.asarray(train_labels), num_classes=num_classes)
labels_test = to_categorical(np.asarray(test_labels), num_classes=num_classes)
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)

# Size of the embedding vectors learned by the models below
EMBEDDING_DIM = 100
print(MAX_SEQUENCE_LENGTH)


In [None]:
# Import Libraries 
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from keras.models import Model
from keras.models import Sequential
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Sequential



In [None]:
print('Training CNN 1D model.')

# NOTE: this rebinds the global ``model`` (previously the word2vec vectors
# that get_embedding() reads).
# Stack: embedding -> 2 x (Conv1D(128, 5) + MaxPooling1D(5) + Dropout +
# BatchNormalization) -> Flatten -> Dense(128) -> 2-way softmax.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
model.add(Dropout(0.5))
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Conv1D(128, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(2, activation='softmax'))


# Labels are one-hot (labels_train), hence categorical cross-entropy
model.compile(loss='categorical_crossentropy',
 optimizer='rmsprop',
 metrics=['acc'])

# The test split is passed as validation data and reported after every epoch
model.fit(train_data, labels_train,
 batch_size=64,
 epochs=5,
 validation_data=(test_data, labels_test))


In [None]:
#predictions on test data

# Output of the softmax layer for every row of test_data
predicted=model.predict(test_data)
predicted

#model evaluation

import sklearn
from sklearn.metrics import precision_recall_fscore_support as score

# .round() turns the predicted values into 0/1 so they can be compared with
# the one-hot labels_test
precision, recall, fscore, support = score(labels_test, predicted.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

print(sklearn.metrics.classification_report(labels_test, predicted.round()))


In [None]:
#import library
# Import from the public ``keras.layers`` namespace (as the main import cell
# does); the private ``keras.layers.recurrent`` module does not exist in
# current Keras releases.
from keras.layers import SimpleRNN

#model training

print('Training SIMPLERNN model.')

# Stack: embedding -> SimpleRNN with 2 units -> 2-way softmax
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
# No input_shape here: the layer takes its input from the Embedding above,
# and the former value (None, 1) did not describe that input.
model.add(SimpleRNN(2))

model.add(Dense(2,activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

# The test split is passed as validation data and reported after every epoch
model.fit(train_data, labels_train,
 batch_size=16,
 epochs=5,
 validation_data=(test_data, labels_test))


In [None]:
# prediction on test data
# Output of the softmax layer of the SimpleRNN model for every row of test_data
predicted_Srnn=model.predict(test_data)
predicted_Srnn

#model evaluation

from sklearn.metrics import precision_recall_fscore_support as score

# .round() turns the predicted values into 0/1 so they can be compared with
# the one-hot labels_test
precision, recall, fscore, support = score(labels_test, predicted_Srnn.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

# NOTE: relies on ``import sklearn`` from the CNN evaluation cell
print(sklearn.metrics.classification_report(labels_test, predicted_Srnn.round()))


In [None]:
#model training

print('Training LSTM model.')

# Stack: embedding -> LSTM(16) returning the full sequence -> Dropout ->
# BatchNormalization -> Flatten -> 2-way softmax
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
# ``units`` / ``recurrent_activation`` are the current names of the Keras-1
# arguments ``output_dim`` / ``inner_activation`` used before.
model.add(LSTM(units=16, activation='relu', recurrent_activation='hard_sigmoid', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

# return_sequences=True yields one output per time step; flatten them for the dense layer
model.add(Flatten())

model.add(Dense(2,activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

# The test split is passed as validation data and reported after every epoch
model.fit(train_data, labels_train,
 batch_size=16,
 epochs=5,
 validation_data=(test_data, labels_test))


In [None]:
# prediction on test data
# Output of the softmax layer of the LSTM model for every row of test_data
predicted_lstm=model.predict(test_data)
predicted_lstm

#model evaluation 

from sklearn.metrics import precision_recall_fscore_support as score

# .round() turns the predicted values into 0/1 so they can be compared with
# the one-hot labels_test
precision, recall, fscore, support = score(labels_test, predicted_lstm.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

# NOTE: relies on ``import sklearn`` from the CNN evaluation cell
print(sklearn.metrics.classification_report(labels_test, predicted_lstm.round()))


In [None]:
#model training

print('Training Bidirectional LSTM model.')

# Stack: embedding -> bidirectional LSTM(16) returning the full sequence ->
# Conv1D(16, kernel 3) -> global max pooling -> Dense(50) -> Dropout ->
# 2-way softmax
model = Sequential()
model.add(Embedding(MAX_NB_WORDS,
 EMBEDDING_DIM,
 input_length=MAX_SEQUENCE_LENGTH
 ))
model.add(Bidirectional(LSTM(16, return_sequences=True, dropout=0.1, recurrent_dropout=0.1)))
model.add(Conv1D(16, kernel_size = 3, padding = "valid", kernel_initializer = "glorot_uniform"))
model.add(GlobalMaxPool1D())
model.add(Dense(50, activation="relu"))
model.add(Dropout(0.1))

model.add(Dense(2,activation='softmax'))

model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

# The test split is passed as validation data and reported after every epoch
model.fit(train_data, labels_train,
 batch_size=16,
 epochs=3,
 validation_data=(test_data, labels_test))


In [None]:
# prediction on test data

# Output of the softmax layer of the bidirectional LSTM model for every row of test_data
predicted_blstm=model.predict(test_data)
predicted_blstm

#model evaluation

from sklearn.metrics import precision_recall_fscore_support as score

# .round() turns the predicted values into 0/1 so they can be compared with
# the one-hot labels_test
precision, recall, fscore, support = score(labels_test, predicted_blstm.round())

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))

print("############################")

# NOTE: relies on ``import sklearn`` from the CNN evaluation cell
print(sklearn.metrics.classification_report(labels_test, predicted_blstm.round()))


In [None]:
# Re-read the raw file, discarding the cleaning applied to file_content earlier
file_content = pd.read_csv('spam.csv', encoding = "ISO-8859-1")

# Select only the email text column (v2) and convert it into a list.
# Selecting with a list of columns keeps a DataFrame, so tolist() yields a
# list of one-element lists; it is flattened in a later cell.
Email_Data = file_content[[ 'v2']]

list_data = Email_Data.values.tolist()
list_data 


In [None]:
import numpy as np
import random
import pandas as pd
import sys
import os
import time
import codecs
import collections
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from nltk.tokenize import sent_tokenize, word_tokenize
import scipy 
from scipy import spatial
from nltk.tokenize.toktok import ToktokTokenizer
import re
tokenizer = ToktokTokenizer()


In [None]:
#Converting list to string
# The abstract base classes live in ``collections.abc``; importing Iterable
# from ``collections`` itself fails on Python 3.10+.
from collections.abc import Iterable


def flatten(items):
    """Yield the leaf items of an arbitrarily nested iterable, depth-first.

    Strings and bytes are iterable but are treated as leaves, so they are
    yielded whole instead of character by character.
    """
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x


TextData = list(flatten(list_data))
# Join with a space: concatenating with '' fused the last word of one email
# with the first word of the next into a single bogus token.
TextData = ' '.join(TextData)

# Replace line breaks (with a space, for the same reason) and convert to lower case
TextData = TextData.replace('\n', ' ')
TextData = TextData.lower()

# ``A-Z`` rather than ``A-z``: the latter range also covers the ASCII
# characters between 'Z' and 'a' (brackets, backslash, caret, underscore,
# backtick), so those symbols were never removed.
pattern = r'[^a-zA-Z0-9\s]'
TextData = re.sub(pattern, '', TextData)

# Tokenizing

tokens = tokenizer.tokenize(TextData)
tokens = [token.strip() for token in tokens]

# get the distinct words and sort it

word_counts = collections.Counter(tokens)
word_c = len(word_counts)   # number of distinct words
print(word_c)

distinct_words = [x[0] for x in word_counts.most_common()]
distinct_words_sorted = list(sorted(distinct_words))


# Generate indexing for all words (word -> position in the sorted vocabulary)

word_index = {x: i for i, x in enumerate(distinct_words_sorted)}


# decide on sentence length (number of input words per training sample)

sentence_length = 25


In [None]:
#prepare the dataset of input to output pairs encoded as integers
# Generate the data for the model

#input = the input sentence to the model with index 
#output = output of the model with index

InputData = []
OutputData = []

# Slide a window of ``sentence_length`` words over the whole token stream.
# The bound is len(tokens): ``word_c`` is the number of DISTINCT words, so
# using it only covered the first part of the corpus.
for i in range(len(tokens) - sentence_length):
    X = tokens[i:i + sentence_length]      # the input words of this sample
    Y = tokens[i + sentence_length]        # the word that follows them
    InputData.append([word_index[word] for word in X])
    OutputData.append(word_index[Y])

print (InputData[:1])
print ("\n")
print(OutputData[:1])


In [None]:
# Generate X: reshape the index sequences to (samples, sentence_length, 1),
# the 3-D layout passed to the LSTM as input_shape below
X = numpy.reshape(InputData, (len(InputData), sentence_length, 1))


# One hot encode the output variable
Y = np_utils.to_categorical(OutputData) 

Y


In [None]:
# define the LSTM model: LSTM(256) -> Dropout -> softmax over the
# Y.shape[1] columns of the one-hot target
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(Y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

 
#define the checkpoint
# The file name embeds epoch and loss; with save_best_only=True and
# monitor='loss'/mode='min' a file is written only when the training loss improves
file_name_path="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(file_name_path, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks = [checkpoint] 

#fit the model
model.fit(X, Y, epochs=5, batch_size=128, callbacks=callbacks) 


In [None]:
# load the network weights
# NOTE(review): the file name is hard-coded and contains the epoch number and
# loss value of one particular training run - replace it with the checkpoint
# file actually written by the previous cell.
file_name = "weights-improvement-05-6.8213.hdf5"
model.load_weights(file_name)
model.compile(loss='categorical_crossentropy', optimizer='adam') 


In [None]:
# Pick a random training sequence as the seed sentence
start = numpy.random.randint(0, len(InputData))
input_sent = InputData[start]

# Generate index of the next word of the email 

# Reshape the single sequence to (1, sequence length, 1) for the model.
# NOTE: this rebinds X, which previously held the whole training input.
X = numpy.reshape(input_sent, (1, len(input_sent), 1))
predict_word = model.predict(X, verbose=0)
# Position of the highest predicted value = index of the predicted word
index = numpy.argmax(predict_word)

print(input_sent)
print ("\n")
print(index)


In [None]:
# Convert these indexes back to words

# Invert the word -> index vocabulary. The indexes in input_sent and the
# predicted index are vocabulary positions taken from word_index, so the
# reverse map must be built from word_index - enumerating ``tokens`` (the
# running text) maps them to unrelated words.
word_index_rev = {idx: word for word, idx in word_index.items()}
result = word_index_rev[index]
sent_in = [word_index_rev[value] for value in input_sent]

print(sent_in)
print ("\n")
print(result)


Recipe 6-4. Stackoverflow question recommendation

Raw Data set link: 
https://www.kaggle.com/c/predict-closed-questions-on-stack-overflow/data?select=train-sample.csv


In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import pickle
import time

import re
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import ToktokTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

from string import punctuation
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, hstack,csr_matrix


#importing training data
df=pd.read_csv('train-sample.csv')

#counting all null values

df.isnull().sum()

#Replacing missing tags with an empty string
# fillna() avoids the ``np.NaN`` alias, which was removed in NumPy 2.0.

for tag_col in ['Tag1', 'Tag2', 'Tag3', 'Tag4', 'Tag5']:
    df[tag_col] = df[tag_col].fillna('')

#converting column type into string 

df['Title']=df['Title'].astype(str)
df['BodyMarkdown']=df['BodyMarkdown'].astype(str)

#checking top 10 most common words from the Body column
# (words are whitespace-separated chunks of the raw, not yet cleaned text)

from collections import Counter
cnt = Counter(word for text in df["BodyMarkdown"].values for word in text.split())

cnt.most_common(10)  #top 10 common words


In [None]:
#tokenizer instance used to split text into word tokens
token=ToktokTokenizer()

# English stop words, loaded on first use and reused afterwards: stopWords()
# is applied to every row, and re-reading the list on each call is wasted work.
_STOP_WORDS = None


#stop words removing function 
def stopWords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with the module-level Toktok tokenizer; tokens
    found in NLTK's English stop-word list are dropped (case-sensitive
    comparison) and the remaining tokens are joined by single spaces.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))

    words = token.tokenize(text)
    filtered = [w for w in words if w not in _STOP_WORDS]
    return ' '.join(map(str, filtered))

#function to remove punctuations
def remove_punctuations(text):
    """Return ``text`` with every character of the punctuation list deleted."""
    punct = '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'   #list of punctuation marks
    # A translation table mapping each listed character to None deletes
    # them all in one pass.
    deletion_table = str.maketrans('', '', punct)
    return text.translate(deletion_table)

#function to remove frequent words but they were mostly from stopwords 

# The ten most common words of the raw body text, as counted in ``cnt``
FREQWORDS = {w for w, _ in cnt.most_common(10)}
def remove_freqwords(text):
    """Return ``text`` without the words contained in FREQWORDS.

    The input is converted to ``str`` and split on whitespace; the kept
    words are joined by single spaces.
    """
    kept = [word for word in str(text).split() if word not in FREQWORDS]
    return " ".join(kept)

    
#cleaning the text 

# (pattern, replacement) pairs applied in order by clean_text(). Specific
# forms come before the generic ones that would otherwise consume them:
# "can't" before "n't", and "'scuse" before "'s".
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"can't", "can not "),
    (r"\'scuse", " excuse "),
    (r"i'm", "i am "),
    (r"n't", " not "),
    (r"\'re", " are "),
    (r"\'ve", " have "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'s", " "),
    (r"\'\n", " "),
)


def clean_text(text):
    """Lower-case ``text``, expand common English contractions and
    normalise whitespace.

    Returns:
        The cleaned string with every run of whitespace collapsed to one
        space and leading/trailing spaces removed.
    """
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    # Collapse whitespace last, because the replacements above insert spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

#URL removing function
def remove_urls(text):
    """Delete http(s):// links and www. addresses from ``text``.

    A match runs up to the next whitespace character; it is replaced by an
    empty string, so surrounding spaces are left as they are.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

#function to remove html tags (each tag is replaced by an empty string)
def remove_html(text):
    """Delete everything from a '<' up to the nearest following '>'."""
    return re.sub('<.*?>', r'', text)


#Applying all preprocessing steps defined above on the Body column, in order
# NOTE(review): remove_punctuations runs before remove_urls / remove_html and
# strips ':', '/', '.', '<' and '>', so the URL and HTML patterns can
# presumably no longer match. The order is left as is because the query
# preprocessing must apply exactly the same steps - change both together.

for step in (clean_text, remove_punctuations, remove_urls, remove_html, stopWords):
    df['BodyMarkdown'] = df['BodyMarkdown'].apply(step)


In [None]:
# Create the TF-IDF vectorizer with its default settings

tfidf_vectorizer = TfidfVectorizer()

# Fit the vectorizer on the cleaned Body column and transform it in one step;
# the fitted tfidf_vectorizer is reused later to transform user queries

tfidf_matrix2 = tfidf_vectorizer.fit_transform(df['BodyMarkdown'])


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

In [None]:
#creating sample data set with 100 rows for testing.
# NOTE(review): to run on the whole dataset assign dfg = df instead; simply
# commenting the line out would leave dfg undefined for the code below.

dfg=df.iloc[0:100,:]

# load the glove model 
# The file is read as a space-separated table without header; the first
# column (the word) becomes the index and the remaining columns hold the
# vector. QUOTE_NONE makes quote characters be read as ordinary text.

glove_model = pd.read_table("glove.6B.100d.txt", sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)


# getting mean vector for each sentence

def get_mean_vector(glove_model, words):
    """Return the mean GloVe vector of the in-vocabulary words of ``words``.

    Args:
        glove_model: DataFrame indexed by word, one embedding per row.
        words: Text to embed; it is split with ``word_tokenize``.

    Returns:
        A 1-D array with one entry per embedding column. If no token is in
        the vocabulary, a zero vector of the same length is returned.
    """
    # Test membership on the index itself: building ``list(index)`` for every
    # word copied the whole vocabulary and scanned it linearly each time.
    vocabulary = glove_model.index
    known_words = [word for word in word_tokenize(words) if word in vocabulary]
    if known_words:
        return np.mean(glove_model.loc[known_words].values, axis=0)
    # Size the fallback from the model instead of hard-coding 100
    return np.zeros(glove_model.shape[1])


#Mean GloVe vector of every question body in the sample, as plain lists

glove_embeddings = [list(get_mean_vector(glove_model, body)) for body in dfg.BodyMarkdown]

# Store the vectors with one column per document.
# (The original passed an undefined name ``K1`` here.)
glove_embeddings_t = pd.DataFrame(glove_embeddings).transpose()
glove_embeddings_t.to_csv('glove-vec.csv')

#Loading the vectors of each question body back from disk

K = pd.read_csv('glove-vec.csv')
glove_embeddings_loaded = []

#transforming data frame into a required array-like structure as we did in the above step
# Column "i" of the file holds the vector of document i

for i in range(dfg.shape[0]):
    glove_embeddings_loaded.append(K[str(i)].values)
glove_embeddings_loaded = np.asarray(glove_embeddings_loaded)


In [None]:
#GPT

!pip install pytorch_pretrained_bert

# importing necessary libraries for GPT

import torch
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel
 
# Load the pre-trained 'openai-gpt' tokenizer and model.
# NOTE: this rebinds the globals ``tokenizer`` and ``model`` used by earlier cells.
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel.from_pretrained('openai-gpt')
# Switch the model to evaluation mode
model.eval()
print('Model Loaded')


#function to get the averaged embedding of a sentence
def returnEmbedding(pSentence):
    """Return a (1, 768) array: the sum of the per-word embeddings of
    ``pSentence`` divided by the number of space-separated words.

    Each word is split into sub-word tokens, run through the model, and the
    model output is averaged over dimension 1 before being added to the sum.
    """
    words = pSentence.split(' ')
    summed = np.zeros((1, 768))
    for word in words:
        token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word))
        ids_tensor = torch.tensor([token_ids])

        with torch.no_grad():
            try:
                summed += np.array(torch.mean(model(ids_tensor), 1))
            except Exception as ex:
                # NOTE(review): any failure for a word is silently skipped,
                # yet the word still counts in the divisor below.
                continue
    summed /= len(words)
    return summed


# GPT embeddings are computed for the first 100 rows only (same sample as the
# GloVe and BERT cells). ``df_gpt`` was used below without ever being defined.
# Assign df_gpt = df to run on the whole dataset.
df_gpt = df.iloc[0:100, :]

# Initialize Matrix with number of dataset records as rows and 768 columns as embedding dimension
X = np.zeros((df_gpt.shape[0], 768))

# Generate sentence level embedding by calculating average of all word embedding
# (enumerate gives the row position directly and avoids a loop variable named
# ``iter``, which shadowed the builtin)
for row_idx, text in enumerate(df_gpt['BodyMarkdown']):
    X[row_idx] = returnEmbedding(text)

embeddings_GPT = X


In [None]:
# BERT

# Install BERT sentence transformer for sentence encoding
!pip install sentence-transformers

#running on 100 rows only for testing. Assign df_bert = df to use the whole dataset
df_bert=df.iloc[0:100,:]


#importing bert-base model

from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

#embedding on Body column
# Encode the 100-row sample prepared above (the original encoded the whole
# ``df`` and never used df_bert), passed as a plain list of strings.
sentence_embeddings = sbert_model.encode(df_bert['BodyMarkdown'].tolist())
print('Sample BERT embedding vector - length', len(sentence_embeddings[0]))


In [None]:
#defining function to derive cosine similarity
from numpy import dot
from numpy.linalg import norm
def cos_sim(a,b):
    """Return the cosine similarity of ``a`` and ``b``: their dot product
    divided by the product of their norms."""
    numerator = dot(a, b)
    denominator = norm(a) * norm(b)
    return numerator / denominator



#Function which returns Top N similar sentence from data frame directly

def top_n(user, p, df, n=5):
    """Print the ``n`` highest similarities and return the matching rows.

    Args:
        user: Query embedding with a single row (2-D).
        p: Matrix of document embeddings, one row per document, in the
            same row order as ``df``.
        df: Data frame the documents came from.
        n: Number of results to return (default 5).

    Returns:
        The columns at positions 6 and 7 of the ``n`` most similar rows of
        ``df``, most similar first.
    """
    # Similarity of the query to every document
    scores = cosine_similarity(user, p)[0]

    # Sort positions, not values: looking each top value up again with
    # list.index() returned the same (first) row for every tied score.
    # A stable sort keeps tied documents in their original order.
    top_idx = np.argsort(-scores, kind='stable')[:n]

    print(scores[top_idx].tolist())

    return df.iloc[top_idx.tolist(), [6, 7]]
    
    

In [None]:
#function to pre-process and extract embeddings for the user input text 

def user_transform(query,model):
    """Clean ``query`` like the corpus and embed it with the chosen model.

    Args:
        query: Raw user question.
        model: One of 'TFIDF', 'BERT', 'glove_model' or 'GPT'.

    Returns:
        The query embedding with a single row, ready to be compared with
        the document matrix of the same model.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    supported = ('TFIDF', 'BERT', 'glove_model', 'GPT')
    if model not in supported:
        # Previously an unknown name fell through every branch and failed
        # with an UnboundLocalError on ``k``.
        raise ValueError(
            "unknown model %r; expected one of %s" % (model, ", ".join(supported)))

    # Same steps, in the same order, as applied to the BodyMarkdown column
    query = clean_text(query)
    query = remove_punctuations(query)
    query = remove_urls(query)
    query = remove_html(query)
    query = stopWords(query)
    print(query)

    if model == 'TFIDF':
        k = tfidf_vectorizer.transform([str(query)])
    elif model == 'BERT':
        # Encode a one-element list so the result has one row (2-D); a bare
        # string yields a 1-D vector.
        k = sbert_model.encode([str(query)])
    elif model == 'glove_model':
        k = get_mean_vector(glove_model, query).reshape(1, -1)
    else:  # 'GPT'
        k = returnEmbedding(query)

    return k


# Display the full text of each column. None means "no limit"; the former
# value -1 is no longer accepted by current pandas. (The original line also
# carried a stray leading space -> IndentationError.)
pd.set_option("display.max_colwidth", None)


In [None]:
# Getting top 5 similar questions using TF-IDF
# (the query vector is no longer stored under ``input``, which shadowed the builtin)
query_vector=user_transform('do we have any other Q&A platform like stackoverflow which is free source?','TFIDF')

top_n(query_vector,tfidf_matrix2,df)

In [None]:
# Getting top 5 similar questions using Glove model 
# (the query vector is no longer stored under ``input``, which shadowed the builtin)

query_vector=user_transform('do we have any other Q&A platform like stackoverflow which is free source?','glove_model')   #query

top_n(query_vector,glove_embeddings_loaded,df)


In [None]:

#similar questions from GPT (from 100 rows)
# (the query vector is no longer stored under ``input``, which shadowed the builtin)

query_vector=user_transform('do we have any other Q&A platform like stackoverflow which is free source?','GPT')   #query

top_n(query_vector,embeddings_GPT,df)


In [None]:
#similar questions from BERT 
# (the query vector is no longer stored under ``input``, which shadowed the builtin)

query_vector=user_transform('do we have any other Q&A platform like stackoverflow which is free source?','BERT')   #query

top_n(query_vector,sentence_embeddings,df)
