In [6]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string
import gensim
from gensim.models import word2vec
from scipy import spatial

# Porter stemmer instance; clean_text() uses it to reduce each token to its stem.
ps = nltk.PorterStemmer()
# NOTE(review): hard-coded absolute path to a local directory; change it (or make it
# configurable) before running this notebook on another machine.
path = '/home/dipanjana/Documents/Dipanjana_Git_Repository/'

# Load the training data; missing values are replaced with empty strings.
train = pd.read_csv(path+'training_data.csv').fillna("")

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,defect_description
0,1,What is the story of Kohinoor (Koh-i-Noor) Dia...
1,2,How can I increase the speed of my internet co...
2,3,Why am I mentally very lonely? How can I solve...
3,4,"Which one dissolve in water quikly sugar, salt..."
4,5,Astrology: I am a Capricorn Sun Cap moon and c...


In [9]:
#Data Inspection: report the dataset's dimensions, per-column null counts and dtype summary.
#Missing values were already replaced with "" when the CSV was loaded, so the null
#counts printed here are expected to be zero.

print('shape of training dataset : ',train.shape)
print('Checking for null value in training data set:')
print(train.isnull().sum())
train.info()

shape of training dataset :  (49998, 2)
Checking for null value in training data set:
id                    0
defect_description    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49998 entries, 0 to 49997
Data columns (total 2 columns):
id                    49998 non-null int64
defect_description    49998 non-null object
dtypes: int64(1), object(1)
memory usage: 781.3+ KB


In [12]:
#column holding the defect description text
train_raw = train['defect_description']
#stop words: NLTK's English stop-word list, removed from the token stream by clean_text()
stop_words = stopwords.words('english')
#special characters: the punctuation characters stripped from the text by clean_text()
spl_chr = string.punctuation

In [46]:
#cleaning, stemming and tokenising
def clean_text(text):
    """Clean, tokenise and stem one piece of text.

    Steps: drop every character found in ``spl_chr``, lower-case the rest,
    split on whitespace, discard tokens present in ``stop_words`` and
    Porter-stem the remaining tokens with ``ps``.

    Parameters
    ----------
    text : str
        Raw text (e.g. one defect description).

    Returns
    -------
    list of str
        Stemmed tokens in their original order; empty when nothing survives.
    """
    no_spl_chr = "".join(char.lower() for char in text if char not in spl_chr)
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings, so words separated by a tab or
    # a line break are no longer fused into a single token.
    tokens = no_spl_chr.split()
    # NOTE(review): punctuation is stripped before this check, so any stop word
    # that contains an apostrophe (e.g. "don't") can never match here - confirm
    # whether that matters for the stop-word list in use.
    return [ps.stem(word) for word in tokens if word not in stop_words]

# Tokenised, cleaned and stemmed version of every defect description.
train_clean = train_raw.apply(clean_text)

In [None]:
'''
We use the word2vec model from the gensim package, which lets us calculate the similarity between two words.
To compare documents, we calculate the average vector of all the vectorised words in every sentence/document
and use the cosine similarity between these average vectors to determine document similarity.
'''

In [17]:
#Creation of the word2vec model, trained on the tokenised and cleansed training dataset.
#NOTE(review): no seed is passed, so the learned vectors may differ from run to run.
model = word2vec.Word2Vec(train_clean, size=100, window=20, min_count=1, workers=4)

#Explanation of the parameters used in the constructor:

#size: (default 100) The number of dimensions of the embedding, i.e. the length of the dense vector
#used to represent each token (word). NOTE: gensim 4.x renamed this keyword to `vector_size`.
#window: (default 5) The maximum distance between a target word and words around the target word.
#min_count: (default 5) The minimum count of words to consider when training the model; words with
#an occurrence less than this count will be ignored. min_count=1 keeps every token.
#workers: (default 3) The number of threads to use while training.

#vector representation of one of the words from the training dataset:
model.wv['kohinoor']

array([-1.2979914e-02,  3.6535244e-03, -2.6734071e-02, -4.3921307e-02,
       -2.0588426e-02,  2.0644244e-02, -4.2259563e-03,  1.4028470e-02,
        7.1124262e-03, -3.8279958e-02,  3.9743118e-02, -2.6937695e-02,
       -3.3586498e-02,  5.3166691e-03,  2.3543868e-02, -1.4319889e-02,
        4.9857190e-03,  3.7123801e-03, -7.9219015e-03,  1.0900964e-02,
       -9.1656521e-03, -2.5135420e-02,  3.3531435e-02,  6.6316091e-02,
       -1.6770314e-02, -8.8263005e-03,  3.5897262e-02, -1.2258850e-02,
        8.0916481e-03,  2.3369579e-02, -4.0296111e-02, -2.5278654e-02,
       -4.2867428e-03, -8.9658480e-03, -1.0313519e-02, -1.4585562e-02,
        2.0847188e-02, -2.4467865e-02, -1.8593414e-02, -2.5726909e-02,
        1.6120546e-03, -3.1268984e-02, -1.6398558e-02,  7.0673353e-03,
        2.0153495e-02, -2.3621753e-02,  4.0305022e-06, -4.5272581e-02,
       -1.5752541e-02,  6.4998108e-04, -4.3788143e-03, -8.8469619e-03,
        3.4244299e-02,  7.9168864e-03,  4.8053269e-03, -2.0750981e-02,
      

In [16]:
#index2word (list of str) – Words which correspond to the matrix.
#Stored as a set so that vocabulary membership checks are fast.
#Must be run after the cell that trains `model`.
index2word_set = set(model.wv.index2word)


In [22]:
#Calculate the average vector for all vectorised words in every sentence(defect description)

def avg_feature_vector(defect_description):
    """Average the word2vec vectors of the in-vocabulary tokens of one description.

    Parameters
    ----------
    defect_description : iterable of str
        Tokens of a single description, as produced by ``clean_text``.

    Returns
    -------
    numpy.ndarray
        float32 vector whose length is the model's embedding size. It is all
        zeros when none of the tokens is in ``index2word_set``.
    """
    word_vectors = model.wv
    # Look vectors up through model.wv: indexing the model object directly
    # (model[word]) is deprecated in gensim 3.x and removed in gensim 4.
    known_vectors = [word_vectors[word] for word in defect_description
                     if word in index2word_set]

    if not known_vectors:
        # Take the dimensionality from the model instead of hard-coding 100, so
        # this stays correct if the embedding size used for training changes.
        return np.zeros((word_vectors.vector_size, ), dtype='float32')

    return np.mean(known_vectors, axis=0, dtype='float32')

In [25]:
# Sanity check: show the average vector of the cleaned description at index label 1.
avg_feature_vector(train_clean[1])

  # Remove the CWD from sys.path while we load stuff.


array([-0.32169887, -0.42934778, -0.21104205, -0.94115084,  0.2524186 ,
        0.19168007, -0.39490113,  0.13663305,  0.18983187, -0.90659934,
        0.58220816, -0.36028877, -0.7388689 ,  0.25014699,  0.31152955,
       -0.59569365, -0.4889638 ,  0.2954696 , -0.15302956,  0.55329806,
        0.15990186,  0.08576676,  0.355172  ,  1.3518986 ,  0.12727593,
       -0.1271322 ,  0.41205275, -0.08829061,  0.06324824,  0.47234502,
       -0.97016877, -0.29790625, -0.02492273, -0.2606108 ,  0.14869156,
       -0.22109549,  0.18118125, -0.71295744, -0.49990085, -0.27533206,
       -0.4603301 , -0.17204958, -0.12790307,  0.18196237,  0.66885024,
       -0.59423625, -0.05547282, -0.5713847 , -0.05537507,  0.24995409,
       -0.4522774 , -0.09272481,  0.10080782,  0.46552718, -0.27209938,
       -0.34714973, -0.52180874, -0.28591922, -0.09429691, -0.1925682 ,
        0.0106052 , -0.24283779, -0.01261541,  0.00217813,  0.00761269,
        0.7614805 ,  0.23104717, -0.17226474,  0.01187406, -0.03

In [26]:
#One averaged word2vec vector per training record, built from the cleaned token lists

vectorised_description_list = train_clean.apply(avg_feature_vector)


  # Remove the CWD from sys.path while we load stuff.


In [70]:
#Function to calculate similarity between existing defects and the new defect and 
#return top 5 similarity values with corresponding defect id(defect id = train['id'])

def similar_defects(new_defect, top_n=5):
    """Rank the existing defects by cosine similarity to a new defect description.

    The new text is cleaned with ``clean_text`` and averaged into a single
    vector with ``avg_feature_vector``; that vector is then compared against
    every entry of ``vectorised_description_list``.

    Parameters
    ----------
    new_defect : str
        Raw text of the new defect.
    top_n : int, optional
        Number of most similar defects to return (default 5).

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows indexed by ``train['id']`` with one column,
        ``'similarity score'``, sorted from most to least similar. The score is
        NaN where the cosine is undefined, i.e. when either vector is all
        zeros because the text has no in-vocabulary tokens; NaN rows sort last.
    """
    #cleaning, tokenising and vectorising the new defect
    vectorised_new = np.asarray(avg_feature_vector(clean_text(new_defect)), dtype='float64')

    #one row per existing defect: shape (number of defects, embedding size)
    existing = np.vstack(list(vectorised_description_list)).astype('float64')

    #cosine similarity = dot(u, v) / (|u| * |v|), computed for all rows at once
    #instead of calling scipy once per existing defect
    dot_products = existing.dot(vectorised_new)
    norm_products = np.linalg.norm(existing, axis=1) * np.linalg.norm(vectorised_new)

    #leave NaN where a norm is zero so no division by zero is attempted
    similarity_score = np.full(existing.shape[0], np.nan)
    np.divide(dot_products, norm_products, out=similarity_score, where=norm_products > 0)

    #Dataframe containing ids of existing defects and similarity score against the new defect
    df_similarity = pd.DataFrame(similarity_score, index=train['id'], columns=['similarity score'])

    #Sorting result as per similarity score and outputing the top_n most similar defect ids
    sorted_df = df_similarity.sort_values(by='similarity score', ascending=False,
                                          na_position='last')

    return sorted_df.head(top_n)

