# Text similarity
## Generating a matrix S(i, j) that gives the degree of similarity between the i-th and j-th comments.

### Importing libraries

In [11]:
import pandas as pd
import numpy as np

### Reading dataset

In [4]:
# Load train.csv (path is relative to the notebook's working directory) into a DataFrame.
data_raw1 = pd.read_csv(filepath_or_buffer='train.csv')

In [6]:
# Preview the first five rows to check which columns were loaded.
data_raw1.head(n=5)

Unnamed: 0.1,Unnamed: 0,description_x,description_y,ticker_x,ticker_y,same_security
0,0,first trust dow jones internet,first trust dj internet idx,FDN,FDN,True
1,1,schwab intl large company index etf,schwab strategic tr fundamental intl large co ...,FNDF,FNDF,True
2,2,vanguard small cap index adm,vanguard small-cap index fund inst,VSMAX,VSCIX,False
3,3,duke energy corp new com new isin #us4 sedol #...,duke energy corp new com new isin #us26441c204...,DUK,DUK,True
4,4,visa inc class a,visa inc.,V,V,True


### We use the TfidfVectorizer class from scikit-learn, which transforms text into feature vectors that can be used as input to an estimator.

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configuration: drop English stop words and keep every term that
# appears in at least one document (min_df=1).
tf = TfidfVectorizer(
    stop_words='english',
    min_df=1,
)

# Learn the vocabulary and IDF weights from `description_y` and encode that
# same column as a document-term matrix in a single pass.
tfidf_matrix = tf.fit_transform(data_raw1['description_y'])

### We will use cosine similarity to compute a numeric score for how similar two comments are. Because TfidfVectorizer L2-normalizes each row by default, the dot product of two rows directly gives their cosine similarity score.

In [17]:
# Number of documents, read from the TF-IDF matrix (one row per description).
# Reading it from `pairwise_similarity` here fails on a fresh kernel with a
# NameError, because that matrix is only created in the following cell.
n = tfidf_matrix.shape[0]

In [18]:
# Pairwise similarity as a sparse matrix product of the TF-IDF rows.
# `@` is always matrix multiplication, whereas `*` is element-wise for
# SciPy sparse *arrays* (it is only a matrix product for `spmatrix`).
pairwise_similarity = tfidf_matrix @ tfidf_matrix.T

# Pin the diagonal (each description compared with itself) to exactly 1.0.
# setdiag works from the matrix's own shape, so this cell does not depend on
# an `n` computed in another cell.
pairwise_similarity.setdiag(1.0)

In [19]:
# Convert the sparse result to a dense NumPy array so the notebook can display it.
pairwise_similarity.toarray() 

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.07744336, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.07744336, 1.        , ..., 0.08567773, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.08567773, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

### We can also use sklearn’s linear_kernel instead of cosine_similarity, since it is faster: the TF-IDF vectors are already normalized, so a plain dot product is enough.

In [20]:
from sklearn.metrics.pairwise import linear_kernel

# Dot product of every TF-IDF row with every other row, returned as a dense
# array. With the second argument omitted, linear_kernel compares the matrix
# against itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [22]:
# Side length of the square (documents x documents) similarity matrix.
# Index the shape rather than unpacking into `_`: assigning to `_` shadows
# IPython's last-output variable for the rest of the session.
n = cosine_sim.shape[0]

In [23]:
# Overwrite the main diagonal in place so every description has a
# self-similarity of exactly 1.0.
np.fill_diagonal(cosine_sim, 1.0)

In [24]:
# Display the dense similarity matrix (NumPy abbreviates large arrays with "...").
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.07744336, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.07744336, 1.        , ..., 0.08567773, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.08567773, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])