In [71]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import operator
from itertools import islice

In [72]:
# Load the labelled dataset from disk and preview its first rows.
dataframe = pd.read_csv(filepath_or_buffer='./data/labelled_dataset.csv')
dataframe.head(n=5)

Unnamed: 0,Platform,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,nowadays every thing seem increasing governanc...,0.872072,0.0,0.0,0.090702,0.0,0.0,1
1,Facebook,disagree high,0.056015,0.056168,0.055559,0.720877,0.055559,0.055822,4
2,Facebook,tayong magalala naniniwala isusuprise sir bbm ...,0.010428,0.010424,0.010568,0.418144,0.010423,0.540014,6
3,Facebook,ok yang nang bansa ganyan selfish fanatic blen...,0.13199,0.018559,0.018721,0.573137,0.018559,0.239034,4
4,Facebook,samasama tayong babaon,0.041692,0.041692,0.041819,0.542127,0.290956,0.041715,4


In [73]:
# Plain Python list of the "Text" column, in row order.
corpus = dataframe["Text"].tolist()

In [74]:
# Optional diagnostic (disabled): count how often each text occurs in `corpus`
# and pretty-print the texts that appear more than once.
# from collections import Counter
# import pprint
# counts = dict(Counter(corpus))
# duplicates = {key:value for key, value in counts.items() if value > 1}
# pprint.pprint(duplicates)

In [75]:
# String labels "0", "1", ... (one per document) used for both axes of the pairwise matrices.
row_col_names = list(map(str, range(len(corpus))))

In [76]:
def get_pairs_scores(matrix: pd.DataFrame, type: str, docs=None,
                     min_words: int = 6, max_words: int = 30,
                     duplicate_threshold: float = 0.9) -> dict:
    """Pair every document with its best-scoring partner in ``matrix``.

    For each row the partner is the column holding the highest value when
    ``type == 'similarity'``, and the column holding the lowest value for any
    other ``type`` (the matrix is then treated as a distance matrix).

    Parameters
    ----------
    matrix : pd.DataFrame
        Square pairwise score matrix. Row ``i`` must describe ``docs[i]`` and
        the column labels must be the document positions as strings
        ("0", "1", ...), because a label is converted with ``int()`` to look
        up the partner text. Self-scores on the diagonal should be NaN so a
        document is not paired with itself.
    type : str
        ``'similarity'`` to pick the maximum per row; anything else picks the
        minimum per row.
    docs : sequence of str, optional
        Texts the matrix was computed from. Defaults to the notebook-level
        ``corpus`` so existing calls keep working.
    min_words, max_words : int
        A pair is dropped when either text has ``<= min_words`` or
        ``>= max_words`` tokens. Tokens are counted by splitting on single
        spaces.
    duplicate_threshold : float
        In similarity mode, pairs scoring at or above this value are treated
        as near-duplicates and dropped.

    Returns
    -------
    dict
        Maps ``"i:j"`` (row position, partner label) to the pair's score.
        A pair is stored once: ``"j:i"`` is not added if ``"i:j"`` is present.
    """
    if docs is None:
        # Fall back to the notebook-level list of texts.
        docs = corpus

    if matrix.empty:
        return {}

    if type == 'similarity':
        best_partner = matrix.idxmax(axis='columns')
    else:
        best_partner = matrix.idxmin(axis='columns')

    pair_scores = {}
    for index, column in enumerate(best_partner):
        # Some pandas versions report NaN for a row without any valid score.
        if pd.isna(column):
            continue

        score = matrix.loc[matrix.index[index], column]

        # Similarity mode: drop near-duplicates.
        # Any mode: drop scores >= 1 (for distances: the best partner is still
        # maximally far away, i.e. there is no real match).
        if type == "similarity" and score >= duplicate_threshold:
            continue
        elif score >= 1:
            continue

        doc1_len = len(docs[index].split(' '))
        doc2_len = len(docs[int(column)].split(' '))
        # Keep only pairs where both texts have more than `min_words` and
        # fewer than `max_words` words.
        if (doc1_len <= min_words or doc2_len <= min_words
                or doc1_len >= max_words or doc2_len >= max_words):
            continue

        # Skip the mirrored pair if it was already recorded.
        if f"{column}:{index}" not in pair_scores:
            pair_scores[f"{index}:{column}"] = score
    return pair_scores


def take(n, iterable):
    """Collect at most the first ``n`` items of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [item for item in head]
 
def show_sentence_pairs(pairs: list, docs=None):
    """Print the two texts behind each scored document pair.

    Parameters
    ----------
    pairs : iterable of (key, score)
        Each ``key`` is a string ``"i:j"`` holding the positions of the two
        documents; only the key is used, the score is ignored.
    docs : sequence of str, optional
        Texts to look the positions up in. Defaults to the notebook-level
        ``corpus`` so existing calls keep working.
    """
    if docs is None:
        # Fall back to the notebook-level list of texts.
        docs = corpus

    for pair in pairs:
        doc1_idx, doc2_idx = map(int, pair[0].split(':')[:2])
        print(f"Sentence 1: {docs[doc1_idx]} \n Sentence 2: {docs[doc2_idx]}")
 

In [77]:
# Fit a TF-IDF vocabulary on the corpus and vectorise every document with it.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(raw_documents=corpus)

# Cosine similarity of every document against every other document.
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

In [78]:
# Blank out self-similarity so a document is never picked as its own best match.
# Work on a copy: `cosine_sim` comes from the previous cell and should keep its
# original diagonal, otherwise re-running cells out of order sees mutated data.
masked_sim = cosine_sim.copy()
np.fill_diagonal(masked_sim, np.nan)
similarity_matrix = pd.DataFrame(masked_sim, index=row_col_names, columns=row_col_names)
similarity_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4686,4687,4688,4689,4690,4691,4692,4693,4694,4695
0,,0.0,0.0,0.127159,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.330497,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.097896,0.0,0.038161,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019566,0.0,0.047418,0.0,0.0,0.0
3,0.127159,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150336,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.097896,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# Best similarity partner per document, ranked from highest to lowest score;
# print the texts of the first 20 pairs.
pair_scores = get_pairs_scores(similarity_matrix, "similarity")
topPairs = dict(
    sorted(pair_scores.items(), key=lambda item: item[1], reverse=True)
)
topN_pairs = take(20, topPairs.items())
show_sentence_pairs(topN_pairs)

Sentence 1: additional note tabloid start pilipino star nothing yet bulgar hataw title count broadsheet tabloid looking previous edition post check herehttpswwwredditcomuseryohannesburpcommentstqpcdfrontpageforbroadsheetsandtabloidthread 
 Sentence 2: additional note tabloid start bulgar yesterday people newspaper included edition yet hataw title count broadsheet tabloid looking previous edition post check herehttpswwwredditcomuseryohannesburpcommentstqpcdfrontpageforbroadsheetsandtabloidthread
Sentence 1: usually see number would say nice time nice 
 Sentence 2: usually would reply nice see isnt nice
Sentence 1: happy kaming minion importante nakabalik junior malacaang tahanan 
 Sentence 2: importante happy majority filipino people nakabalik marcos tahanan
Sentence 1: hahahaha sure kang government edi wala silang kurakot 
 Sentence 2: ayaw kurakot sinasabi edi wala silang kurakot gustong pang mangyari
Sentence 1: mahal mg kuryente niyo ate baka nakikikabit dyan mahal ganun kamahal kun

In [80]:
# Generate cosine distance (1 - cosine similarity) between every pair of documents
cosine_distance = cosine_distances(tfidf_matrix, tfidf_matrix)
# Blank out self-distance so a document is never picked as its own nearest match.
# The array was created just above in this cell, so filling it in place keeps
# the cell idempotent when re-run.
np.fill_diagonal(cosine_distance, np.nan)
distance_matrix = pd.DataFrame(cosine_distance, index=row_col_names, columns=row_col_names)
distance_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4686,4687,4688,4689,4690,4691,4692,4693,4694,4695
0,,1.0,1.0,0.872841,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.669503,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,,1.0,0.902104,1.0,0.961839,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.980434,1.0,0.952582,1.0,1.0,1.0
3,0.872841,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.849664,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,0.902104,1.0,,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [87]:

# Nearest partner per document by cosine distance; print the texts of the
# first 20 pairs after sorting by distance in descending order.
# NOTE(review): descending order lists the *largest* nearest-neighbour
# distances first — confirm this is the intended ranking for distances.
pair_scores = get_pairs_scores(distance_matrix, "distance")
topPairs = dict(
    sorted(pair_scores.items(), key=lambda item: item[1], reverse=True)
)
topN_pairs = take(20, topPairs.items())
show_sentence_pairs(topN_pairs)

Sentence 1: fish free tuna sound friendlier tuna business opposed unfish could construed attack core business 
 Sentence 2: government business enterprise food rice part national security
Sentence 1: haha true kc tatalino ngaw ngaw ngaw ngaw hahah nmn magets year date basic jusmi 
 Sentence 2: research nyang oras alam month date year date
Sentence 1: right confront dictator junior daughtertes supporter shenanigan pulled last election call hypocrisy doubt theyll get fired political belief dont interfere work ethic performance 
 Sentence 2: marcoss supporter dont even mean let alone economics work
Sentence 1: alam gaano karaming effort mabibigay pagtitipid pagtitiis nagsisipag parang nararating bwisit 
 Sentence 2: grabe dami anak tuwing nakikita naiirita bwisit
Sentence 1: discriminatory vaccine passport mandate utter disgrace comply 
 Sentence 2: tell bf eat dick lmao umno najib fucking disgrace
Sentence 1: check fmetf first metro etf work cheaper alternative term fee also go sun life 