In [612]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import operator
from itertools import islice

In [613]:
dataframe = pd.read_csv('./data/labelled_dataset.csv')
dataframe.head()

Unnamed: 0,Platform,Raw,Text,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Best Topic
0,Facebook,Why nowadays every thing seem to be increasin...,nowadays every thing seem increasing governanc...,0.473528,0.0,0.373507,0.0,0.0,0.124946,1
1,Facebook,I will have to disagree.. we’re not that high!!,disagree high,0.056182,0.055559,0.056438,0.720307,0.055583,0.055932,4
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...",tayong magalala naniniwala isusuprise sir bbm ...,0.01047,0.139007,0.314224,0.247254,0.010458,0.278587,3
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,ok yang nang bansa ganyan selfish fanatic blen...,0.018524,0.018563,0.018567,0.618591,0.018527,0.307228,4
4,Facebook,Sama-sama tayong BABAON muli.,samasama tayong babaon,0.041687,0.041688,0.041687,0.791544,0.041687,0.041706,4


In [614]:
corpus = list(dataframe["Text"])
raw_corpus = list(dataframe["Raw"])

In [615]:
# from collections import Counter
# import pprint
# counts = dict(Counter(corpus))
# duplicates = {key:value for key, value in counts.items() if value > 1}
# pprint.pprint(duplicates)

In [616]:
row_col_names = [f"{i}" for i in range(len(corpus))]

In [617]:
def get_pairs_scores(matrix:list, type:str)->dict:
   
   pair_scores = {}
   res = []
   if type=='similarity':
      res = matrix.idxmax(axis='columns')
   else:
      res = matrix.idxmin(axis='columns')
      
   for index, column in enumerate(res):
      
      test = matrix.loc[matrix.index[index], column]
      
      if  test < .5 and type=="similarity":
         continue
      elif test>=1:
         continue
         
      doc1_len = len(raw_corpus[index].split(' ')) 
      doc2_len = len(raw_corpus[int(column)].split(' '))
      # Exclude documents that are less than 4 and more than 30 characters
      if doc1_len<=6 or doc2_len <=6 or doc1_len>=30 or doc2_len >=30:
         continue
      index = matrix.index[index]
      if f"{column}:{index}" not in pair_scores.keys():
         pair_scores[f"{index}:{column}"] = test
   
   return pair_scores


def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
 
def show_sentence_pairs(pairs:list):
   for pair in pairs:
      doc_pair_index = pair[0].split(':')
      doc1_idx = int(doc_pair_index[0])
      doc2_idx = int(doc_pair_index[1])

      print(f"Sentence 1: {raw_corpus[doc1_idx]} \n Sentence 2: {raw_corpus[doc2_idx]}")
      print(f"Similarity Score:{pair[1]}")
      print('-----------------------------------------------------------------')

      
 

In [618]:
n_topics = dataframe['Best Topic'].nunique()
topic_index_dict = {}
def get_indices_of_doc_with_similar_topic():
   dict_var = {}
   for i in range(n_topics):
      dict_var[i]=dataframe.index[dataframe['Best Topic']==i+1].tolist()
   return dict_var
topic_index_dict = get_indices_of_doc_with_similar_topic()


In [619]:
tfidf = TfidfVectorizer()

# Construct the TF-IDF matrix
tf_model = tfidf.fit(corpus)
tfidf_matrix = tf_model.transform(corpus)

# Generate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [620]:
similarity_matrix = pd.DataFrame(cosine_sim, index=row_col_names, columns=row_col_names )
for i in range(len(similarity_matrix)): 
    similarity_matrix.iat[i, i] = np.nan
similarity_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4686,4687,4688,4689,4690,4691,4692,4693,4694,4695
0,,0.0,0.0,0.127159,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.330497,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.097896,0.0,0.038161,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019566,0.0,0.047418,0.0,0.0,0.0
3,0.127159,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150336,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.097896,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [621]:

list_of_similarity_df_by_topics = []
def get_similarity_per_topics():
   out =  []
   for i in range(n_topics):
      df1 = similarity_matrix.iloc[:,list(topic_index_dict[i])]
      topicN_df = df1.iloc[list(topic_index_dict[i])]
      out.append(topicN_df)
   return out

list_of_similarity_df_by_topics = get_similarity_per_topics() 

In [622]:
# for i in range(len(list(topicN_df.iloc[:,0]))):
#    if list(topicN_df.iloc[:,0])[i] != list(topicN_df.iloc[:,0])[i]:
#       print(list(topicN_df.iloc[:,0])[i], list(topicN_df.iloc[:,0])[i])

In [623]:
for i in range(n_topics):
   print(f'Average Similarity Score for Topic {i+1}: {list_of_similarity_df_by_topics[i].iloc[:,0].mean()}')

Average Similarity Score for Topic 1: 0.011550078742752259
Average Similarity Score for Topic 2: 0.005862678270050148
Average Similarity Score for Topic 3: 0.008277752102565747
Average Similarity Score for Topic 4: 0.014220825364119692
Average Similarity Score for Topic 5: 0.002798765632599675
Average Similarity Score for Topic 6: 0.005319776602263192


In [624]:
for i in range(n_topics):
   print(f'Topic {i+1} | Documents {list_of_similarity_df_by_topics[i].shape[0]}')
   
   pair_scores = get_pairs_scores(list_of_similarity_df_by_topics[i] , "similarity")
   topPairs = dict(sorted(pair_scores.items(), key=operator.itemgetter(1), reverse=True))
   topN_pairs = take(5, topPairs.items())
   show_sentence_pairs(topN_pairs)
   print('\n')

Topic 1 | Documents 881
Sentence 1: Perception is real. Truth is not. 
 Sentence 2: Perception is real, truth is not for 6 yrs ba >_<
Similarity Score:0.8668521944523396
-----------------------------------------------------------------
Sentence 1: Enjoy the Vacation! 
 Sentence 2: Enjoy your vacation, see you September 1
Similarity Score:0.7863419979890762
-----------------------------------------------------------------
Sentence 1: Thank you for educating us po.❤️ 
 Sentence 2: Thank you for educating. Very clear explaination and on point po.
Similarity Score:0.6619169066255127
-----------------------------------------------------------------
Sentence 1: convert your money in dollars 
 Sentence 2: Convert your savings into the hardest money.
Similarity Score:0.5536153603698053
-----------------------------------------------------------------
Sentence 1: More interest rate hikes please 
 Sentence 2: If what you are saying is true that our inflation rate is different from other countrie

In [625]:
# Generate cosine similarity
cosine_distance = cosine_distances(tfidf_matrix, tfidf_matrix)
distance_matrix = pd.DataFrame(cosine_distance, index=row_col_names, columns=row_col_names )
for i in range(len(distance_matrix)): 
    distance_matrix.iat[i, i] = np.nan
distance_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4686,4687,4688,4689,4690,4691,4692,4693,4694,4695
0,,1.0,1.0,0.872841,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.669503,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,,1.0,0.902104,1.0,0.961839,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.980434,1.0,0.952582,1.0,1.0,1.0
3,0.872841,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.849664,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,0.902104,1.0,,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [626]:
# res= topicN_df.idxmax(axis='columns')
# print(res)
# maxes = []
# test_shit = []
# for index, column in enumerate(res):
#    test_shit.append(int(column))
#    test = topicN_df.loc[topicN_df.index[index], column]
#    maxes.append(test)
# print(mean(maxes))
# print(len(maxes))


In [627]:

# pair_scores = get_pairs_scores(distance_matrix, "distance")
# topPairs = dict(sorted(pair_scores.items(), key=operator.itemgetter(1),reverse=True))
# topN_pairs = take(20, topPairs.items())
# show_sentence_pairs(topN_pairs)      