In [1]:
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import operator
from itertools import islice

In [2]:
dataframe = pd.read_csv('./data/labelled_dataset.csv')
dataframe.head()

Unnamed: 0,Platform,Raw,Text,Topic 1,Topic 2,Topic 3,Best Topic
0,Facebook,Why nowadays every thing seem to be increasin...,nowadays every thing seem increasing governanc...,0.960316,0.019696,0.019989,1
1,Facebook,I will have to disagree.. we’re not that high!!,disagree high,0.772866,0.114139,0.112995,1
2,Facebook,"Wag po tayong mag-alala. Naniniwala po ako, is...",tayong magalala naniniwala isusuprise sir bbm ...,0.021459,0.95755,0.020991,2
3,Facebook,Ok lang yang lahat naman nang bansa ganyan. Sa...,ok yang nang bansa ganyan selfish fanatic blen...,0.037489,0.92543,0.037081,2
4,Facebook,Sama-sama tayong BABAON muli.,samasama tayong babaon,0.083516,0.832983,0.083501,2


In [3]:
corpus = list(dataframe["Text"])
raw_corpus = list(dataframe["Raw"])

In [4]:
# from collections import Counter
# import pprint
# counts = dict(Counter(corpus))
# duplicates = {key:value for key, value in counts.items() if value > 1}
# pprint.pprint(duplicates)

In [5]:
row_col_names = [f"{i}" for i in range(len(corpus))]

In [6]:
def get_pairs_scores(matrix:list, type:str)->dict:
   
   pair_scores = {}
   res = []
   if type=='similarity':
      res = matrix.idxmax(axis='columns')
   else:
      res = matrix.idxmin(axis='columns')
      
   for index, column in enumerate(res):
      
      test = matrix.loc[matrix.index[index], column]
      
      if  test < .5 and test>=1:
         continue
         
      doc1_len = len(corpus[index].split(' ')) 
      doc2_len = len(corpus[int(column)].split(' '))
      # Exclude documents that are less than 4 and more than 30 characters
      if doc1_len<=3 or doc2_len <=3 or doc1_len>=8 or doc2_len >=8:
         continue
      index = matrix.index[index]
      if f"{column}:{index}" not in pair_scores.keys():
         pair_scores[f"{index}:{column}"] = test
   
   return pair_scores


def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))
 
def show_sentence_pairs(pairs:list):
   for pair in pairs:
      doc_pair_index = pair[0].split(':')
      doc1_idx = int(doc_pair_index[0])
      doc2_idx = int(doc_pair_index[1])

      print(f"Sentence 1: {raw_corpus[doc1_idx]} \n Sentence 2: {raw_corpus[doc2_idx]}")
      if type == "similarity":
         print(f"Similarity Score:{pair[1]}")
      else:
         print(f"Dissimilarity Score:{pair[1]}")
      print('-----------------------------------------------------------------')

      
 

In [7]:
n_topics = dataframe['Best Topic'].nunique()
topic_index_dict = {}
def get_indices_of_doc_with_similar_topic():
   dict_var = {}
   for i in range(n_topics):
      dict_var[i]=dataframe.index[dataframe['Best Topic']==i+1].tolist()
   return dict_var
topic_index_dict = get_indices_of_doc_with_similar_topic()


In [8]:
tfidf = TfidfVectorizer()

# Construct the TF-IDF matrix
tf_model = tfidf.fit(corpus)
tfidf_matrix = tf_model.transform(corpus)

# Generate cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [9]:
similarity_matrix = pd.DataFrame(cosine_sim, index=row_col_names, columns=row_col_names )
for i in range(len(similarity_matrix)): 
    similarity_matrix.iat[i, i] = np.nan
similarity_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4686,4687,4688,4689,4690,4691,4692,4693,4694,4695
0,,0.0,0.0,0.127159,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.330497,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,,0.0,0.097896,0.0,0.038161,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.019566,0.0,0.047418,0.0,0.0,0.0
3,0.127159,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150336,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.097896,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:

list_of_similarity_df_by_topics = []
def get_similarity_per_topics():
   out =  []
   for i in range(n_topics):
      df1 = similarity_matrix.iloc[:,list(topic_index_dict[i])]
      topicN_df = df1.iloc[list(topic_index_dict[i])]
      out.append(topicN_df)
   return out

list_of_similarity_df_by_topics = get_similarity_per_topics() 

In [11]:
# for i in range(len(list(topicN_df.iloc[:,0]))):
#    if list(topicN_df.iloc[:,0])[i] != list(topicN_df.iloc[:,0])[i]:
#       print(list(topicN_df.iloc[:,0])[i], list(topicN_df.iloc[:,0])[i])

In [12]:
for i in range(n_topics):
   print(f'Average Similarity Score for Topic {i+1}: {list_of_similarity_df_by_topics[i].iloc[:,0].mean()}')
print('')
for i in range(n_topics):
   print(f'Topic {i+1} | Documents {list_of_similarity_df_by_topics[i].shape[0]}')
   
   pair_scores = get_pairs_scores(list_of_similarity_df_by_topics[i] , "similarity")
   topPairs = dict(sorted(pair_scores.items(), key=operator.itemgetter(1), reverse=True))
   topN_pairs = take(5, topPairs.items())
   show_sentence_pairs(topN_pairs)
   print('\n')

Average Similarity Score for Topic 1: 0.010447497075982528
Average Similarity Score for Topic 2: 0.012150819435352966
Average Similarity Score for Topic 3: 0.011244444354340223

Topic 1 | Documents 1668
Sentence 1: I think that I will have to… I will have to disagree with that number. We are not that high. - 88M 
 Sentence 2: "I think I will have to disagree with that number. We are not that high"
Dissimilarity Score:1.0000000000000002
-----------------------------------------------------------------
Sentence 1: "I disagree with that number I think we're not that high" 
 Sentence 2: I think that I will have to… I will have to disagree with that number. We are not that high. - 88M
Dissimilarity Score:1.0000000000000002
-----------------------------------------------------------------
Sentence 1: I think I will have to disagree with that number 🥴 
 Sentence 2: I think that I will have to… I will have to disagree with that number. We are not that high. - 88M
Dissimilarity Score:0.89657988

In [13]:
# Generate cosine similarity
cosine_distance = cosine_distances(tfidf_matrix, tfidf_matrix)
distance_matrix = pd.DataFrame(cosine_distance, index=row_col_names, columns=row_col_names )
for i in range(len(distance_matrix)): 
    distance_matrix.iat[i, i] = np.nan
distance_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4686,4687,4688,4689,4690,4691,4692,4693,4694,4695
0,,1.0,1.0,0.872841,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,0.669503,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,,1.0,0.902104,1.0,0.961839,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.980434,1.0,0.952582,1.0,1.0,1.0
3,0.872841,1.0,1.0,,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.849664,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,0.902104,1.0,,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [14]:
list_of_dissimilarity_df_by_topics = []
def get_dissimilarity_per_topics():
   out =  []
   for i in range(n_topics):
      df1 = distance_matrix.iloc[:,list(topic_index_dict[i])]
      topicN_df = df1.iloc[list(topic_index_dict[i])]
      out.append(topicN_df)
   return out

list_of_dissimilarity_df_by_topics = get_dissimilarity_per_topics() 

In [15]:
for i in range(n_topics):
   print(f'Average Dissimilarity Score for Topic {i+1}: {list_of_dissimilarity_df_by_topics[i].iloc[:,0].mean()}')
print('')
for i in range(n_topics):
   print(f'Topic {i+1} | Documents {list_of_dissimilarity_df_by_topics[i].shape[0]}')
   
   pair_scores = get_pairs_scores(list_of_dissimilarity_df_by_topics[i] , "dissimilarity")
   topPairs = dict(sorted(pair_scores.items(), key=operator.itemgetter(1), reverse=True))
   topN_pairs = take(5, topPairs.items())
   show_sentence_pairs(topN_pairs)
   print('\n')

Average Dissimilarity Score for Topic 1: 0.9895525029240174
Average Dissimilarity Score for Topic 2: 0.9878491805646471
Average Dissimilarity Score for Topic 3: 0.9887555556456598

Topic 1 | Documents 1668
Sentence 1: Solution: Ipa-recompute ang inflation, maging 3% nalang. Haha.

Naalala ko si Ate Glo before, para hindi magmukhang kulang ng classrooms, pinadivide into 2 ang classroom requirement kasi yun assumption niya na 2 shifts per classroom sa lahat ng schools. Ito yung link

https://www.gmanetwork.com/news/topstories/nation/7405/public-schools-reveal-acute-classroom-shortage/story/ 
 Sentence 2: 6% per month is 70% per annum, dating 100 maging 170 in 12months...
Dissimilarity Score:0.9119746953976552
-----------------------------------------------------------------
Sentence 1: Inflation is growing bad, BBM just made it look like a sexy woman with growing bust and butt size roaming on the street looking for rich people, and he fell in love with that ugly bish. 
 Sentence 2: Too b

In [16]:
# res= topicN_df.idxmax(axis='columns')
# print(res)
# maxes = []
# test_shit = []
# for index, column in enumerate(res):
#    test_shit.append(int(column))
#    test = topicN_df.loc[topicN_df.index[index], column]
#    maxes.append(test)
# print(mean(maxes))
# print(len(maxes))


In [17]:

# pair_scores = get_pairs_scores(distance_matrix, "distance")
# topPairs = dict(sorted(pair_scores.items(), key=operator.itemgetter(1),reverse=True))
# topN_pairs = take(20, topPairs.items())
# show_sentence_pairs(topN_pairs)      