In [None]:
# !pip install sentence_transformers

In [None]:
import pandas as pd
import ast
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Topics to rank posts against; get_most_related() selects one by its position (topic_id).
TOPIC = [
    "Decriminalization of drugs",
    "Housing affordability",
]

# Candidate dataset files, selected by position.
# NOTE(review): "CanadianPolitics.csv" is listed twice -- confirm whether the
# last entry was meant to be a different file.
DATA_PATH = [
    "BCpolitics.csv",
    "CanadianPolitics.csv",
    "CanadianPolitics.csv",
]

# Paraphrases for each topic, keyed by the topic string itself.
# prepare_topics() joins the topic and its paraphrases into one query text.
alternative_phrases = {
    TOPIC[0]: [
        "Elimination of drug criminalization",
        "Drug decriminalization",
        "Removing criminal penalties for drugs",
        "Abolishing drug criminalization",
        "Drug policy reform",
        "Ending drug prohibition",
        "Legal relaxation on drug offenses",
        "Softening drug laws",
        "Drug offense decriminalization",
        "Reducing legal sanctions on drugs",
    ],
    TOPIC[1]: [
        "cost-effective housing",
        "affordable living spaces",
        "economical residential options",
        "budget-friendly homes",
        "reasonably priced housing",
        "accessible housing costs",
        "inexpensive housing",
        "low-cost accommodations",
        "attainable home prices",
        "financially viable residences",
    ],
}

# Model identifier handed to SentenceTransformer(...) when embedding text.
MODEL_NAME = "bert-base-nli-mean-tokens"


In [None]:
def _parse_token_list(cell):
  """Decode one stringified-list cell; a blank cell yields an empty list."""
  # ast.literal_eval('') raises SyntaxError, so blank cells must be skipped.
  if not cell.strip():
    return []
  return list(ast.literal_eval(cell))


def append_two_columns(row):
  """Join the tokens of ``clear_title`` and ``clear_text`` into one string.

  Both fields are expected to hold the text form of a Python list of words
  (e.g. ``"['rent', 'prices']"``) or an empty string when there is nothing
  to parse.

  Args:
    row: Mapping-like object (dict or DataFrame row) with string values
      under the keys ``'clear_title'`` and ``'clear_text'``.

  Returns:
    The title tokens followed by the text tokens, separated by single
    spaces. Returns ``''`` when both fields are blank.

  Raises:
    ValueError, SyntaxError: if a non-blank field is not a valid Python
      literal.
  """
  # Each column is parsed on its own so that a blank title combined with a
  # non-blank text no longer reaches ast.literal_eval('').
  all_words = _parse_token_list(row['clear_title'])
  all_words.extend(_parse_token_list(row['clear_text']))
  return ' '.join(all_words)
    


In [None]:
def prepare_topics(topic_key, phrases=None):
  """Build the query text for a topic: the topic plus its paraphrases.

  Args:
    topic_key: Topic string; also the key used to look up paraphrases.
    phrases: Optional mapping from topic string to a list of paraphrases.
      When omitted, the module-level ``alternative_phrases`` is used.

  Returns:
    ``topic_key`` followed by every paraphrase registered for it, joined
    with single spaces. A topic with no registered paraphrases yields just
    ``topic_key``.
  """
  if phrases is None:
    phrases = alternative_phrases
  topics_as_text = [topic_key]
  # dict.get returns None for an unknown key; fall back to an empty list so
  # extend() never receives None (which would raise TypeError).
  topics_as_text.extend(phrases.get(topic_key) or [])
  return ' '.join(topics_as_text)

In [None]:
def get_top_similars(sentences_, sim_result_):
  """Rank sentences by similarity score, highest first.

  Args:
    sentences_: Indexable collection of texts; ``sentences_[i]`` is the text
      that belongs to the i-th score.
    sim_result_: Object with a ``tolist()`` method whose first row holds one
      score per sentence (e.g. a 1 x N similarity matrix).

  Returns:
    A DataFrame with columns ``index`` (position of the sentence in the
    input), ``score`` and ``text``, ordered by descending score. Equal
    scores keep their input order.
  """
  first_row_scores = sim_result_.tolist()[0]
  ranked = sorted(
    ((pos, score, sentences_[pos]) for pos, score in enumerate(first_row_scores)),
    key=lambda entry: entry[1],
    reverse=True,
  )
  return pd.DataFrame(ranked, columns=['index', 'score', 'text'])

In [None]:
def get_most_related(data_path: str, topic_id: int, model=None):
  """Rank every post in a dataset by similarity to one of the configured topics.

  The CSV must contain the columns ``title``, ``selftext``, ``clear_title``
  and ``clear_text``. The ``clear_*`` columns are combined per row with
  ``append_two_columns`` and embedded; the topic query built by
  ``prepare_topics`` is embedded with the same model, and rows are ranked by
  cosine similarity to that query.

  Args:
    data_path: Path of the CSV file to read.
    topic_id: Position of the topic in ``TOPIC``.
    model: Optional, already-loaded encoder exposing ``encode``. When
      omitted, ``SentenceTransformer(MODEL_NAME)`` is loaded for this call;
      pass one in to avoid reloading it on every call.

  Returns:
    A DataFrame with columns ``index``, ``score`` and ``text`` sorted by
    descending score, where ``text`` is ``title + '--->' + selftext`` of the
    row. An empty dataset yields an empty DataFrame with those columns.
  """
  print(f"TOPIC: {TOPIC[topic_id]}, DATASET: {data_path}")
  data = pd.read_csv(data_path)
  text_columns = ['clear_title', 'clear_text', 'title', 'selftext']
  data[text_columns] = data[text_columns].fillna('')

  # With zero rows, apply(axis=1) returns a DataFrame rather than a Series,
  # so there is nothing meaningful to embed; return an empty ranking instead.
  if data.empty:
    return pd.DataFrame(columns=['index', 'score', 'text'])

  # Human-readable text shown in the result. Kept as a plain list so the
  # lookup in get_top_similars is positional.
  original_text_data = (data['title'] + '--->' + data['selftext']).tolist()

  # Only the two cleaned columns are needed to build the embedded sentences.
  # No columns are dropped: the frame is not reused afterwards, and dropping
  # the 'Unnamed:' columns raised KeyError on files that lack them.
  sentences = data[['clear_title', 'clear_text']].apply(append_two_columns, axis=1).tolist()
  topic_query = prepare_topics(TOPIC[topic_id])

  if model is None:
    model = SentenceTransformer(MODEL_NAME)
  sentence_embeddings = model.encode(sentences)
  # A single string is encoded to a single vector; it is wrapped in a list
  # below because cosine_similarity expects 2-D inputs.
  topic_embedding = model.encode(topic_query)
  sim_result = cosine_similarity([topic_embedding], sentence_embeddings)
  return get_top_similars(sentences_=original_text_data, sim_result_=sim_result)

In [None]:
# Rank the posts of DATA_PATH[0] against TOPIC[1] and save the full ranking.
path_index = 0
topic_index = 1 
result = get_most_related(data_path = DATA_PATH[path_index], topic_id=topic_index)
# NOTE(review): DATA_PATH entries already end in ".csv", so the file written
# here is named "<topic>_<dataset>.csv.csv" -- confirm that is intended.
result.to_csv(f'{TOPIC[topic_index]}_{DATA_PATH[path_index]}.csv', index=False)
# The ranking is sorted by descending score, so these are the ten best matches.
result.head(10)

TOPIC: Housing affordability, DATASET: BCpolitics.csv


Unnamed: 0,index,score,text
0,750,0.669707,Cheapest place to have replacement lenses made...
1,144,0.612148,Good driving school in Surrey?--->I really wan...
2,202,0.591586,Bill 22 proposes big changes to strata act to ...
3,978,0.58042,FiberStream vs Novus home internet--->I'm curr...
4,408,0.570763,Our new steam clock looks pretty cool 😁--->
5,671,0.567823,Honest experience being a VPD officer?--->I wo...
6,298,0.566582,So I went to book the ferry. You have 2 paymen...
7,77,0.566337,Vancouver isn’t cheap. Which general employers...
8,523,0.565599,Are there any dispensaries cheaper than Canna ...
9,323,0.562051,Love this very kind note attached to my take o...


In [None]:
# Rank the posts of DATA_PATH[0] against TOPIC[0] and save the full ranking.
path_index = 0
topic_index = 0
result = get_most_related(data_path = DATA_PATH[path_index], topic_id=topic_index)
# NOTE(review): DATA_PATH entries already end in ".csv", so the file written
# here is named "<topic>_<dataset>.csv.csv" -- confirm that is intended.
result.to_csv(f'{TOPIC[topic_index]}_{DATA_PATH[path_index]}.csv', index=False)
# The ranking is sorted by descending score, so these are the ten best matches.
result.head(10)

TOPIC: Decriminalization of drugs, DATASET: BCpolitics.csv


Unnamed: 0,index,score,text
0,755,0.621606,BC to restrict sales of diabetes drug Ozempic ...
1,122,0.618532,A new sub for discussing BC Liquor Stores prod...
2,808,0.572872,Are there any active operating Medical Walk-In...
3,49,0.568906,Where did you have a good experience buying a ...
4,124,0.568865,Seeking Cost-Efficient PRP Treatments in Great...
5,299,0.560084,English beers and ciders--->The other thread o...
6,372,0.558445,Rhinoplasty--->I’ve done consultations with Dr...
7,520,0.55631,Coasters in Vancouver bars?--->This might seem...
8,633,0.553291,Vaughn Palmer: B.C. NDP squirms as Liberals pu...
9,268,0.552604,COVID Vaccine appointment--->My parents showed...
