# Getting Relevant Posts and Topic

## Importing Libraries

In [1]:
# Importing libraries for topic
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import numpy

In [2]:
# Importing Libraries for posts
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

## Getting all the posts

In [3]:
post_list = []

In [4]:
# Getting pinterest posts 
with open("pinterestData.txt","r") as file:
    dataList = file.readlines()
dataList = [item.strip() for item in dataList]
post_list.extend(dataList)

In [5]:
# Getting facebook posts
with open("facebookPost.txt","r", encoding="utf-8") as file:
    dataList = file.readlines()
dataList = [item.strip() for item in dataList]
post_list.extend(dataList)

In [6]:
# Getting reddit posts
with open("redditPost.txt","r", encoding="utf-8") as file:
    dataList = file.readlines()
dataList = [item.strip() for item in dataList]
post_list.extend(dataList)

In [7]:
# getting the keyword
with open("keyword.txt","r") as file:
    hashtag_to_search = file.read().strip()

## Getting the relevant Posts and Topic

In [10]:
# Getting the topic
list_keyword = []
def extract_topics(posts, num_topics = 5):
    vectorizer = TfidfVectorizer(max_df=0.85, max_features=1000, stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(posts)
    
    nmf = NMF(n_components=num_topics, random_state=42)
    nmf.fit(tfidf_matrix)
    
    feature_names = vectorizer.get_feature_names_out()
    for i,topic in enumerate(nmf.components_):
        top_words_idx = topic.argsort()[:-10-1:-1]
        top_words = [feature_names[idx] for idx in top_words_idx]
        list_keyword.append(" ".join(top_words[0:2]))
extract_topics(post_list)
list_keyword

['wallpaper kitty',
 'ideas tree',
 'merry decor',
 'paintings canvas',
 'gifts diy']

In [11]:
# Getting the most relevant posts
def get_bert_embeddings(posts):
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    if isinstance(posts, str):
        posts = [posts]
    
    encoded_posts = tokenizer(posts, return_tensors='pt', padding = True, truncation=True)

    with torch.no_grad():
        outputs = model(**encoded_posts)
        embeddings = outputs['last_hidden_state'][:,0, :].numpy()
    return embeddings

def compute_cosine_similarity(post_embeddings, topic_embedding):
    post_embeddings = post_embeddings.reshape(post_embeddings.shape[0],-1)
    topic_embedding = topic_embedding.reshape(1, -1)
    similarities = cosine_similarity(post_embeddings, topic_embedding)
    return similarities.flatten()

def find_relevance_to_topic(posts, topic, threshold = 0.5):
    post_embeddings = get_bert_embeddings(posts)
    topic_embedding = get_bert_embeddings(topic)
    similarities = compute_cosine_similarity(post_embeddings,topic_embedding)
    relevant_posts = [post for post, sim in zip(posts, similarities) if sim > threshold]
    return relevant_posts

relevant_posts = find_relevance_to_topic(post_list,hashtag_to_search)
relevant_posts

['christmas nails',
 'christmas wallpaper',
 'christmas tree ideas',
 'christmas cookies',
 'christmas tree',
 'christmas crafts',
 'christmas wallpaper aesthetic',
 'christmas gift ideas',
 'diy christmas ornaments',
 'christmas nails 2023',
 'christmas nail designs',
 'simple christmas nails',
 'christmas gifts',
 'coloring pages christmas',
 'christmas wallpaper iphone',
 'pink christmas wallpaper',
 'christmas paintings',
 'christmas house exterior',
 'christmas pfp',
 'diy christmas gifts',
 'christmas crafts for kids',
 'holiday decor christmas',
 'christmas hello kitty wallpaper',
 'christmas wallpaper laptop',
 'christmas wallpaper ipad',
 'hallway christmas decor',
 'christmas gifts for boyfriend',
 'christmas painting ideas',
 'christmas basket for boyfriend',
 'gingerbread themed christmas decor',
 'christmas computer wallpaper',
 'little women christmas',
 'christmas canvas paintings',
 'funny christmas wallpaper',
 'christmas basket gift ideas',
 'christmas rock painting i

In [15]:
# Taking Topmost 10 posts
if(len(relevant_posts) > 10):
    relevant_posts = relevant_posts[:10]

In [16]:
with open("post.txt","w", encoding="utf-8") as file:
    for post in relevant_posts:
        file.write(str(post) + "\n")

In [17]:
with open("postKeyword.txt","w", encoding="utf-8") as file:
    for post in list_keyword:
        file.write(str(post) + "\n")