In [3]:
import pandas as pd
import torch
import random
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.preprocessing import MultiLabelBinarizer
import joblib

In [4]:
# Load the raw export; the Publish / Publisher columns are not used below.
df = pd.read_csv("output.csv").drop(columns=["Publish", "Publisher"])

# Count how often each complete Category string occurs, then blank the ones
# that occur exactly once. A blanked row becomes "" and therefore splits into
# the single label "" further down.
category_counts = df["Category"].value_counts()
df["Category_Count"] = df["Category"].map(category_counts)
seen_only_once = df["Category_Count"] == 1
df.loc[seen_only_once, "Category"] = np.nan
df["Category"] = df["Category"].fillna("")

# Category strings are " , "-separated; strip stray whitespace from each label.
df["Category_list"] = (
    df["Category"]
    .str.split(" , ")
    .map(lambda parts: [part.strip() for part in parts])
)

# Fit the label binarizer on every label list and persist it so that
# recSysModel can reload the same label vocabulary from "mlb.pkl".
mlb = MultiLabelBinarizer().fit(df["Category_list"])
joblib.dump(mlb, "mlb.pkl")

['mlb.pkl']

In [None]:

class recSysModel:
    """Turn a CSV of titles into one feature vector per title.

    Each feature vector is the mean BERT embedding of the title's descriptions
    followed by the per-title maximum of every numeric column (which includes
    the one-hot category columns produced by the fitted MultiLabelBinarizer).

    Attributes:
        dataset_df: The loaded CSV plus one-hot category columns, minus the
            'Authors', 'Category', 'Category_list' and 'Category_Count' columns.
        mlb: The fitted MultiLabelBinarizer loaded from ``mlb_path``.
        device: torch device used for the BERT forward pass.
        tokenizer / model: The pretrained BERT tokenizer and encoder.
        titles: List of the values in the 'Title' column.
        embeddings_df: Set by ``train``; despite the name it is a NumPy object
            array, not a DataFrame.
    """

    def __init__(self, path, mlb_path="mlb.pkl", model_name="bert-base-uncased", random_seed=42):
        """Load the dataset, one-hot encode its categories and load BERT.

        Args:
            path: CSV file with at least 'Title', 'Description' and 'Category'
                columns.
            mlb_path: joblib file holding the fitted MultiLabelBinarizer.
            model_name: Name or path passed to ``from_pretrained`` for both the
                tokenizer and the model.
            random_seed: Seed applied to ``random`` and ``torch`` (and to all
                CUDA devices when available).
        """
        self.dataset_df = pd.read_csv(path)
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only point mlb_path at artifacts you created yourself.
        self.mlb = joblib.load(mlb_path)

        # Blank out complete Category strings that occur exactly once in this
        # dataset, then split the remaining " , "-separated strings into labels.
        category_counts = self.dataset_df["Category"].value_counts()
        self.dataset_df["Category_Count"] = self.dataset_df["Category"].map(category_counts)
        self.dataset_df.loc[self.dataset_df["Category_Count"] == 1, "Category"] = np.nan
        self.dataset_df["Category"] = self.dataset_df["Category"].fillna("")
        self.dataset_df["Category_list"] = (
            self.dataset_df["Category"]
            .str.split(" , ")
            .apply(lambda labels: [label.strip() for label in labels])
        )

        # One-hot encode the label lists and append one column per known class.
        # The explicit index keeps the axis=1 concat row-aligned.
        encoded_categories = self.mlb.transform(self.dataset_df["Category_list"])
        encoded_df = pd.DataFrame(
            encoded_categories,
            columns=self.mlb.classes_,
            index=self.dataset_df.index,
        )
        self.dataset_df = pd.concat([self.dataset_df, encoded_df], axis=1)

        random.seed(random_seed)
        torch.manual_seed(random_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(random_seed)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertModel.from_pretrained(model_name).to(self.device)

        # Drop the text/helper columns so that only 'Title', 'Description', the
        # one-hot columns and whatever else the CSV contained remain.
        self.dataset_df = self.dataset_df.drop(
            columns=['Authors', 'Category', 'Category_list', 'Category_Count'],
            errors='ignore',
        )
        self.titles = self.dataset_df["Title"].to_list()

    def __create_embedding(self, text):
        """Embed one string as the mean of BERT's last-layer token vectors.

        Args:
            text: The string to embed.

        Returns:
            A CPU tensor holding the token-averaged last hidden state for the
            single-item batch, i.e. shape (1, hidden_size).
        """
        encoding = self.tokenizer.batch_encode_plus(
            [text],
            padding=True,
            truncation=True,
            return_tensors='pt',
            add_special_tokens=True
        )
        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(input_ids, attention_mask=attention_mask)
            word_embeddings = outputs.last_hidden_state

        # Average over the token axis (dim=1) to get one vector per input.
        sentence_embedding = word_embeddings.mean(dim=1)
        return sentence_embedding.cpu()

    def train(self):
        """Build one feature row per distinct title.

        Despite the name, no weights are updated here; the method only runs the
        pretrained model to extract features. For every title group it averages
        the embeddings of the group's descriptions and appends the per-group
        maximum of every numeric column in ``dataset_df``.

        Missing descriptions (NaN from empty CSV fields) are skipped; a title
        with no usable description is embedded from the empty string so it
        still receives a row.

        Returns:
            A NumPy array with dtype=object and one row per title, laid out as
            [title, embedding values..., numeric column maxima...]. The same
            array is stored on ``self.embeddings_df``.
        """
        numerical_cols = self.dataset_df.select_dtypes(include=np.number).columns
        processed_data = []
        grouped = self.dataset_df.groupby("Title")
        total_groups = len(grouped)
        for i, (title, group) in enumerate(grouped, 1):
            # The tokenizer only accepts strings, so drop NaN descriptions
            # instead of letting them reach it.
            descriptions = group['Description'].dropna().astype(str).tolist()
            if not descriptions:
                descriptions = [""]
            # reshape(-1) flattens the (1, hidden_size) embedding without
            # hard-coding the hidden size of one particular checkpoint.
            embeddings = torch.stack(
                [self.__create_embedding(desc).reshape(-1) for desc in descriptions]
            )
            mean_embedding = embeddings.mean(dim=0).numpy()
            # max (not sum): a flag is set if any row of the group has it set.
            group_maxima = group[numerical_cols].max().values
            combined_vector = np.concatenate((mean_embedding, group_maxima))
            # NOTE: prepending the string title makes NumPy promote the numeric
            # values to strings; consumers can recover them with
            # arr[:, 1:].astype(float).
            processed_data.append(np.append(title, combined_vector))
            print(f'Processing {i}/{total_groups} ({(i/total_groups)*100:.2f}%)', end='\r')
        self.embeddings_df = np.array(processed_data, dtype=object)
        return self.embeddings_df

In [6]:
# Keep every row whose Title appears more than once, ordered by title.
has_repeated_title = df["Title"].duplicated(keep=False)
df_duplicates = df.loc[has_repeated_title]
df_duplicates_sorted = df_duplicates.sort_values(by="Title")

# NOTE(review): df_duplicates_sorted is not used after this point, and no cell
# visible here writes "output_duplicates.csv" -- presumably it was exported
# from that frame in a cell that is no longer in the notebook; confirm and
# restore the export so Restart & Run All works on a fresh checkout.
model = recSysModel("output_duplicates.csv")
embeddings = model.train()

# train() returns a dtype=object array, so this file is pickled by np.save and
# has to be read back with np.load(..., allow_pickle=True).
np.save("duplicate_embeddings_encoded.npy", embeddings)


Processing 9338/9338 (100.00%)