# Packages

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Content = Business's Description

In [2]:
df = pd.read_csv("data/sponsors.csv")
df

Unnamed: 0,name,category,sub-category,about,metadata
0,Asus Indonesia,Sponsor,Technology Hardware,ASUS is passionate about technology and driven...,Sponsor Technology Hardware ASUS is passionate...
1,Hydro Coco,Sponsor,Food & Beverage,Tentang Hydro Coco\r\nHydro Coco terbuat dari ...,Sponsor Food & Beverage Tentang Hydro Coco\r\n...
2,Nutrijell,Sponsor,Food & Beverage,Nutrijell is a leading agar-agar brand in Indo...,Sponsor Food & Beverage Nutrijell is a leading...
3,Zalora Indonesia,Sponsor,E-Commerce,"Zalora Indonesia, the largest online fashion r...","Sponsor E-Commerce Zalora Indonesia, the large..."
4,Marina,Sponsor,Personal & Beauty,Marina Natural is a leading Indonesian cosmeti...,Sponsor Personal & Beauty Marina Natural is a ...
...,...,...,...,...,...
857,photobooth wedding murah jabodetabek,Equipment Rental,Photobooth,photobooth wedding murah jabodetabek,Equipment Rental Photobooth photobooth wedding...
858,photo booth ulang tahun murah,Equipment Rental,Photobooth,photo booth ulang tahun murah,Equipment Rental Photobooth photo booth ulang ...
859,MobileTrans - Wa Transfer dan Line Transfer Or...,Equipment Rental,Photobooth,MobileTrans - Wa Transfer dan Line Transfer Or...,Equipment Rental Photobooth MobileTrans - Wa T...
860,SEWA WIFI JEPANG | JAPAN WIFI | SEWA JAPAN WIFI,Equipment Rental,Photobooth,SEWA WIFI JEPANG | JAPAN WIFI | SEWA JAPAN WIFI,Equipment Rental Photobooth SEWA WIFI JEPANG |...


# Encode All About to a Bank

In [None]:
nltk.download('punkt')

In [None]:
bow = CountVectorizer(stop_words="english", min_df=1, tokenizer=word_tokenize)
bank = bow.fit_transform(df.metadata)

## Step 1: Encode What User Click

In [None]:
idx = 52

In [None]:
content = df.loc[idx, "metadata"]
content

In [None]:
code = bow.transform([content])
code.toarray()

## Step 2: Document Search

In [None]:
dist = cosine_distances(code, bank)
dist

In [None]:
rec_idx = dist.argsort()[0, 1:11]
rec_idx

## Step 3: Recommend

In [None]:
df.loc[rec_idx]

# ML Engineering: Sum Them All Up

In [37]:
class RecommenderSystem:
    def __init__(self, data, content_col):
        self.df = pd.read_csv(data)
        self.content_col = content_col
        self.encoder = None
        self.bank = None
        self.test_set = None

    def split_data(self, test_size=0.2, random_state=42):
        train_data, test_data = train_test_split(self.df, test_size=test_size, random_state=random_state)
        self.train_data = train_data.reset_index(drop=True)
        self.test_data = test_data.reset_index(drop=True)
        self.test_set = test_data.index

    def fit(self):
        self.encoder = CountVectorizer(stop_words="english", tokenizer=word_tokenize)
        self.bank = self.encoder.fit_transform(self.train_data[self.content_col])

    def recommend(self, idx, topk=10):
        content = self.df.loc[idx, self.content_col]
        code = self.encoder.transform([content])
        dist = cosine_distances(code, self.bank)
        rec_idx = dist.argsort()[0, 1:(topk+1)]
        return self.df.loc[rec_idx]

    def evaluate(self):
        if self.test_set is None:
            print("Split the data first using split_data method.")
            return
    
        predictions = []
        for idx in self.test_set:
            recommendations = self.recommend(idx)
            predictions.append(recommendations.index.tolist())
    
        true_labels = [idx for _ in self.test_set for idx in predictions[0]]
        flat_predictions = [item for sublist in predictions for item in sublist]
        accuracy = accuracy_score(true_labels, flat_predictions)
        print(f"Accuracy: {accuracy * 100:.2f}%")

In [38]:
recsys = RecommenderSystem("data/sponsors.csv", content_col="metadata")

In [39]:
recsys.split_data(test_size=0.2, random_state=42)

In [40]:
recsys.fit()

In [41]:
recsys.evaluate()

Accuracy: 0.81%
