---
# A. Model Inference
---

---
## A-1. Import Libraries

In [None]:
#Libraries
import pickle
import re
import string
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display

---
## A-2. NLTK Setup

In [None]:
# Fetch the NLTK data packages this notebook asks for. 'stopwords' provides
# the word lists read later through stopwords.words('indonesian').
# NOTE(review): no code in the visible cells uses the 'punkt' package --
# confirm it is still required before keeping the download.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Naufal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Naufal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

---
## A-3. Inferencing

In [None]:
# Load the inference artifacts: the Indonesian stopword set, the saved
# Word2Vec model, and the pickled product DataFrame. Both file paths are
# relative, so they are resolved against the current working directory.
indo_stopwords = set(stopwords.words('indonesian'))
model_w2v = Word2Vec.load('word2vec_model.model')
# NOTE(review): read_pickle unpickles the file, and unpickling can execute
# arbitrary code -- only load a file produced by a trusted source.
# recommended_products reads a 'w2v_vector' column from this DataFrame.
df = pd.read_pickle('skincare_df_w2v.pkl')

# Fungsi preprocessing
def preprocess_text(text):
    """Normalize raw text into a lowercase, space-separated token string.

    Values that ``pd.isna`` reports as missing are returned unchanged.
    Otherwise the text goes through these steps, in order:

    1. Literal backslash-n sequences become spaces and whitespace runs are
       collapsed.
    2. A section keyword written without spaces ('ingredients', 'howtouse',
       'suitable', any letter case) that is glued to a preceding
       alphanumeric character is separated from it by a space.
    3. The text is lowercased; standalone 'x', standalone numbers and
       number+letter tokens (e.g. '50ml') are removed; ASCII punctuation
       is stripped.
    4. Unit words, then Indonesian stopwords (module-level
       ``indo_stopwords``), are dropped.

    Returns the cleaned string, or the original value when it is missing.
    """
    if pd.isna(text):
        return text

    section_markers = ['ingredients', 'how to use', 'suitable']
    unit_words = {'ml', 'gram', 'menit', 'pump', 'pcs'}

    def _detach(match):
        # Re-insert a space between the preceding character and the keyword.
        return f'{match.group(1)} {match.group(2)}'

    cleaned = re.sub(r"\\n", " ", text)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # The alternation uses the keywords with their inner spaces removed.
    compact_markers = '|'.join(marker.replace(' ', '') for marker in section_markers)
    glued_marker = r'(?i)([a-zA-Z0-9])(' + compact_markers + r')'
    cleaned = re.sub(glued_marker, _detach, cleaned).lower()

    for noise in (r'\bx\b', r'\b\d+\b', r'\b\d+[a-zA-Z]+\b'):
        cleaned = re.sub(noise, '', cleaned)
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))

    cleaned = ' '.join(tok for tok in cleaned.split() if tok not in unit_words)

    # NOTE(review): punctuation is already stripped at this point, so ':?'
    # can never match; the '\s*' part removes the whitespace after the
    # keyword, gluing it to the following word ('ingredients water' ->
    # 'ingredientswater'). Kept as-is because the saved model's vocabulary
    # presumably depends on it -- confirm against the training notebook.
    for marker in section_markers:
        cleaned = re.sub(marker + r'\s*:?', marker, cleaned)

    kept = [tok for tok in cleaned.split() if tok not in indo_stopwords]
    return re.sub(r'\s+', ' ', ' '.join(kept)).strip()

# Fungsi rata-rata Word2Vec
def get_avg_w2v(tokens, model):
    """Average the embeddings of the tokens that ``model.wv`` contains.

    Args:
        tokens: iterable of token strings.
        model: object exposing ``wv`` (supports ``in`` and item lookup)
            and ``vector_size``.

    Returns:
        The element-wise mean of the found vectors, or a zero vector of
        length ``model.vector_size`` when none of the tokens is found.
    """
    lookup = model.wv
    found = [lookup[token] for token in tokens if token in lookup]
    if found:
        return np.mean(found, axis=0)
    # No known token: fall back to an all-zero vector.
    return np.zeros(model.vector_size)

# Fungsi rekomendasi (format sama kayak training)
def recommended_products(query, df, model, alpha=0.7, beta=0.15, gamma=0.15, top_n=3):
    """Rank products against a free-text query and return the best per type.

    The score of each product is
    ``alpha * similarity + beta * rating_norm + gamma * review_norm`` where
    ``similarity`` is the cosine similarity between the averaged query
    embedding and the product's ``w2v_vector``, and the two ``*_norm``
    terms are min-max scaled versions of ``rating`` and ``review_count``.

    Args:
        query: free-text query. A missing value (None/NaN) is treated like
            an empty query instead of raising.
        df: product DataFrame. Must contain the columns 'w2v_vector',
            'rating', 'review_count', 'product_type', 'product', 'brand',
            'price', 'description' and 'image_url'. It is not modified.
        model: Word2Vec-style model passed to ``get_avg_w2v``.
        alpha: weight of the similarity term.
        beta: weight of the normalized rating.
        gamma: weight of the normalized review count.
        top_n: maximum number of products kept per product type.

    Returns:
        A DataFrame with a fresh integer index, ordered by 'product_type'
        and, within each type, by descending 'final_score'. Columns:
        product_type, product, brand, price, rating, review_count,
        description, image_url, similarity, final_score.
    """
    df_temp = df.copy()

    # Embed the query. preprocess_text hands missing values back unchanged,
    # so anything that is not a string is treated as "no tokens".
    query_clean = preprocess_text(query)
    query_tokens = query_clean.split() if isinstance(query_clean, str) else []
    query_vec = get_avg_w2v(query_tokens, model).reshape(1, -1)
    X_vectors = np.vstack(df_temp['w2v_vector'].values)

    # Cosine similarity between the query and every product vector.
    df_temp['similarity'] = cosine_similarity(query_vec, X_vectors)[0]

    # Min-max scale the popularity signals so they are comparable.
    df_temp['rating_norm'] = _min_max_normalize(df_temp['rating'])
    df_temp['review_norm'] = _min_max_normalize(df_temp['review_count'])

    # Weighted blend of the three signals.
    df_temp['final_score'] = (
        alpha * df_temp['similarity'] +
        beta * df_temp['rating_norm'] +
        gamma * df_temp['review_norm']
    )

    output_columns = [
        'product_type', 'product', 'brand', 'price', 'rating', 'review_count',
        'description', 'image_url', 'similarity', 'final_score',
    ]

    # Keep the top_n best-scoring rows of each product type.
    ranked = df_temp.sort_values(by='final_score', ascending=False)
    top_per_type = ranked.groupby('product_type', group_keys=False).head(top_n)

    # A stable sort is required here: the default quicksort does not
    # guarantee that the score-descending order inside each type survives.
    return (
        top_per_type
        .sort_values(by='product_type', ascending=True, kind='mergesort')
        .reset_index(drop=True)
        [output_columns]
    )


def _min_max_normalize(values):
    """Min-max scale a numeric Series to [0, 1]; a zero-range Series maps to 0.0."""
    lowest = values.min()
    span = values.max() - lowest
    shifted = values - lowest
    if span == 0:
        # Every value is identical: plain min-max would compute 0/0 and turn
        # the whole column (and therefore every final score) into NaN.
        return shifted.astype(float)
    return shifted / span

In [None]:
# Show the recommendations for a sample Indonesian query ("Banyak jerawat",
# roughly "lots of acne") using the default weights and top_n.
recommended_products("Banyak jerawat", df, model_w2v)

Unnamed: 0,product_type,product,brand,price,rating,review_count,description,image_url,similarity,final_score
0,cleanser,Gentle Skin Cleanser,Cetaphil,112000.0,4.2,15298,Cetaphil Gentle Skin Cleanser mengandung formu...,https://image.femaledaily.com/dyn/210/images/p...,0.544327,0.62245
1,cleanser,Acne Facial Foam,Nourish Beauty Care,30000.0,3.8,119,Nourish Beauty Care (NBC) acne facial foam mem...,https://image.femaledaily.com/dyn/210/images/p...,0.760471,0.601048
2,cleanser,Perfect Purifying Facial Foam,Mineral Botanica,20000.0,4.7,120,Mineral Botanica Perfect Purifying Facial Foam...,https://image.femaledaily.com/dyn/210/images/p...,0.654471,0.588221
3,moisturizer,Cerabeads Moisturising Cream,Cosmo Colony,150280.0,4.9,116,CO.CO Cosmo Colony Cerabeads Moisturising Crea...,https://s3.ap-southeast-1.amazonaws.com/assets...,0.680195,0.619826
4,moisturizer,Hydra Boost Oil Free Moisturizer,ElsheSkin,129000.0,4.7,243,ElsheSkin Hydra Boost Oil-Free Moisturizer mer...,https://image.femaledaily.com/dyn/210/images/p...,0.663392,0.595644
5,moisturizer,Elixskin Nutritive Cream V2.0,Aizen Dermalogy,139000.0,4.0,120,Reformulation in November 2022Aizen Elixskin N...,https://image.femaledaily.com/dyn/210/images/p...,0.730446,0.593677
6,serum/essence,Chromabright Brightening Serum,Everwhite,125000.0,4.8,1286,"Manfaat :Mencerahkan wajah,kulit bening,menyam...",https://image.femaledaily.com/dyn/210/images/p...,0.670959,0.617752
7,serum/essence,Acne Serum,Implora,35000.0,4.1,269,Implora Acne Serum membantu untuk meminimalkan...,https://image.femaledaily.com/dyn/210/images/p...,0.747661,0.613973
8,serum/essence,Bright Miracle Ultimate Clarity Serum,Pond's,108000.0,4.6,1483,POND's Bright Miracle Ultimate Clarity Serum m...,https://s3.ap-southeast-1.amazonaws.com/assets...,0.672443,0.607043
9,sun protection,Hydrasoothe Sunscreen Gel SPF 45+++,Azarine Cosmetic,54000.0,4.8,14814,DAILY SKIN PROTECTION FOR SUNSCREEN HATERSORGA...,https://image.femaledaily.com/dyn/210/images/p...,0.53939,0.655266
