## Solution for Lab 3

Data Mining course <br>
Student: Danis Alukaev <br>
Email: d.alukaev@innopolis.university <br> 
Group: B19-DS-01 <br>

### 0. Prerequisites

In [1]:
import pandas as pd
import numpy as np
import itertools

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import balanced_accuracy_score

import warnings
warnings.filterwarnings("ignore")

### 1. Data Preparation

In [2]:
def load_data(sample=None):
    df = pd.read_csv("./data/completed_orders.csv")
    if sample:
        size = int(len(df) * sample)
        df = df.sample(n=size)
    return df

In [3]:
def get_map():
    data = pd.read_csv("./data/completed_orders.csv")
    df = data.copy()[["StockCode", "Description"]]
    df.drop_duplicates(inplace=True)
    df = df.set_index("StockCode")
    return df

In [4]:
def get_all_combinations(df, sample=None):
    all_stockcodes, all_customers = set(df["StockCode"].to_numpy()), set(df["CustomerID"].to_numpy())
    all_combinations = np.array(list(itertools.product(all_customers, all_stockcodes)))
    all_combinations = pd.DataFrame(data=all_combinations, columns=["CustomerID", "StockCode"])
    if sample:
        size = int(len(all_combinations) * sample)
        all_combinations = all_combinations.sample(n=size)
    return all_combinations

In [5]:
def split_data(df, all_combinations):
    train, test = train_test_split(df, test_size=0.2, shuffle=False)
    train["Reorder"], test["Reorder"] = 1, 1
    train = pd.merge(train, all_combinations, how="outer", on=["CustomerID", "StockCode"]).fillna(0)
    test = pd.merge(test, all_combinations, how="outer", on=["CustomerID", "StockCode"]).fillna(0)
    train.drop_duplicates(inplace=True)
    test.drop_duplicates(inplace=True)
    train = train.sample(frac=1.0)
    test = test.sample(frac=1.0)
    return train, test

In [6]:
df = load_data(sample=0.1)
data = df.copy()[["CustomerID", "StockCode", "Year", "Month", "Day", "Hour", "Holiday", "Country"]]
all_combinations = get_all_combinations(data, sample=0.05)
train, test = split_data(data, all_combinations)

### 2. PCA

In [7]:
def encode_data(df):
    X, y = df[["CustomerID", "StockCode"]], df["Reorder"]
    all_stockcodes, all_customers = set(df["StockCode"].to_numpy()), set(df["CustomerID"].to_numpy())
    
    encoder_customer = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder_stockcodes = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    encoder_customer.fit(np.array(list(all_customers))[..., np.newaxis])
    encoder_stockcodes.fit(np.array(list(all_stockcodes))[..., np.newaxis])
    
    customer_encoded = pd.DataFrame(encoder_customer.transform(X["CustomerID"][..., np.newaxis]))
    customer_encoded.rename(columns={i: f"c{i}" for i in range(len(encoder_customer.categories_[0]))}, inplace=True)
    
    stockcodes_encoded = pd.DataFrame(encoder_stockcodes.transform(X["StockCode"][..., np.newaxis]))
    stockcodes_encoded.rename(columns={i: f"s{i}" for i in range(len(encoder_stockcodes.categories_[0]))}, inplace=True)
    
    X = pd.concat([customer_encoded, stockcodes_encoded], axis=1)
    
    del stockcodes_encoded
    del customer_encoded
    
    return X, y

In [8]:
X_train, y_train = encode_data(train)

In [9]:
X_test, y_test = encode_data(test)

In [10]:
pca = PCA(n_components=6)

X_train = pca.fit_transform(X_train)
X_test = pca.fit_transform(X_test)

In [11]:
pca.explained_variance_

array([0.00260294, 0.00245297, 0.00224153, 0.00222006, 0.0021025 ,
       0.00207038])

### 3. Building Model

In [12]:
model = CatBoostClassifier(iterations=7000,
                           task_type="GPU",
                           learning_rate=0.03,
                           depth=5,
                           devices='0',
                           eval_metric='AUC')

In [None]:
model.fit(X_train,
          y_train,
          verbose=True)

In [14]:
y_pred = model.predict(X_test)

In [15]:
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
result = f"Balanced accuracy: {balanced_accuracy:.2f}"
print(result)

Balanced accuracy: 0.86


### 4. Inference

In [16]:
def predict_cart(customer_id):
    all_stockcodes, all_customers = set(df["StockCode"].to_numpy()), set(df["CustomerID"].to_numpy())
    X = np.array(list(itertools.product([customer_id], all_stockcodes)))
    X = pd.DataFrame(data=X, columns=["CustomerID", "StockCode"])
    
    encoder_customer = OneHotEncoder(handle_unknown='ignore', sparse=False)
    encoder_stockcodes = OneHotEncoder(handle_unknown='ignore', sparse=False)
    
    encoder_customer.fit(np.array(list(all_customers))[..., np.newaxis])
    encoder_stockcodes.fit(np.array(list(all_stockcodes))[..., np.newaxis])
    
    customer_encoded = pd.DataFrame(encoder_customer.transform(X["CustomerID"][..., np.newaxis]))
    customer_encoded.rename(columns={i: f"c{i}" for i in range(len(encoder_customer.categories_[0]))}, inplace=True)
    
    stockcodes_encoded = pd.DataFrame(encoder_stockcodes.transform(X["StockCode"][..., np.newaxis]))
    stockcodes_encoded.rename(columns={i: f"s{i}" for i in range(len(encoder_stockcodes.categories_[0]))}, inplace=True)
    
    X = pd.concat([customer_encoded, stockcodes_encoded], axis=1)
    
    cart_ = model.predict(X)
    
    mapping = get_map()
    
    indices = np.nonzero(cart_)[0]
    
    cart = []
    for idx in indices:
        index = encoder_stockcodes.categories_[0][idx]
        product = mapping.iloc[mapping.index.get_loc(index)]
        if len(product) > 1:
            product = product.iloc[0]

        cart.append(product.Description)
    
    return cart 

In [17]:
predict_cart(15819)

['vintage bead pink evening bag',
 'blue daisy mobile',
 'charlie & lola wastepaper bin flora',
 '6 rocket balloons ',
 'party pizza dish red retrospot',
 'food cover with beads set 2 ']