# I. Import

In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

In [2]:
# Serialization
import pickle
import json

In [3]:
import datetime as dt

In [4]:
# Silence every Python warning from this point on.
# NOTE: this also hides pandas' chained-assignment (SettingWithCopy) warnings, so
# accidental writes to a DataFrame slice further down will not be reported.
import warnings
warnings.filterwarnings('ignore') # no category/module arguments, so the filter applies to all warnings

---

# II. Load

In [5]:
# All artifacts are read from the Streamlit/ folder, relative to the current working
# directory. Forward slashes are used as the separator because they are accepted on
# Windows as well as on Linux/macOS, whereas a backslash is a separator on Windows only.
#
# NOTE(security): pickle.load can execute arbitrary code stored in the file, so only
# load .pkl artifacts that come from a trusted source.

# Load the trained model
with open('Streamlit/clustering_model.pkl', 'rb') as f:
    clustering_model = pickle.load(f)

# Load the fitted StandardScaler
with open('Streamlit/clustering_scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

# Load the fitted pca
with open('Streamlit/pca.pkl', 'rb') as f:
    pca = pickle.load(f)

# Load the column lists; the .txt file holds a JSON object, so it is decoded as UTF-8
with open('Streamlit/clustering_columns.txt', 'r', encoding='utf-8') as infile:
    data = json.load(infile)

# Extract the numeric and categorical column-name lists from the dictionary
num_col = data['num_col']
cat_col = data['cat_col']

In [6]:
# Single hand-written customer record used as the inference input.
# It is wrapped in a list so pandas builds a one-row DataFrame whose columns
# follow the key order below.
data_inf = pd.DataFrame([
    {
        'Unnamed: 0': 0,
        'id': 5524,
        'year_birth': 1957,
        'education': 'Graduation',
        'marital_status': 'Single',
        'income': 58138.0,
        'kidhome': 0,
        'teenhome': 0,
        'dt_customer': '04-09-2012',
        'recency': 58,
        'mnt_wines': 635,
        'mnt_fruits': 88,
        'mnt_meat_products': 546,
        'mnt_fish_products': 172,
        'mnt_sweet_products': 88,
        'mnt_gold_prods': 88,
        'num_deals_purchases': 3,
        'num_web_purchases': 8,
        'num_catalog_purchases': 10,
        'num_store_purchases': 4,
        'num_web_visits_month': 7,
        'accepted_cmp3': 0,
        'accepted_cmp4': 0,
        'accepted_cmp5': 0,
        'accepted_cmp1': 0,
        'accepted_cmp2': 0,
        'complain': 0,
        'response': 0,
    }
])

---

# III. Feature Engineering

## 1. Feature Creation

In [7]:
# Total spend: add up the six per-category amount columns, left to right
spend_columns = [
    "mnt_wines",
    "mnt_fruits",
    "mnt_meat_products",
    "mnt_fish_products",
    "mnt_sweet_products",
    "mnt_gold_prods",
]
# The built-in sum starts from the first column and adds the rest with `+`,
# which is the same chain of Series additions written out by hand.
data_inf["total_mnt"] = sum(
    (data_inf[name] for name in spend_columns[1:]),
    data_inf[spend_columns[0]],
)

In [8]:
# Keep only customers born in 1928 or later: 1928 is the first year covered by the
# generation ranges defined below, so earlier birth years could not be labelled.
# .copy() makes the result an independent frame, so the columns added to it in the
# following cells are written to a real DataFrame rather than to a slice of the
# original (pandas' warning about that is silenced by the global warnings filter).
data_inf = data_inf.loc[data_inf["year_birth"] >= 1928].copy()

# Birth-year range for each generation label; both bounds are inclusive
generations = {
    "Silent Generation": (1928, 1945),
    "Baby Boomers": (1946, 1964),
    "Generation X": (1965, 1980),
    "Millennials": (1981, 1996)
}


def assign_generation(year):
    """Return the generation label whose birth-year range contains ``year``.

    Parameters
    ----------
    year : number
        Birth year to classify.

    Returns
    -------
    str or None
        The matching key of ``generations``, or ``None`` when ``year`` lies
        outside every range (before 1928 or after 1996).
    """
    matching_labels = (
        label
        for label, (first_year, last_year) in generations.items()
        if first_year <= year <= last_year
    )
    # The ranges do not overlap, so at most one label can match.
    return next(matching_labels, None)

# Label every row with its generation, derived element-wise from year_birth
birth_years = data_inf["year_birth"]
data_inf["generation"] = birth_years.map(assign_generation)

In [9]:
# Parse dt_customer from day-month-year strings into datetimes
signup_dates = pd.to_datetime(data_inf["dt_customer"], format="%d-%m-%Y")
data_inf["dt_customer"] = signup_dates

# customer_since: whole days between dt_customer and the fixed reference date 2015-01-01
reference_date = dt.datetime(2015, 1, 1)
data_inf["customer_since"] = (reference_date - signup_dates).dt.days

In [10]:
# Columns that carry no information for the model: the row counter, the customer id,
# the raw date (already turned into customer_since) and the response flag
unused_columns = ["Unnamed: 0", "id", "dt_customer", "response"]
data_inf = data_inf.drop(columns=unused_columns)

## 2. Split Data

In [11]:
# Split the features into the numeric and categorical column groups loaded from
# clustering_columns.txt. Each subset is taken as an explicit copy: data_cat is
# overwritten column by column during encoding, and writing into a plain selection
# of data_inf is what triggers pandas' chained-assignment warning.
data_num = data_inf[num_col].copy()
data_cat = data_inf[cat_col].copy()

---

# IV. Encode and Scale

In [12]:
def encoder(df):
    """Encode the categorical columns of ``df`` as integer codes.

    The input frame is left untouched; the encoding is applied to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``education``, ``marital_status``,
        ``accepted_cmp1`` .. ``accepted_cmp5``, ``complain`` and ``generation``.
        Any other columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the columns above replaced by their codes.
        A value that has no entry in the corresponding mapping becomes NaN.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``df``.
    """
    # Define the mappings for each variable
    education_mapping = {'PhD': 0, 'Basic': 1, 'Graduation': 2, 'Master': 3, '2n Cycle': 4}
    marital_status_mapping = {'Together': 0, 'Married': 1, 'Single': 2, 'Divorced': 3, 'Widow': 4, 'Alone': 5, 'YOLO': 6, 'Absurd': 7}
    binary_mapping = {0: 0, 1: 1}
    generation_mapping = {'Silent Generation':0, 'Baby Boomers': 1, 'Millennials': 2, 'Generation X': 3}

    # Which mapping applies to which column
    column_mappings = {
        'education': education_mapping,
        'marital_status': marital_status_mapping,
        'accepted_cmp1': binary_mapping,
        'accepted_cmp2': binary_mapping,
        'accepted_cmp3': binary_mapping,
        'accepted_cmp4': binary_mapping,
        'accepted_cmp5': binary_mapping,
        'complain': binary_mapping,
        'generation': generation_mapping,
    }

    # Work on a copy so the caller's frame (often a slice of a larger frame) is not modified
    encoded = df.copy()
    for column, mapping in column_mappings.items():
        encoded[column] = encoded[column].map(mapping)

    return encoded

# Encode the categorical columns of the inference record
data_cat_encoded = encoder(data_cat)

In [13]:
# Scale the numeric features with the already-fitted scaler loaded above (transform only, no re-fitting)
data_num_scaled = scaler.transform(data_num)

In [14]:
# Re-attach the numeric column names to the scaled array, then project it with the loaded PCA.
# NOTE(review): presumably the names are restored so they match the feature names
# the PCA saw when it was fitted — confirm against the training notebook.
data_num_scaled_df = pd.DataFrame(data_num_scaled, columns=num_col)
data_reduced = pca.transform(data_num_scaled_df)

In [1]:
# Build the model input: PCA components first, encoded categorical columns after them.
# This cell needs `np` from the import cell at the top of the notebook; after a kernel
# restart run the notebook from the first cell, otherwise it fails with a NameError.
data_final = np.hstack((data_reduced, data_cat_encoded))

NameError: name 'np' is not defined

---

# V. Inference

In [16]:
# Column positions 13..21 of data_final are passed to the model as the categorical features.
# NOTE(review): this assumes the PCA output occupies columns 0..12 and that the nine
# encoded categorical columns follow it — confirm against the fitted PCA and cat_col.
categorical_positions = list(range(13, 22))
data_final_pred = clustering_model.predict(data_final, categorical=categorical_positions)
data_final_pred

array([2], dtype=uint16)