# Preprocessing

### Imports and global params

In [1]:
import warnings
import numpy as np
import pandas as pd

In [2]:
# Silence every warning raised for the rest of the run.
warnings.filterwarnings("ignore")
# Display floats with three digits of precision.
pd.set_option("display.precision", 3)

# Fixed seed for NumPy's global random state.
SEED = 42
np.random.seed(SEED)

### Read data

In [3]:
# Both raw files are read with the same semicolon delimiter.
csv_options = {"delimiter": ";"}
X_train = pd.read_csv("train.csv", **csv_options)
X_test = pd.read_csv("test.csv", **csv_options)

### Drop redundant and empty cols

In [4]:
# Columns removed from both the train and the test frame.
shared_drop = [
    "aggregated_family",
    "category",
    "heel_shape_type",
    "toecap_type",
    "color_name",
    "phase_out",
]
# Columns removed from the train frame only.
train_only_drop = ["year", "weekly_sales", "Production"]
# Columns removed from the test frame only.
test_only_drop = [
    "Unnamed: 28",
    "Unnamed: 29",
    "Unnamed: 30",
    "Unnamed: 31",
    "Unnamed: 32",
]

X_train = X_train.drop(columns=shared_drop + train_only_drop)
X_test = X_test.drop(columns=shared_drop + test_only_drop)

### Fill NaNs

In [5]:
# Columns whose missing values are replaced by the "Undefined" label.
columns_with_nans = [
    "length_type",
    "silhouette_type",
    "waist_type",
    "neck_lapel_type",
    "sleeve_length_type",
    "woven_structure",
    "knit_structure",
    "print_type",
    "archetype",
]

# Apply the same placeholder to the train and the test frame.
for frame in (X_train, X_test):
    frame[columns_with_nans] = frame[columns_with_nans].fillna("Undefined")

### Group entries of same products and obtain total demand

In [6]:
# Grouping keys: rows that agree on all of these are collapsed into one row.
cols = [
    "ID", "id_season", "family", "fabric", "color_rgb", "image_embedding",
    "length_type", "silhouette_type", "waist_type", "neck_lapel_type",
    "sleeve_length_type", "woven_structure", "knit_structure", "print_type",
    "archetype",
    "moment", "phase_in", "life_cycle_length",
    "num_stores", "num_sizes", "has_plus_sizes", "price",
]

# Sum weekly_demand within each group; the keys stay as regular columns.
# NOTE(review): groupby drops rows with a NaN in any key by default - confirm
# that no key outside columns_with_nans can be missing.
grouped = X_train.groupby(by=cols, as_index=False)
X_train = grouped["weekly_demand"].sum()

### Turn embedding to vector

In [7]:
def string_to_array(a: str) -> np.ndarray:
    """Parse a comma-separated string of numbers into a 1-D float array.

    Args:
        a: Text such as ``"0.1,0.2,0.3"``; every comma-separated token must
            be accepted by ``float``.

    Returns:
        Array holding one float per token, in the original order.

    Raises:
        ValueError: If any token is not a valid float (this includes the
            single empty token produced by an empty string).
    """
    return np.array([float(token) for token in a.split(",")], dtype=float)

# Replace the textual embedding in each frame with its parsed array form.
embedding_column = "image_embedding"
X_train[embedding_column] = X_train[embedding_column].map(string_to_array)
X_test[embedding_column] = X_test[embedding_column].map(string_to_array)

### Calculate cannibalism index

In [8]:
def image_distance(a: np.ndarray, b: np.ndarray) -> float:
    """Return the inverse of the Euclidean distance between ``a`` and ``b``.

    Despite the name, the result grows as the two vectors get closer.

    Args:
        a: First vector.
        b: Second vector, broadcastable against ``a``.

    Returns:
        ``1 / ||a - b||``. When the vectors are identical the distance is 0
        and the division yields ``inf`` under NumPy's default error handling.
    """
    diff = np.subtract(a, b)
    squared_norm = np.sum(diff * diff)
    return 1 / np.sqrt(squared_norm)


def family_cannibalism(a, family: list[pd.Series]) -> float:
    """Sum ``image_distance`` between ``a`` and every member of ``family``.

    Args:
        a: Mapping-like row exposing an ``"image_embedding"`` entry.
        family: Rows to compare against; each exposes ``"image_embedding"``.

    Returns:
        The sum of the pairwise ``image_distance`` values; ``0`` when
        ``family`` is empty.
    """
    return sum(
        image_distance(a["image_embedding"], rival["image_embedding"])
        for rival in family
    )


def cannibal(dataset: pd.DataFrame) -> pd.DataFrame:
    """Add a ``cannibalism`` score per row and drop the embedding column.

    Rows are grouped by ``(id_season, family)``. Each row's score is the sum
    of ``image_distance`` between its embedding and the embeddings of all
    rows in the same group that have a different ``ID``. Rows with no such
    rival keep a score of 0.

    Scores are stored by row *position*, so the function works for any index
    (non-default, shuffled or non-integer labels), not only ``RangeIndex``.

    Args:
        dataset: Frame with ``ID``, ``id_season``, ``family`` and
            ``image_embedding`` columns. A ``cannibalism`` column is added
            to it in place.

    Returns:
        A new frame equal to ``dataset`` (including ``cannibalism``) without
        the ``image_embedding`` column.
    """
    # (season, family) -> [(row position, row), ...]
    families: dict[tuple, list[tuple[int, pd.Series]]] = {}
    for position, (_, row) in enumerate(dataset.iterrows()):
        key = (row["id_season"], row["family"])
        families.setdefault(key, []).append((position, row))

    cannibalisms = np.zeros(len(dataset))
    for members in families.values():
        for position, row in members:
            rivals = [other for _, other in members if other["ID"] != row["ID"]]
            # An empty rival list sums to 0, which is already the stored value.
            if rivals:
                cannibalisms[position] = family_cannibalism(row, rivals)

    dataset["cannibalism"] = cannibalisms
    return dataset.drop(["image_embedding"], axis=1)


# Score train and test independently; each loses its embedding column.
X_train, X_test = cannibal(X_train), cannibal(X_test)

### Concatenate datasets to one-hot encode ALL categories (and other joint ops)

In [9]:
# Number of training rows, i.e. the position where the test rows start in
# the combined frame.
TALL = len(X_train)
# ignore_index gives the combined frame a fresh 0..n-1 index; without it the
# train and test row labels would be duplicated in X_total.
X_total = pd.concat([X_train, X_test], ignore_index=True)

In [10]:
def split_rgb(df: pd.DataFrame) -> pd.DataFrame:
    """Expand the ``color_rgb`` text column into three integer columns.

    ``color_rgb`` is split on commas (at most two splits) into ``color_r``,
    ``color_g`` and ``color_b``. The three columns are added to ``df`` itself
    as strings; the integer conversion happens on the returned copy.

    Args:
        df: Frame with a string column ``color_rgb``.

    Returns:
        A new frame with integer ``color_r``/``color_g``/``color_b`` columns
        and without ``color_rgb``.
    """
    channels = ["color_r", "color_g", "color_b"]
    pieces = df["color_rgb"].str.split(pat=",", n=2, expand=True)
    df[channels] = pieces
    typed = df.astype({channel: int for channel in channels})
    return typed.drop(columns=["color_rgb"])

In [11]:
# Columns that get one-hot encoded.
# 165 categories in total -> manageable.
categorical_columns = [
    "family",
    "fabric",
    "length_type",
    "silhouette_type",
    "waist_type",
    "neck_lapel_type",
    "sleeve_length_type",
    "woven_structure",
    "knit_structure",
    "print_type",
    "archetype",
    "moment",
]


def one_hot_encoding(df: pd.DataFrame) -> pd.DataFrame:
    """One-hot encode every column listed in ``categorical_columns``.

    The first level of each encoded column is dropped (``drop_first=True``).

    Args:
        df: Frame containing all of ``categorical_columns``.

    Returns:
        A new frame where those columns are replaced by indicator columns.
    """
    encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)
    return encoded

In [12]:
# One-hot encode the categorical columns first, then expand the RGB column.
X_total = split_rgb(one_hot_encoding(X_total))

def year(s):
    """Return characters 6-9 of ``s`` as an int.

    Presumably the year of a ``DD?MM?YYYY``-style date string.
    """
    return int(s[6:10])


def month(s):
    """Return characters 3-4 of ``s`` as an int.

    Presumably the month of a ``DD?MM?YYYY``-style date string.
    """
    return int(s[3:5])


def day(s):
    """Return characters 0-1 of ``s`` as an int.

    Presumably the day of a ``DD?MM?YYYY``-style date string.
    """
    return int(s[:2])

# Derive one numeric column per date component from phase_in, then discard
# the original text column.
for part_name, extract in (("year", year), ("month", month), ("day", day)):
    X_total[part_name] = X_total["phase_in"].map(extract)

X_total = X_total.drop(columns=["phase_in"])

### Apply log on prices to normalize

In [13]:
# Replace price by its natural logarithm.
log_price = np.log(X_total["price"])
X_total["price"] = log_price

### Split again, drop target on test and save

In [14]:
# The first TALL rows are the training rows; everything after is test.
X_train, X_test = X_total.iloc[:TALL, :], X_total.iloc[TALL:, :]

In [15]:
# The test frame must not carry the target column.
X_test = X_test.drop(columns=["weekly_demand"])

# Write both frames as semicolon-separated CSV files without the row index.
for frame, path in ((X_train, "train_clean.csv"), (X_test, "test_clean.csv")):
    frame.to_csv(path, index=False, sep=";")