# **1) Import libraries**

In [None]:
import sys
import time
import random
import concurrent.futures
import pandas as pd

from XGBClassifier_experience import XGBClassifier_experience

import mlflow

from sklearn.decomposition import FastICA, KernelPCA

# **2) Read data**

### **Connect to google drive**

In [None]:
# Colab-only step: mount the user's Google Drive so the CSV files below are
# reachable through the local filesystem.
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [None]:
# Both datasets live in the same Drive folder; name it once so that changing
# the location means editing a single line.
DATA_DIR = "/content/gdrive/MyDrive/Projects_data/Use_Case"

train_path = f"{DATA_DIR}/data_train_final.csv"
test_path = f"{DATA_DIR}/data_test_final.csv"

### **Read data**

In [None]:
# Load the train and test splits from the CSV files on the mounted drive.
train_data = pd.read_csv(filepath_or_buffer=train_path)
test_data = pd.read_csv(filepath_or_buffer=test_path)

### **Show Top 5 Records**

### **-- Training --**

In [None]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,product_id,category_id,f0,f1,f2,f3,f4,f5,f6,f7,...,f118,f119,f120,f121,f122,f123,f124,f125,f126,f127
0,cdc53c8374034e8d41528375097a51c087390292,2081,-0.006559,-0.120659,-0.004383,0.007749,-0.037135,-0.039693,-0.000479,-0.009981,...,0.012074,-0.002464,-0.005398,0.002239,-0.010406,-0.008286,0.007256,0.003646,0.007139,-0.011185
1,7faed7269ff5e13ced9245d23e88cb17e8b4c91b,9820,0.101763,-0.018803,-0.027987,-0.043492,0.096025,0.016519,0.010036,-0.015817,...,-0.001446,-0.00249,-0.006619,-0.018201,-0.025073,-0.019518,0.002237,-0.003276,-0.000349,0.008102
2,c307ce173cac526708f00f509d5250af69e2adcf,15651,-0.040672,-0.078895,-0.051821,-0.011871,-0.020762,0.041655,-0.059276,-0.022324,...,0.005634,-0.000398,0.003627,0.019286,0.005602,-0.009435,0.003241,-0.014601,-0.01998,0.015251
3,49b52f955911fafc4ddc136debb974070568551d,3056,-0.10592,0.043751,0.004357,0.085637,0.011778,-0.019016,-0.083128,0.025109,...,-0.020314,0.001298,0.012358,-0.006352,0.000958,-0.006532,-0.007828,-0.016894,0.004076,0.001585
4,0999ed1b31922ffee4f82526c5069f70d231960e,6933,-0.008389,-0.028736,-0.005999,0.047041,0.011757,-0.037296,-0.033031,0.056991,...,-0.007202,0.006544,-0.006674,0.004217,-0.000845,0.004107,0.001433,-0.003491,-0.016566,0.019101


### **-- Testing --**

In [None]:
# Preview the first five testing rows.
test_data.head(n=5)

Unnamed: 0,product_id,category_id,f0,f1,f2,f3,f4,f5,f6,f7,...,f118,f119,f120,f121,f122,f123,f124,f125,f126,f127
0,ff46a565a5f4b7e6798159d508b08186b9f1f86b,3076,-0.112457,-0.014667,0.045303,-0.060092,0.037064,-0.036514,-0.032242,0.010361,...,-0.008942,-0.015334,0.018582,-0.002193,0.014737,-0.004768,-0.013888,0.001604,0.00435,0.001738
1,28c72b330a43a23bdb84cb82d8fcdc5b3b50faaa,15723,-0.092445,-0.099975,-0.044086,-0.067426,0.000879,0.095024,-0.054753,-0.007339,...,0.003764,0.01181,-0.000829,-0.003894,-0.005821,-0.004236,-0.000549,-0.002208,0.001389,-0.011216
2,d7b9f39d5980edf43a0a519de487dd59fa54a8dc,7903,0.0599,-0.036213,-0.036249,0.001033,-0.01268,-0.02829,0.072718,-0.017314,...,-0.014133,0.005175,-0.015798,-0.008871,0.006316,-0.018845,-0.018156,-0.005962,-0.018345,-0.004431
3,b98194bccb443fc44fecaffdce3849d8a5c344a0,15102,0.149263,-0.127782,0.074531,0.026073,-0.086876,-0.012713,0.037158,-0.021367,...,-0.016751,0.001955,-0.0138,0.017624,0.000618,-0.003724,-0.002036,0.001842,-0.009183,0.007981
4,c45d8d2f78debb9ea56cacbfe49f03afc03be5cd,8931,0.050621,0.034014,-0.069493,0.000413,-0.019597,0.019977,-0.027061,0.016136,...,0.003482,-0.006199,-0.021119,0.009234,-0.012708,0.005795,0.006701,0.016876,0.008152,-0.000834


# **3) Dataset preparation**

In [None]:
# Remove the `product_id` column from both splits; it is dropped before the
# feature matrices are built, so it is never used as a model input.
#
# The frames are rebound instead of mutated in place, and `errors="ignore"`
# makes the cell idempotent: running it a second time no longer raises
# KeyError because the column is already gone.
train_data = train_data.drop(columns="product_id", errors="ignore")
test_data = test_data.drop(columns="product_id", errors="ignore")

### **Transform data (represent "category_id" by values between 0-100)**

In [None]:
# Build the lookup {raw category_id -> contiguous class index}.
# Indices follow the ascending order of the distinct ids seen in the
# training split, starting at 0.
category_id_list = sorted(set(train_data["category_id"].values))
dic = {category: position for position, category in enumerate(category_id_list)}

In [None]:
# Encode the raw category ids as the contiguous class indices stored in `dic`.
#
# The result of `replace` is assigned back to the column explicitly. The
# previous form, `df[col].replace(dic, inplace=True)`, operates on an
# intermediate Series (chained assignment through an in-place method); pandas
# warns about it, and it no longer updates the DataFrame once Copy-on-Write
# is enabled.

# Transformation for training data
train_data["category_id_target"] = train_data["category_id"].replace(dic)
# Transformation for testing data (ids that are not keys of `dic` are left
# unchanged by `replace`)
test_data["category_id_target"] = test_data["category_id"].replace(dic)

In [None]:
# Features: every column except the raw and the encoded category id.
X_train = train_data.drop(columns=["category_id", "category_id_target"])
X_test = test_data.drop(columns=["category_id", "category_id_target"])

# Targets: the encoded category index.
y_train = train_data["category_id_target"]
y_test = test_data["category_id_target"]

### **Divide data into explanatory variables and target variables**

In [None]:
# NOTE(review): this repeats the training split computed in the previous code
# cell. `Y_train` (capital Y) selects the same column as `y_train`.
Y_train = train_data["category_id_target"]
X_train = train_data.drop(columns=["category_id", "category_id_target"])

# **4) Dimensionality reduction**

### **4.1) Use KPCA (Kernel Principal component analysis)**

### **Define the reducer model**

In [None]:
# NOTE(review): disabled KernelPCA experiment, kept for reference only. The
# reduced datasets written by this notebook come from FastICA (section 4.2).
# KernelPCA_reducer = KernelPCA(n_components=12, kernel='rbf')
# KernelPCA_reducer = KernelPCA_reducer.fit(X_train)

# X_train_reduced = KernelPCA_reducer.transform(X_train)

### **Save the reducer model**

In [None]:
# NOTE(review): disabled. `save_object` is not imported in the cells visible
# here, and `KernelPCA_reducer` is only created by the commented-out cell
# above. The ".\" path prefix is Windows-style. Confirm all three before
# re-enabling.
# save_object(
#               file_path = ".\KernelPCA_reducer.pkl",
#               obj = KernelPCA_reducer
#             )

### **4.2) Use FastICA (Fast algorithm for Independent Component Analysis)**

### **Define the reducer model**

In [None]:
# Configure FastICA to extract 12 independent components; `random_state` is
# fixed so the decomposition is reproducible across runs.
FastICA_reducer = FastICA(
    n_components=12,
    whiten='unit-variance',
    whiten_solver="eigh",
    random_state=0,
)

# Fit on the training features only. `fit` returns the estimator itself, so
# `FastICA_reducer_train` and `FastICA_reducer` refer to the same fitted object.
FastICA_reducer_train = FastICA_reducer.fit(X_train)

# Project the training features onto the learned components.
X_reduced = FastICA_reducer_train.transform(X_train)

### **Save the reducer model**

In [None]:
# NOTE(review): disabled. `save_object` is not imported in the cells visible
# here, and the ".\" path prefix is Windows-style. Confirm both before
# re-enabling.
# save_object(
#               file_path = ".\FastICA_reducer.pkl",
#               obj = FastICA_reducer
#             )

### **Reduce training data**

In [None]:
# Wrap the reduced training array in a DataFrame.
# - Column names are generated from the array width ("Col_1" ... "Col_n"), so
#   they stay correct if the reducer's `n_components` is changed.
# - The index is taken from `X_train`, so the column-wise concat below lines
#   rows up with `y_train` by construction rather than relying on both
#   objects happening to share a default 0..n-1 index.
train_component_names = [f"Col_{k}" for k in range(1, X_reduced.shape[1] + 1)]
df_X_train_reduced = pd.DataFrame(X_reduced, columns=train_component_names, index=X_train.index)

# Concatenate the reduced features with the encoded target and persist them.
Reduced_df = pd.concat([df_X_train_reduced, y_train], axis=1)
Reduced_df.to_csv("Train_Reducer_data_FastICA.csv", index=False)

### **Show training data reduced**

In [None]:
# Preview the first five rows of the reduced frame built in the previous cell.
Reduced_df.head(n=5)

Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,Col_11,Col_12,category_id_target
0,0.814944,0.244146,-0.180644,-0.203664,0.360619,0.586401,0.015962,0.433798,0.402685,0.750185,0.83173,1.096302,9
1,1.189277,0.143912,-0.365877,0.785686,0.05564,-0.516515,0.754084,0.118488,0.621098,-1.549047,-0.047178,-0.09666,69
2,0.438398,-0.964076,-1.03071,0.999469,0.004909,0.142187,-2.241106,-0.178291,1.097998,0.554944,0.242809,0.722505,95
3,-2.666719,-0.411787,-0.895022,0.910835,0.561998,0.165517,-0.834354,0.172831,0.590912,0.070956,0.182319,0.19886,32
4,-0.775743,1.312365,0.802273,0.157242,-0.320334,0.657127,0.673763,-0.851504,1.30447,0.096695,0.926726,0.932637,53


### **Reduce testing data**

In [None]:
# Project the test features with the already-fitted reducer (no re-fitting on
# test data).
X_test_reduced = FastICA_reducer.transform(X_test)

# Wrap the reduced test array in a DataFrame.
# - Column names are generated from the array width ("Col_1" ... "Col_n"), so
#   they stay correct if the reducer's `n_components` is changed.
# - The index is taken from `X_test`, so the column-wise concat below lines
#   rows up with `y_test` by construction rather than relying on both objects
#   happening to share a default 0..n-1 index.
test_component_names = [f"Col_{k}" for k in range(1, X_test_reduced.shape[1] + 1)]
df_X_test_reduced = pd.DataFrame(X_test_reduced, columns=test_component_names, index=X_test.index)

# Concatenate the reduced features with the encoded target and persist them.
# NOTE: this rebinds `Reduced_df`, which previously held the training frame.
Reduced_df = pd.concat([df_X_test_reduced, y_test], axis=1)
Reduced_df.to_csv("Test_Reducer_data_FastICA.csv", index=False)

### **Show testing data reduced**

In [None]:
# Preview the first five rows of the reduced frame built in the previous cell.
Reduced_df.head(n=5)

Unnamed: 0,Col_1,Col_2,Col_3,Col_4,Col_5,Col_6,Col_7,Col_8,Col_9,Col_10,Col_11,Col_12,category_id_target
0,-0.270273,-0.01578,-0.762243,1.026619,0.951245,0.35801,0.580234,-0.498713,-0.998046,-0.027173,-0.235455,0.406666,34
1,0.439611,-1.784127,-1.427157,1.164137,0.455465,0.72347,0.163874,-1.010466,1.522438,0.386348,-0.166035,-0.650022,97
2,1.432367,0.78355,0.37654,-0.371456,-0.186629,-0.220172,0.202652,1.030547,-0.008097,0.264027,0.5749,-0.095532,61
3,0.365668,-0.431534,-0.231338,-2.596871,0.17538,0.344771,1.435319,-0.251593,0.941133,0.22312,2.01556,-0.126942,88
4,0.051997,0.025536,-0.714718,0.58297,-1.05434,-0.429479,0.966974,0.127578,1.611947,0.468002,0.221916,0.147689,66
