## Importation des bibliothèques

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Récupération des données d'entraînement et de test

In [None]:
# Load the test set and the training set from CSV files in the local "data" folder.
test_data = pd.read_csv("data/test.csv")
training_data = pd.read_csv("data/training.csv")
# Last expression of the cell: summary statistics of the test set.
test_data.describe()

## Vérification des types de données présents dans training_data

In [None]:
# Inspect which columns the training set provides.
training_data.columns

# Rows with missing values are kept; the dropna step below stays disabled.
# training_data = training_data.dropna(axis =0)

# Choosing the prediction target: the FraudResult column.
y = training_data["FraudResult"]

In [None]:
# FraudResult is the prediction target, so it is removed from the training table.
training_data.drop(columns=['FraudResult'], inplace=True)

# Preview the first rows of what is left.
training_data.head()

In [None]:
# Choosing features: the columns handed to the model as inputs.
training_features = [
    'BatchId',
    'AccountId',
    'CustomerId',
    'ProviderId',
    'ProductId',
    'ProductCategory',
    'ChannelId',
    'Value',
]

X = training_data[training_features]

X.head()

# Clean Data

In [None]:
# X was selected out of training_data; assigning columns on such a selection
# triggers pandas' SettingWithCopyWarning. Work on an explicit, independent copy.
X = X.copy()

# Identifier columns: keep only the text after the last underscore.
id_cols = ['BatchId', 'AccountId','CustomerId','ProductId','ProviderId','ChannelId']
for col in id_cols:
    # Splitting once from the right and taking the last piece yields the same
    # result as x.split("_")[-1], but the vectorised .str accessor lets missing
    # values pass through as NaN instead of raising AttributeError.
    X[col] = X[col].str.rsplit("_", n=1).str[-1]

X.head()

# Building Model

In [None]:
# Split features and target into a training part and a validation part; the
# fixed random_state keeps the split identical from run to run.
train_X, val_X, train_y, val_y = train_test_split(
    X,
    y,
    random_state=0,
)

def score_dataset(train_X, val_X, train_y, val_y):
    """Fit a random forest on the training split and score it on the validation split.

    Args:
        train_X: Feature table used to fit the model.
        val_X: Feature table the fitted model predicts on.
        train_y: Target values that go with ``train_X``.
        val_y: True target values that go with ``val_X``.

    Returns:
        The mean absolute error between ``val_y`` and the model's predictions
        for ``val_X``.
    """
    # Fixed seed so repeated calls on the same data build the same forest.
    forest = RandomForestRegressor(random_state=1).fit(train_X, train_y)
    return mean_absolute_error(val_y, forest.predict(val_X))

# Imputation


In [None]:
# Columns of the training split that contain at least one missing value.
cols_with_missing = [col for col in train_X.columns
                     if train_X[col].isnull().any()]

# SimpleImputer() imputes with the column mean, which only works on numeric
# data. Keep the columns whose training values can all be parsed as numbers;
# a free-text column would otherwise make fit_transform raise ValueError.
numeric_cols = []
for col in train_X.columns:
    try:
        pd.to_numeric(train_X[col])
    except (ValueError, TypeError):
        continue
    numeric_cols.append(col)

# .apply returns new frames, so the original splits are left untouched.
train_X_plus = train_X[numeric_cols].apply(pd.to_numeric)
val_X_plus = val_X[numeric_cols].apply(pd.to_numeric)

# Record where values were missing before they get filled in. The flags are
# read from the original splits so they also cover columns dropped above.
for col in cols_with_missing:
    train_X_plus[col + '_was_missing'] = train_X[col].isnull()
    val_X_plus[col + '_was_missing'] = val_X[col].isnull()

# Imputation: statistics are learned on the training split only and then
# reused for the validation split.
my_imputer = SimpleImputer()
# The imputer returns bare arrays; restore column names and the original row
# labels so the frames stay aligned with train_y / val_y.
imputed_train_X_plus = pd.DataFrame(
    my_imputer.fit_transform(train_X_plus),
    columns=train_X_plus.columns,
    index=train_X_plus.index,
)
imputed_val_X_plus = pd.DataFrame(
    my_imputer.transform(val_X_plus),
    columns=val_X_plus.columns,
    index=val_X_plus.index,
)

score = score_dataset(imputed_train_X_plus, imputed_val_X_plus, train_y, val_y)
print("MAE from Approach 3 (An Extension to Imputation):")
print(score)

## Categorical Variables

In [None]:
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name. Try the current keyword first and fall back to the
# old one so the cell runs on either side of the rename.
try:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# The encoder expects a 2-D input: one column holding the category values.
train_categories = train_X["ProductCategory"].values.reshape(-1, 1)
valid_categories = val_X["ProductCategory"].values.reshape(-1, 1)

# Fit on the training split only; categories first seen in validation are
# encoded as all zeros because of handle_unknown='ignore'.
# One-hot encoding returns bare arrays; put the original row index back so the
# concat below lines rows up correctly.
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(train_categories),
                             index=train_X.index)
OH_cols_valid = pd.DataFrame(OH_encoder.transform(valid_categories),
                             index=val_X.index)

# Remove categorical columns (will replace with one-hot encoding)
num_train_X = train_X.drop("ProductCategory", axis=1)
num_valid_X = val_X.drop("ProductCategory", axis=1)

# Add one-hot encoded columns to numerical features
OH_train_X = pd.concat([num_train_X, OH_cols_train], axis=1)
OH_valid_X = pd.concat([num_valid_X, OH_cols_valid], axis=1)

# The one-hot columns are labelled with integers; make every label a string
# so the frame does not mix label types.
OH_train_X.columns = OH_train_X.columns.astype(str)
OH_valid_X.columns = OH_valid_X.columns.astype(str)

print("MAE from Approach 3 (One-Hot Encoding):")
print(score_dataset(OH_train_X, OH_valid_X, train_y, val_y))
