In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from avp_pckg.DataFrame import AvPdataFrame 
from avp_pckg.avp_model_selection import cross_validate_pipe
from avp_pckg.avp_model_selection import plot_scores, wheels_type_split, print_scores
from avp_pckg.avp_model_selection import PrepareColsBase, PrepareColsTEncoder

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import classification_report, f1_score # accuracy_score, recall_score, precision_score

%load_ext autoreload
%autoreload 2


# Construction of base models with the whole dataset
- decision tree 
- random forest
- logistic regression

## 1. Load data as train and test sets.
The data types of the columns need to be checked. In the current dataset the 'WheelTypeID' column has a mixed data type and should be handled separately.


not solved: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas.

In [None]:
# Load the pre-split train/test features and targets.
# Forward slashes keep the relative paths valid on Windows, Linux and macOS;
# the former 'data\\...' spelling only resolved on Windows.
features_train = pd.read_csv('data/features_train.csv', parse_dates=['PurchDate'], index_col=0)
features_test = pd.read_csv('data/features_test.csv', parse_dates=['PurchDate'], index_col=0)

target_train = pd.read_csv('data/target_train.csv', index_col=0)
target_test = pd.read_csv('data/target_test.csv', index_col=0)

# 'WheelTypeID' arrives with a mixed dtype: normalise to float first, then to
# str, so every value becomes a uniform label ('1.0', '2.0', 'nan', ...).
# Plain column assignment replaces the whole column, whereas
# `.loc[:, col] = ...` writes into the existing column and emits a pandas
# FutureWarning when the new dtype is incompatible.
# The test set receives the identical conversion so that the values seen at
# predict time have the same representation as the ones seen at fit time.
features_train['WheelTypeID'] = features_train['WheelTypeID'].astype(float).astype(str)
features_test['WheelTypeID'] = features_test['WheelTypeID'].astype(float).astype(str)

print(features_train.shape, target_train.shape)
features_train.columns

### 2.1 Select categorical and numerical columns
For the base model all columns are used (except 'PurchDate', which is dropped in the pipeline).

In [None]:
# Columns handled as categorical features.
# The second group was originally tagged as candidates to drop, but it is
# still part of the list, so every column below is passed on as-is.
cols_cat = [
    # first group
    'Auction', 'VehicleAge', 'Make', 'Model', 'Trim', 'SubModel',
    'WheelType', 'BYRNO', 'VNZIP1',
    # second group (drop candidates, currently kept)
    'Nationality', 'IsOnlineSale', 'Transmission', 'Color',
    'TopThreeAmericanName', 'PRIMEUNIT', 'AUCGUART', 'Size', 'VNST',
    'VehYear', 'WheelTypeID',
]

# Columns handled as numerical features (spelling follows the dataset headers).
cols_num = [
    'MMRAcquisitionAuctionAveragePrice',
    'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice',
    'MMRAcquisitonRetailCleanPrice',
    'MMRCurrentAuctionAveragePrice',
    'MMRCurrentAuctionCleanPrice',
    'MMRCurrentRetailAveragePrice',
    'MMRCurrentRetailCleanPrice',
    'VehOdo',
    'VehBCost',
    'WarrantyCost',
]

## 2. Tree-model with whole dataset
cross-validate 

In [None]:
# Cross-validate the decision tree over a grid of `max_depth` values.
# Recorded execution time for this cell: ~34 s.
param_name = 'max_depth'
# Depths 3..10 in steps of one, then 12..18 in steps of two.
param_range = [*range(3, 11), 12, 14, 16, 18]

score_dict = cross_validate_pipe(
    X=features_train,
    y=target_train,
    cols_cat=cols_cat,
    cols_num=cols_num,
    param_name=param_name,
    param_range=param_range,
    cv=5,
    max_cat=25,
    estimator_name='tree',
    n_jobs=-1,
)

# The plot gets an estimator-specific label rather than the bare parameter name.
plot_scores(score_dict, param_name='max_depth tree')
print_scores(score_dict)


## 2. Random Forest-model with whole dataset
cross-validate 

In [None]:
# Cross-validate the random forest over a grid of `max_depth` values.
# Recorded execution time for this cell: ~2 min 12 s.
param_name = 'max_depth'
# Depths 3..10 in steps of one, then 12..18 in steps of two.
param_range = [*range(3, 11), 12, 14, 16, 18]

score_dict = cross_validate_pipe(
    X=features_train,
    y=target_train,
    cols_cat=cols_cat,
    cols_num=cols_num,
    param_name=param_name,
    param_range=param_range,
    cv=5,
    max_cat=25,
    estimator_name='forest',  # other names used in this notebook: 'logistic', 'tree'
    n_jobs=-1,
)

plot_scores(score_dict, param_name=param_name)
print_scores(score_dict)

## 2. Logistic Regression - model with whole dataset
cross-validate 

In [None]:
# Cross-validate logistic regression over a grid of `C` values.
param_name = 'C'
param_range = [
    0.001, 0.002, 0.004,
    0.01, 0.02, 0.04,
    0.1, 1, 10,
]

score_dict = cross_validate_pipe(
    X=features_train,
    y=target_train,
    cols_cat=cols_cat,
    cols_num=cols_num,
    param_name=param_name,
    param_range=param_range,
    cv=5,
    max_cat=25,
    estimator_name='logistic',  # other names used in this notebook: 'forest', 'tree'
    n_jobs=-1,
)

# The grid spans four orders of magnitude, hence the log-scaled x axis.
plot_scores(score_dict, param_name=param_name, xlog=True)
print_scores(score_dict)

Cross-Validation
| model | parameter | f1_cv | precision  | recall | f1-score | support |
|---|---|---|---|---|---|---|
| Tree | depth=4 | 0.376 |- |- | - | - |
| Forest | depth=10 |0.382 | - | - |- | - |
| LogReg | C=0.01 | 0.375 | - | - |- | - |


# Predictions

In [None]:

### Tree model #########################################################
# Fit the decision tree with max_depth=4 (the depth listed in the
# cross-validation summary table) on the full training set and score the test set.
pipe_tree = Pipeline(
    steps=[
        ('base', PrepareColsBase(cols_cat=cols_cat, cols_num=cols_num, max_cat=25).make_pipe()),
        ('model', DecisionTreeClassifier(max_depth=4, class_weight='balanced', random_state=42)),
    ]
)
pipe_tree.fit(features_train, target_train['IsBadBuy'].to_numpy())

# Keep the predictions both as a raw array and as a one-column frame that
# shares the test index, so they can be concatenated with the other models later.
pred_tree = pipe_tree.predict(features_test)
df_pred_tree = pd.DataFrame(pred_tree, index=features_test.index, columns=['tree'])

report = classification_report(target_test, pred_tree)
print('tree report: \n', report)
print('tree pred.sum():', pred_tree.sum())



In [None]:

### Forest model #########################################################
# Fit the random forest with max_depth=10 (the depth listed in the
# cross-validation summary table) on the full training set and score the test set.
pipe_forest = Pipeline(
    steps=[
        ('base', PrepareColsBase(cols_cat=cols_cat, cols_num=cols_num, max_cat=25).make_pipe()),
        ('model', RandomForestClassifier(max_depth=10, class_weight='balanced', random_state=42)),
    ]
)
pipe_forest.fit(features_train, target_train['IsBadBuy'].to_numpy())

# Keep the predictions both as a raw array and as a one-column frame that
# shares the test index, so they can be concatenated with the other models later.
pred_forest = pipe_forest.predict(features_test)
df_pred_forest = pd.DataFrame(pred_forest, index=features_test.index, columns=['forest'])

report = classification_report(target_test, pred_forest)
print('forest report: \n', report)
print('forest pred.sum():', pred_forest.sum())


In [None]:
### Logistic regression model ############################################
# Fit logistic regression with C=0.01 (the value listed in the
# cross-validation summary table) on the full training set and score the test set.
# The step name 'preprocessing' is kept as-is so any lookup by step name still works.
pipe_reg = Pipeline(
    steps=[
        ('preprocessing', PrepareColsBase(cols_cat=cols_cat, cols_num=cols_num, max_cat=25).make_pipe()),
        ('model', LogisticRegression(C=0.01, class_weight='balanced', random_state=42)),
    ]
)
pipe_reg.fit(features_train, target_train['IsBadBuy'].to_numpy())

# Keep the predictions both as a raw array and as a one-column frame that
# shares the test index, so they can be concatenated with the other models later.
pred_reg = pipe_reg.predict(features_test)
df_pred_reg = pd.DataFrame(pred_reg, index=features_test.index, columns=['logistic'])

report = classification_report(target_test, pred_reg)
print('regression report: \n', report)
# Label previously read 'logistig'; corrected to match the column name.
print('logistic pred.sum():', pred_reg.sum())



Cross-validation (f1_cv) and test-set results
| model | parameter | f1_cv | precision  | recall | f1-score | pred.sum() |
|---|---|---|---|---|---|---|
| Tree | depth=4 | 0.376 |0.30 |0.49  |0.37 | 2162 |
| Forest | depth=10 |0.382 | 0.24 |  0.60 |0.35 | 3143 |
| LogReg | C=0.01 | 0.375 | 0.19  | 0.80  |0.30 | 5548 |



## Ensemble

In [None]:
# Put the three models' test-set predictions side by side and count the
# positive votes per row.
df_pred = pd.concat([df_pred_tree, df_pred_forest, df_pred_reg], axis=1)
df_pred['sum'] = df_pred['tree'].add(df_pred['forest']).add(df_pred['logistic'])

# Each result column is round((votes + shift) / 3) cast to int.
# Assuming 0/1 predictions: shift=+1 flags a row once any model votes 1,
# shift=0 needs two votes, shift=-1 needs all three.
vote_shifts = {'result1': 1, 'result2': 0, 'result3': -1}
df_pred = df_pred.assign(**{
    column: ((df_pred['sum'] + shift) / 3).round().astype(int)
    for column, shift in vote_shifts.items()
})

display(df_pred.head())
print(df_pred.sum())

In [None]:
# Print a classification report for each voting rule, from result3 down to result1.
report_headers = (
    ('result3', 'result3 report: \n'),
    ('result2', 'result2: \n'),
    ('result1', 'result1: \n'),
)
for column, header in report_headers:
    report = classification_report(target_test, df_pred[column])
    print(header, report)

Cross-validation (f1_cv) and test-set results, including the ensembles
| model | parameter | f1_cv | precision  | recall | f1-score | pred.sum() |
|---|---|---|---|---|---|---|
| Tree | depth=4 | 0.376 |0.30 |0.49  |0.37 | 2162 |
| Forest | depth=10 |0.382 | 0.24 |  0.60 |0.35 | 3143 |
| LogReg | C=0.01 | 0.375 | 0.19  | 0.80  |0.30 | 5548 |
|Ensemble prec| - |- |0.32|0.48|0.38|1909|
|Ensemble recall| - |- |0.18|0.81|0.30|5752|

Cross-Validation original data set
| model | parameter | f1_cv | precision  | recall | f1-score | pred.sum() |
|---|---|---|---|---|---|---|
| Tree | depth=4 | 0.376 | 0.30 | 0.49  | 0.37 | 2162 |
| Forest | depth=10 |0.382 | 0.24 |  0.60 | 0.35 | 3143 |
| LogReg | C=0.01 | 0.375 | 0.19  | 0.80  | 0.30 | 5548 |
|Ensemble| - |- |0.32 | 0.48 | 0.38 | 1909 |