In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from avp_pckg.DataFrame import AvPdataFrame 
from avp_pckg.avp_model_selection import cross_validate_pipe
from avp_pckg.avp_model_selection import plot_scores, print_scores, wheels_type_split
from avp_pckg.IsBadBuy_functions import load_features, calc_price_diff
from avp_pckg.avp_model_selection import PrepareColsBase, PrepareColsTEncoder

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import classification_report, f1_score # accuracy_score, recall_score, precision_score

%load_ext autoreload
%autoreload 2


# Construction of base models. Strongly correlated price columns are replaced by their differences.
- tree
- random forest
- logistic regression

## 1. Load data as train and test sets.

In [None]:
# Categorical columns requested from the raw feature files, grouped by the
# author's own annotations.
cols_cat = (
    ['Auction', 'VehicleAge', 'WheelType']
    # buyer info
    + ['BYRNO', 'VNZIP1']
    # model info
    + ['Make', 'Model', 'Trim', 'SubModel']
    # model info, low information
    + ['Color', 'PRIMEUNIT', 'AUCGUART', 'Size']
    # low information
    + ['IsOnlineSale', 'Transmission']
    # redundant information
    + ['Nationality', 'TopThreeAmericanName']
    # redundant information
    + ['VNST', 'VehYear', 'WheelTypeID']
)

# Numerical columns requested from the raw feature files: the eight MMR price
# columns followed by odometer, cost and warranty columns.
# NOTE(review): 'MMRAcquisitonRetailCleanPrice' is kept exactly as written;
# presumably it matches the CSV header -- verify before "fixing" the spelling.
cols_num = (
    [
        'MMRAcquisitionAuctionAveragePrice',
        'MMRAcquisitionAuctionCleanPrice',
        'MMRAcquisitionRetailAveragePrice',
        'MMRAcquisitonRetailCleanPrice',
        'MMRCurrentAuctionAveragePrice',
        'MMRCurrentAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice',
        'MMRCurrentRetailCleanPrice',
    ]
    + ['VehOdo', 'VehBCost', 'WarrantyCost']
)

In [None]:
### Load the train and test splits.
# Per the author's note, some feature columns are removed during loading and
# NaNs are filled; calc_price_diff then replaces the price columns with
# price-difference columns.
# Paths use forward slashes so the notebook runs on Windows and POSIX alike
# (the previous 'data\\...' form only resolves on Windows).
fname = 'data/features_train.csv'
X_train = load_features(fname=fname, cols_cat=cols_cat, cols_num=cols_num)
X_train = calc_price_diff(X_train)
tname = 'data/target_train.csv'
y_train = pd.read_csv(tname, index_col='RefId')

fname = 'data/features_test.csv'
X_test = load_features(fname=fname, cols_cat=cols_cat, cols_num=cols_num)
X_test = calc_price_diff(X_test)
tname = 'data/target_test.csv'
y_test = pd.read_csv(tname, index_col='RefId')

# Only the last expression of a cell is rendered, so a bare `X_train.head()`
# in the middle of the cell was silently discarded. Show both previews.
display(X_train.head(), X_test.head())




### 2.1 Select categorical and numerical columns


In [None]:
# Inspect the column names of X_train as returned by load_features + calc_price_diff.
X_train.columns

In [None]:
# Categorical columns passed to the modelling helpers below.
cols_cat = (
    # labelled "cols to use" by the author
    [
        'Auction', 'VehicleAge',
        'Make', 'Model', 'Trim', 'SubModel',
        'WheelType', 'BYRNO', 'VNZIP1',
    ]
    # labelled "cols to drop" by the author; they are still part of this list
    + [
        'Nationality', 'IsOnlineSale', 'Transmission', 'Color',
        'TopThreeAmericanName', 'PRIMEUNIT', 'AUCGUART', 'Size',
        'VNST', 'VehYear', 'WheelTypeID',
    ]
)

# Numerical columns: four price-difference columns followed by the
# odometer, cost and warranty columns.
cols_num = (
    ['RetailClean', 'AcqClean', 'AcqRetail', 'AcqAuc']
    + ['VehOdo', 'VehBCost', 'WarrantyCost']
)

## 2. Tree model
Cross-validation over `max_depth`.

In [None]:
# Cross-validate the 'tree' estimator over max_depth.
# Run time recorded by the author: 34 s.
param_name = 'max_depth'
param_range = [*range(3, 11), 12, 14, 16, 18]

score_dict = cross_validate_pipe(
    X=X_train,
    y=y_train,
    cols_cat=cols_cat,
    cols_num=cols_num,
    param_name=param_name,
    param_range=param_range,
    cv=5,
    max_cat=25,
    estimator_name='tree',
    n_jobs=-1,
)

# The plot label is spelled out here instead of reusing param_name.
plot_scores(score_dict, param_name='max_depth tree')
print_scores(score_dict)


## 2. Random forest model with whole dataset
Cross-validation over `max_depth`.

In [None]:
# Cross-validate the 'forest' estimator over max_depth.
# Run time recorded by the author: 2m 12s.
param_name = 'max_depth'
param_range = [*range(3, 11), 12, 14, 16, 18]

score_dict = cross_validate_pipe(
    X=X_train,
    y=y_train,
    cols_cat=cols_cat,
    cols_num=cols_num,
    param_name=param_name,
    param_range=param_range,
    cv=5,
    max_cat=25,
    # the author lists 'forest', 'logistic' and 'tree' as the available names
    estimator_name='forest',
    n_jobs=-1,
)

plot_scores(score_dict, param_name=param_name)
print_scores(score_dict)

## 2. Logistic regression model with whole dataset
Cross-validation over `C`.

In [None]:
# Cross-validate the 'logistic' estimator over C.
param_name = 'C'
param_range = [
    0.001, 0.002, 0.004,
    0.01, 0.02, 0.04,
    0.1, 1, 10,
]

score_dict = cross_validate_pipe(
    X=X_train,
    y=y_train,
    cols_cat=cols_cat,
    cols_num=cols_num,
    param_name=param_name,
    param_range=param_range,
    cv=5,
    max_cat=25,
    # the author lists 'forest', 'logistic' and 'tree' as the available names
    estimator_name='logistic',
    n_jobs=-1,
)

# C spans several orders of magnitude, hence xlog=True for the plot.
plot_scores(score_dict, param_name=param_name, xlog=True)
print_scores(score_dict)

Cross-validation with price differences
| model | parameter | f1_cv | precision  | recall | f1-score | support |
|---|---|---|---|---|---|---|
| Tree | depth=5 | 0.356 |- |- | - | - |
| Forest | depth=7 |0.379 | - | - |- | - |
| LogReg | C=0.01 | 0.375 | - | - |- | - |


# Predictions

In [None]:

### Tree model #########################################################
# Preprocessing built by PrepareColsBase, followed by a decision tree with
# max_depth=5 (the depth listed for the tree in the cross-validation summary).
pipe_tree = Pipeline(steps=[
    ('base', PrepareColsBase(cols_cat=cols_cat, cols_num=cols_num, max_cat=25).make_pipe()),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42, max_depth=5)),
])

pipe_tree.fit(X_train, y_train['IsBadBuy'].to_numpy())

pred_tree = pipe_tree.predict(X_test)
# Keep the predictions on the test index so they can be joined with the
# other models' predictions later.
df_pred_tree = pd.DataFrame({'tree': pred_tree}, index=X_test.index)

# Score against the same 'IsBadBuy' column the model was fitted on, rather
# than handing the whole target DataFrame to classification_report.
# NOTE(review): assumes target_test.csv has the same 'IsBadBuy' column as
# target_train.csv -- confirm.
report = classification_report(y_test['IsBadBuy'], pred_tree)
print('tree report: \n', report)
print('tree pred.sum():', pred_tree.sum())



In [None]:

### Forest model #########################################################
# Preprocessing built by PrepareColsBase, followed by a random forest with
# max_depth=7 (the depth listed for the forest in the cross-validation summary).
pipe_forest = Pipeline(steps=[
    ('base', PrepareColsBase(cols_cat=cols_cat, cols_num=cols_num, max_cat=25).make_pipe()),
    ('model', RandomForestClassifier(class_weight='balanced', random_state=42, max_depth=7)),
])

pipe_forest.fit(X_train, y_train['IsBadBuy'].to_numpy())

pred_forest = pipe_forest.predict(X_test)
# Keep the predictions on the test index so they can be joined with the
# other models' predictions later.
df_pred_forest = pd.DataFrame({'forest': pred_forest}, index=X_test.index)

# Score against the same 'IsBadBuy' column the model was fitted on, rather
# than handing the whole target DataFrame to classification_report.
# NOTE(review): assumes target_test.csv has the same 'IsBadBuy' column as
# target_train.csv -- confirm.
report = classification_report(y_test['IsBadBuy'], pred_forest)
print('forest report: \n', report)
print('forest pred.sum():', pred_forest.sum())


In [None]:
### Logistic regression model ############################################
# Preprocessing built by PrepareColsBase, followed by a logistic regression
# with C=0.01 (the value listed for LogReg in the cross-validation summary).
pipe_reg = Pipeline(steps=[
    ('preprocessing', PrepareColsBase(cols_cat=cols_cat, cols_num=cols_num, max_cat=25).make_pipe()),
    ('model', LogisticRegression(class_weight='balanced', random_state=42, C=0.01)),
])

pipe_reg.fit(X_train, y_train['IsBadBuy'].to_numpy())

pred_reg = pipe_reg.predict(X_test)
# Keep the predictions on the test index so they can be joined with the
# other models' predictions later.
df_pred_reg = pd.DataFrame({'logistic': pred_reg}, index=X_test.index)

# Score against the same 'IsBadBuy' column the model was fitted on, rather
# than handing the whole target DataFrame to classification_report.
# NOTE(review): assumes target_test.csv has the same 'IsBadBuy' column as
# target_train.csv -- confirm.
report = classification_report(y_test['IsBadBuy'], pred_reg)
print('regression report: \n', report)
print('logistic pred.sum():', pred_reg.sum())



Cross-validation with price differences
| model | parameter | f1_cv | precision  | recall | f1-score | pred.sum() |
|---|---|---|---|---|---|---|
| Tree | depth=5 | 0.356 |0.31 |0.46 | 0.37 | 1945 |
| Forest | depth=7 |0.379 | 0.27  | 0.56 |0.37| 2636 |
| LogReg | C=0.01 | 0.375 |  0.26 | 0.63 |0.36 | 3173 |


## Ensemble

In [None]:
# Put the three models' predictions side by side and count the positive votes.
# Assumes each model predicts 0/1 labels, so 'sum' is a vote count in 0..3.
df_pred = pd.concat([df_pred_tree, df_pred_forest, df_pred_reg], axis=1)
df_pred['sum'] = df_pred['tree'] + df_pred['forest'] + df_pred['logistic']

# Explicit vote thresholds. For vote counts 0..3 these give exactly the values
# of the earlier round((sum + k) / 3) formulas with k = +1, 0, -1.
df_pred['result1'] = (df_pred['sum'] >= 1).astype(int)  # at least one model votes 1
df_pred['result2'] = (df_pred['sum'] >= 2).astype(int)  # majority of the three models
df_pred['result3'] = (df_pred['sum'] >= 3).astype(int)  # all three models agree

display(df_pred.head())
# Column totals: the number of positive predictions per model / ensemble rule.
print(df_pred.sum())

In [None]:
# Classification reports for the ensemble columns 'result3' and 'result1',
# printed in that order with their original labels.
for column, label in (
    ('result3', 'result3 report: \n'),
    ('result1', 'result1: \n'),
):
    report = classification_report(y_test, df_pred[column])
    print(label, report)

# The 'result2' report was commented out by the author and stays disabled:
# report = classification_report(y_test, df_pred['result2'])
# print('result2: \n', report)

Cross-validation with price differences
| model | parameter | f1_cv | precision  | recall | f1-score | pred.sum() |
|---|---|---|---|---|---|---|
| Tree | depth=5 | 0.356 |0.31 |0.46 | 0.37 | 1945 |
| Forest | depth=7 |0.379 | 0.27  | 0.56 |0.37| 2636 |
| LogReg | C=0.01 | 0.375 |  0.26 | 0.63 |0.36 | 3173 |
| Ensemble prec | - | - | 0.40 | 0.41 | 0.40 | 1343 |
| Ensemble recall | - | - | 0.23 | 0.68 | 0.34 | 3873 |

Cross-validation with price differences
| model | parameter | f1_cv | precision  | recall | f1-score | pred.sum() |
|---|---|---|---|---|---|---|
| Tree | depth=5 | 0.356 |0.31 |0.46 | 0.37 | 1945 |
| Forest | depth=7 |0.379 | 0.27  | 0.56 | 0.37 | 2636 |
| LogReg | C=0.01 | 0.375 |  0.26 | 0.63 | 0.36 | 3173 |
| Ensemble | - | - | 0.40 | 0.41 | 0.40 | 1343 |