In [1]:
import pandas as pd
from antakia.utils.examples import fetch_dataset
# Fetch the 'car_prices' example dataset with antakia's fetch_dataset helper and preview its first rows.
vh_sales_df = fetch_dataset('car_prices')
vh_sales_df.head()

Unnamed: 0,year,make,model,trim,body,transmission,vin,state,condition,odometer,color,interior,seller,mmr,sellingprice,saledate
0,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg566472,ca,5.0,16639.0,white,black,kia motors america inc,20500.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
1,2015,Kia,Sorento,LX,SUV,automatic,5xyktca69fg561319,ca,5.0,9393.0,white,beige,kia motors america inc,20800.0,21500.0,Tue Dec 16 2014 12:30:00 GMT-0800 (PST)
2,2014,BMW,3 Series,328i SULEV,Sedan,automatic,wba3c1c51ek116351,ca,45.0,1331.0,gray,black,financial services remarketing (lease),31900.0,30000.0,Thu Jan 15 2015 04:30:00 GMT-0800 (PST)
3,2015,Volvo,S60,T5,Sedan,automatic,yv1612tb4f1310987,ca,41.0,14282.0,white,black,volvo na rep/world omni,27500.0,27750.0,Thu Jan 29 2015 04:30:00 GMT-0800 (PST)
4,2014,BMW,6 Series Gran Coupe,650i,Sedan,automatic,wba6b2c57ed129731,ca,43.0,2641.0,gray,black,financial services remarketing (lease),66000.0,67000.0,Thu Dec 18 2014 12:30:00 GMT-0800 (PST)


In [2]:
!pip install seaborn



In [3]:
import datetime
import os
import sys
import time

import numpy as np
import pandas as pd
from dateutil import parser
from tqdm import tqdm

from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, f_regression
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score, mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor, ElasticNet
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.svm import LinearSVR, SVR, NuSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

In [4]:
def process_missingvalue(df, method='simple_input', mse_flag=False):
    """
    Handle the missing values of a dataframe. The input frame is never modified.

    :param df: DataFrame. Input a dataframe
    :param method: str. Choose a method to process missing values. simple_input, rm_null, KNNImputer, RFImputer. simple_input is default value.
        simple_input: object columns are filled with their mode; other columns are filled with
            the median when their standard deviation is >= 20, otherwise with the mean
        rm_null: Remove rows containing missing values
        KNNImputer: factorize object columns, then impute missing values with KNNImputer
        RFImputer: factorize object columns, then predict each column's missing values with a
            RandomForestRegressor trained on the remaining columns
    :param mse_flag: Boolean. When True, print the cross-validated MSE obtained on the processed
        data (computed on a copy, so the returned frame is unaffected). False is Default value.

    :return ret_df: return a new dataframe

    Notes:
        * For KNNImputer / RFImputer, object columns are encoded with ``factorize`` first, which
          maps NaN to the code -1; those entries are therefore kept as code -1, not imputed.
        * For KNNImputer / RFImputer, when a 'saledate' column exists its encoded version is
          replaced by the original values (missing ones filled with the mode) and the column is
          moved to the last position.
    """

    assert method in ('simple_input', 'rm_null', 'KNNImputer', 'RFImputer')
    temp_df = df.copy()

    if method == 'simple_input':
        null_cols = [col for col in temp_df.columns if temp_df[col].isnull().any()]
        for col in null_cols:
            if temp_df[col].dtypes == 'object':
                modes = temp_df[col].mode()
                if modes.empty:
                    # Column holds nothing but missing values: there is no mode to fill with.
                    continue
                fill_value = modes.values[0]
            elif temp_df[col].std() >= 20:
                fill_value = temp_df[col].quantile(q=0.5)
            else:
                fill_value = temp_df[col].mean()
            temp_df[col] = temp_df[col].fillna(fill_value)
        ret_df = temp_df
    elif method == 'rm_null':
        ret_df = temp_df.dropna(axis=0, how='any')
    elif method == 'RFImputer':
        # Visit columns from the fewest to the most missing values (counted before encoding).
        sortindex = temp_df.isnull().sum().sort_values().index
        temp_df = convert_2_num(temp_df)
        for col in tqdm(sortindex):
            missing = temp_df[col].isnull()
            if not missing.any():
                continue

            # Every other column is a predictor; their own gaps are temporarily filled with 0.
            x_df = temp_df.loc[:, temp_df.columns != col].fillna(0)

            # Seeded so that the imputed values are reproducible between runs.
            rfr = RandomForestRegressor(n_estimators=50, random_state=0)
            rfr.fit(x_df.loc[~missing], temp_df.loc[~missing, col])
            temp_df.loc[missing, col] = rfr.predict(x_df.loc[missing])
        ret_df = temp_df
    else:
        temp_df = convert_2_num(temp_df)
        knn_df = KNNImputer().fit_transform(temp_df)
        # Keep the original index: the 'saledate' restoration below aligns on index labels.
        ret_df = pd.DataFrame(knn_df, columns=temp_df.columns, index=temp_df.index)

    if method in ('RFImputer', 'KNNImputer') and 'saledate' in df.columns:
        # Swap the encoded sale date for the original values (mode-filled), as the last column.
        ret_df = ret_df.drop(['saledate'], axis=1)
        ret_df['saledate'] = df['saledate'].fillna(df['saledate'].mode()[0])

    if mse_flag:
        # Score a copy: the scoring helper integer-encodes object columns of what it receives.
        process_missingvalue_mse(ret_df.copy(), method)

    return pd.DataFrame(ret_df)



def convert_2_num(df):
    """
    Integer-encode every object column of a dataframe.

    Each object column is replaced by the codes returned by ``Series.factorize`` (codes follow
    the order of first appearance; missing values get the code -1). Other columns are kept
    as they are. The work is done on a copy so the caller's dataframe is left untouched.

    :param df: DataFrame. Input a dataframe

    :return: DataFrame. A new dataframe with the object columns encoded
    """
    encoded_df = df.copy()
    for col in encoded_df.columns:
        if encoded_df[col].dtypes != 'object':
            continue
        encoded_df[col] = encoded_df[col].factorize()[0]
    return encoded_df


def process_missingvalue_mse(ret_df, method):
    """
    Print the cross-validated MSE of a random forest predicting 'sellingprice'.

    The frame is first passed through convert_2_num, then a 100-tree RandomForestRegressor
    (random_state=0) is scored with 3-fold cross validation using every column except
    'sellingprice' as features.

    :param ret_df: DataFrame. Processed dataframe containing a 'sellingprice' column
    :param method: str. Name of the processing method, only used in the printed message
    """
    ret_df = convert_2_num(ret_df)

    target_col = 'sellingprice'
    features = ret_df.loc[:, ret_df.columns != target_col]
    target = ret_df[target_col]

    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    fold_scores = cross_val_score(forest, features, target, scoring='neg_mean_squared_error', cv=3)
    mse_score = fold_scores.mean()

    # The scorer returns the negated MSE, hence the sign flip for display.
    print('after processing of %s: ' % method, mse_score * -1)

In [5]:
# Drop rows with missing values, keep the first row of each VIN, then discard the VIN column.
clean_df = (
    process_missingvalue(vh_sales_df.copy(), 'rm_null')
    .drop_duplicates(subset=['vin'], keep='first')
    .drop(['vin'], axis=1)
)
clean_df.shape

(465768, 15)

In [6]:
def process_outliers(df, k=3, method='simple_input', mse_flag=False):

    """
    Replace outliers with missing values, then handle them with process_missingvalue.

    For every non-object column, values outside [Q1 - k * IQR, Q3 + k * IQR] are replaced by
    a missing value (Q1 / Q3 are the 25% / 75% quantiles, IQR = Q3 - Q1). The work is done on
    a copy, so the caller's dataframe is left untouched.

    :param df: DataFrame. Input a dataframe.
    :param k: int. Multiplier applied to the interquartile range. Default = 3.
    :param method: str. Choose a method to process missing values. simple_input, rm_null, KNNImputer, RFImputer. simple_input is default value.
        simple_input: object - mode; int or float - mean or median
        rm_null: Remove rows containing missing values
        KNNImputer: input missing values by knn
        RFImputer: input missing values by randomforestregressor
    :param mse_flag: Boolean. Forwarded to process_missingvalue. False is Default value.

    :return ret_df: return a new dataframe with a fresh 0..n-1 index
    """

    work_df = df.copy()

    for col in tqdm(work_df.columns):
        if work_df[col].dtypes == 'object':
            continue
        top_75 = work_df[col].quantile(q=0.75)
        bottom_25 = work_df[col].quantile(q=0.25)
        interval = top_75 - bottom_25
        top_line = top_75 + k * interval
        bottom_line = bottom_25 - k * interval
        in_range = (work_df[col] <= top_line) & (work_df[col] >= bottom_line)
        work_df[col] = work_df[col].where(in_range, other=None)

    work_df = process_missingvalue(work_df, method, mse_flag)
    ret_df = work_df.reset_index(drop=True)
    return ret_df


def process_object(df):
    """
    Cast 'year' to int32 and tokenize 'saledate', keeping only well-formed dates.

    Each 'saledate' string is split on single spaces into a list of tokens; rows whose date
    does not split into exactly 7 tokens (or is not a string at all) are dropped. The check
    is done in one pass and does not depend on the index labels of the input. The caller's
    dataframe is left untouched.

    :param df: DataFrame. Input a dataframe with 'year' and 'saledate' columns

    :return ret_df: return a new dataframe with a fresh 0..n-1 index
    """

    work_df = df.copy()
    work_df['year'] = work_df['year'].astype('int32')

    tokens = [raw.split(' ') if isinstance(raw, str) else raw for raw in work_df['saledate']]
    well_formed = pd.Series(
        [isinstance(parts, list) and len(parts) == 7 for parts in tokens],
        index=work_df.index,
        dtype=bool,
    )
    work_df['saledate'] = pd.Series(tokens, index=work_df.index, dtype=object)

    ret_df = work_df.loc[well_formed].reset_index(drop=True)
    return ret_df
# Blank out outliers (default k=3) and drop the affected rows, then tokenize the sale dates.
clean_df = process_outliers(clean_df, method='rm_null')
clean_df = process_object(clean_df)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 221.25it/s]


In [7]:
def process_date(df):
    """
    Add a parsed 'new_date' column and return the rows sorted by it in ascending order.

    Each 'saledate' entry (a list of tokens) is joined back with single spaces and parsed
    with dateutil. The input dataframe is not modified.

    NOTE(review): the dates carry a 'GMT-0800'-style suffix; dateutil's handling of the sign
    of such offsets should be confirmed before relying on exact instants - TODO confirm.

    :param df: DataFrame. Input a dataframe whose 'saledate' column holds token lists

    :return ret_df: return a new, date-sorted dataframe
    """
    def _tokens_to_datetime(tokens):
        return parser.parse(' '.join(tokens))

    dated_df = df.copy()
    dated_df['new_date'] = dated_df['saledate'].apply(_tokens_to_datetime)
    ordered_df = dated_df.sort_values(by=['new_date'], ascending=True)
    return ordered_df.copy()


is_nn_flag = True # True: add the parsed 'new_date' column and sort the rows by it in ascending order
if is_nn_flag:
    clean_df = process_date(clean_df)
# Work on a copy so clean_df keeps the tokenized 'saledate' column.
preprocess_df = clean_df.copy()
 
# Pick individual tokens out of each 'saledate' list: token 0 (mapped from weekday
# abbreviations in the next cell), token 1 (mapped from month abbreviations) and
# token 3 (cast to an integer year).
preprocess_df['saleweeks'] = preprocess_df['saledate'].apply(lambda x: x[0])
preprocess_df['salemonth'] = preprocess_df['saledate'].apply(lambda x: x[1])
preprocess_df['saleyear'] = preprocess_df['saledate'].apply(lambda x: x[3])

In [8]:
# Encode the weekday / month abbreviations as integers (tokens absent from the mapping become NaN).
preprocess_df['saleweeks'] = preprocess_df['saleweeks'].map({'Mon': 1, 'Tue': 2, 'Wed': 3, 'Thu': 4, 'Fri': 5, 'Sat': 6, 'Sun': 7})
preprocess_df['salemonth'] = preprocess_df['salemonth'].map({'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12})
preprocess_df['saleyear'] = preprocess_df['saleyear'].astype('int32')
# Difference between the sale year and the 'year' column.
preprocess_df['gapyear'] = preprocess_df['saleyear'] - preprocess_df['year']
preprocess_df = preprocess_df.drop(['saledate'], axis=1)

# Integer-encode every remaining object column (codes follow the order of first appearance).
for i in preprocess_df.columns:
    if preprocess_df[i].dtypes != 'object':
        continue
    preprocess_df[i] = preprocess_df[i].factorize()[0]

# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
preprocess_df.info()
preprocess_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 462267 entries, 34863 to 160408
Data columns (total 19 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   year          462267 non-null  int32  
 1   make          462267 non-null  int64  
 2   model         462267 non-null  int64  
 3   trim          462267 non-null  int64  
 4   body          462267 non-null  int64  
 5   transmission  462267 non-null  int64  
 6   state         462267 non-null  int64  
 7   condition     462267 non-null  float64
 8   odometer      462267 non-null  float64
 9   color         462267 non-null  int64  
 10  interior      462267 non-null  int64  
 11  seller        462267 non-null  int64  
 12  mmr           462267 non-null  float64
 13  sellingprice  462267 non-null  float64
 14  new_date      462267 non-null  int64  
 15  saleweeks     462267 non-null  int64  
 16  salemonth     462267 non-null  int64  
 17  saleyear      462267 non-null  int32  
 18  gapye

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,seller,mmr,sellingprice,new_date,saleweeks,salemonth,saleyear,gapyear
34863,2014,0,0,0,0,0,0,28.0,34169.0,0,0,0,15800.0,15000.0,0,3,1,2014,0
17848,2014,1,1,1,1,0,0,3.0,33959.0,1,0,1,16450.0,14800.0,0,3,1,2014,0
46356,2014,0,2,2,1,0,0,27.0,27495.0,0,1,0,14800.0,13800.0,0,3,1,2014,0
49083,2013,0,3,3,1,0,0,41.0,55209.0,2,1,2,9900.0,9500.0,0,3,1,2014,1
33832,2013,2,4,4,1,0,0,34.0,33069.0,1,0,3,16000.0,16000.0,0,3,1,2014,1


In [9]:
def process_datascaler(df, method='standard'):
    """
    Rescale every column of a dataframe with MinMaxScaler or StandardScaler.

    The scaler is fit on, and applied to, a copy of the whole input frame.

    :param df: DataFrame. Input a dataframe.
    :param method: str. 'minmax' or 'standard'. Default value is 'standard'.

    :return ret_df: DataFrame. A new dataframe with the same columns and index as the input
    """

    assert method in ('minmax', 'standard')

    scaler_cls = MinMaxScaler if method == 'minmax' else StandardScaler
    scaled_values = scaler_cls().fit_transform(df.copy())

    return pd.DataFrame(scaled_values, columns=df.columns, index=df.index)

In [10]:
# Standard-scale every column except the target; the target itself is left unscaled.
# NOTE(review): the scaler is fit on all rows before the train/test split done in a later cell,
# so test rows contribute to the scaling statistics - confirm this is acceptable.
X_df = process_datascaler(df=preprocess_df.loc[:, preprocess_df.columns != 'sellingprice'])
Y_label = preprocess_df['sellingprice']
X_df.head()

Unnamed: 0,year,make,model,trim,body,transmission,state,condition,odometer,color,interior,seller,mmr,new_date,saleweeks,salemonth,saleyear,gapyear
34863,0.995076,-1.024166,-0.99532,-0.614028,-0.59587,-0.188286,-1.480429,-0.207974,-0.645215,-0.956185,-0.799638,-0.601611,0.264745,-1.639907,-0.047044,-0.860665,-3.418588,-1.237917
17848,0.995076,-0.915338,-0.986775,-0.608365,-0.532384,-0.188286,-1.480429,-2.092125,-0.6494,-0.683152,-0.799638,-0.601129,0.340804,-1.639907,-0.047044,-0.860665,-3.418588,-1.237917
46356,0.995076,-1.024166,-0.97823,-0.602702,-0.532384,-0.188286,-1.480429,-0.28334,-0.778226,-0.956185,-0.13072,-0.601611,0.147732,-1.639907,-0.047044,-0.860665,-3.418588,-1.237917
49083,0.732347,-1.024166,-0.969685,-0.597039,-0.532384,-0.188286,-1.480429,0.771784,-0.225892,-0.410118,-0.13072,-0.600647,-0.425632,-1.639907,-0.047044,-0.860665,-3.418588,-0.975014
33832,0.732347,-0.80651,-0.961141,-0.591376,-0.532384,-0.188286,-1.480429,0.244222,-0.667138,-0.683152,-0.799638,-0.600165,0.288148,-1.639907,-0.047044,-0.860665,-3.418588,-0.975014


In [11]:
def process_fea_selection(x, y, method='skb', k=10):
    """
    Choose the important features.

    :param x: DataFrame. Input a dataframe.
    :param y: Series. Input a Series (continuous regression target).
    :param method: str. Input a method to select feature. Default = 'skb'.
        skb: SelectKBest scored with f_regression (univariate F-test for regression targets)
        sfm: SelectFromModel around an XGBRegressor (``k`` is not used by this method)
        rfe: RFE around an XGBRegressor
    :param k: int or float. The number of features need to be retained. Default = 10.
        An int is an absolute number of features (k=1 keeps one feature); a float in (0, 1]
        is a fraction of the columns of ``x`` (at least one feature is always kept).

    :return x_df: DataFrame. Return a new dataframe restricted to the selected columns.
    """

    assert method in ('skb', 'sfm', 'rfe')

    # Only floats are fractions; an integer 1 means "one feature", not "100% of the features".
    if isinstance(k, float) and 0 < k <= 1:
        k = max(1, int(len(x.columns) * k))

    # Build only the requested selector (the model-based ones instantiate an XGBRegressor).
    if method == 'skb':
        # SelectKBest defaults to f_classif, which targets classification labels.
        selector = SelectKBest(score_func=f_regression, k=k)
    elif method == 'sfm':
        selector = SelectFromModel(estimator=XGBRegressor())
    else:
        selector = RFE(estimator=XGBRegressor(), n_features_to_select=k)
    selector.fit(x, y)

    return x[selector.get_feature_names_out()]
# Keep 10 features chosen by the default 'skb' method (X_df is overwritten with the reduced frame).
X_df = process_fea_selection(X_df, Y_label, k=10)

In [12]:
# Hold out 20% of the rows for testing; the split is seeded for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X_df, Y_label, test_size=0.2, random_state=0)

split_report = [
    ('X_train:', X_train.shape),
    ('X_test:', X_test.shape),
    ('Y_train:', len(Y_train)),
    ('Y_test:', len(Y_test)),
]
for caption, size in split_report:
    print(caption, size)

X_train: (369813, 10)
X_test: (92454, 10)
Y_train: 369813
Y_test: 92454


In [13]:
def eva_model(Y_test, Y_pred):
    """
    Print the R2 score and the mean absolute percentage error of a set of predictions.

    :param Y_test: true target values
    :param Y_pred: predicted target values
    """
    report = (
        ('R2: %.2f', r2_score),
        ('MAPE: %.2f', mean_absolute_percentage_error),
    )
    for template, metric in report:
        print(template % metric(Y_test, Y_pred))


def gridsearch_func(estimator, param_grid, scoring='r2', cv=3, X=None, y=None):
    """
    Run a cross-validated grid search and report the best configuration.

    :param estimator: estimator to tune
    :param param_grid: dict. Grid of hyper-parameter values to explore
    :param scoring: str. Scoring passed to GridSearchCV. Default = 'r2'.
    :param cv: int. Number of cross-validation folds. Default = 3.
    :param X: training features. Defaults to the notebook-level X_train.
    :param y: training target. Defaults to the notebook-level Y_train.

    :return: dict. The best hyper-parameters found (GridSearchCV.best_params_)
    """
    if X is None:
        X = X_train
    if y is None:
        y = Y_train

    # Use the grid that was passed in, not whatever global happens to be named `params`.
    gscv = GridSearchCV(estimator=estimator, param_grid=param_grid, scoring=scoring, cv=cv)
    gscv.fit(X, y)
    print('best params: ', gscv.best_params_)
    print('best score: ', gscv.best_score_)

    return gscv.best_params_

In [14]:
# Toggle: True re-runs the grid search, False uses the hard-coded hyper-parameters below.
gridsearch_knn_flag = False

if gridsearch_knn_flag:
    params = {'n_neighbors': range(3, 9), 'weights': ['uniform', 'distance']}
    gs = gridsearch_func(estimator=KNeighborsRegressor(), param_grid=params)
    knr = KNeighborsRegressor(n_neighbors=gs['n_neighbors'], weights=gs['weights'])
else:
    knr = KNeighborsRegressor(n_neighbors=8, weights='distance')

# Training / evaluation of this model is currently disabled.
#knr.fit(X_train, Y_train)
#Y_pred = knr.predict(X_test)
#eva_model(Y_test, Y_pred)

In [15]:
# Baseline linear models with default settings, instantiated directly rather than through eval().
# SGDRegressor is seeded so that its scores are reproducible between runs.
linear_models = [LinearRegression(), Lasso(), Ridge(), SGDRegressor(random_state=0), ElasticNet()]

for model in linear_models:
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    print(type(model).__name__, ':')
    eva_model(Y_test, Y_pred)

LinearRegression :
R2: 0.97
MAPE: 0.15
Lasso :
R2: 0.97
MAPE: 0.15
Ridge :
R2: 0.97
MAPE: 0.15
SGDRegressor :
R2: 0.97
MAPE: 0.15
ElasticNet :
R2: 0.86
MAPE: 0.36


In [16]:
# Toggle: True re-runs the grid search, False uses the hard-coded hyper-parameters below.
gridsearch_dtr_flag = False

# random_state=0 matches the seed used elsewhere in the notebook and makes the tree reproducible.
if gridsearch_dtr_flag:
    params = {'max_depth': [None, 5, 10, 15, 20], 'min_samples_split': range(2, 7), 'min_samples_leaf': range(1, 6)}
    gs = gridsearch_func(estimator=DecisionTreeRegressor(random_state=0), param_grid=params)
    dtr = DecisionTreeRegressor(max_depth=gs['max_depth'], min_samples_split=gs['min_samples_split'], min_samples_leaf=gs['min_samples_leaf'], random_state=0)
else:
    dtr = DecisionTreeRegressor(max_depth=10, min_samples_leaf=4, min_samples_split=4, random_state=0)

# Training / evaluation of this model is currently disabled.
#dtr.fit(X_train, Y_train)
#Y_pred = dtr.predict(X_test)
#eva_model(Y_test, Y_pred)

In [17]:
# best params:  {'max_depth': 15, 'min_samples_leaf': 4, 'min_samples_split': 4}
# Toggle: True re-runs the grid search, False uses the hard-coded hyper-parameters below.
gridsearch_etr_flag = False

# Extra trees draw random split thresholds; random_state=0 makes the model reproducible.
if gridsearch_etr_flag:
    params = {'max_depth': [None, 5, 10, 15, 20], 'min_samples_split': range(2, 7), 'min_samples_leaf': range(1, 6)}
    gs = gridsearch_func(estimator=ExtraTreeRegressor(random_state=0), param_grid=params)
    etr = ExtraTreeRegressor(max_depth=gs['max_depth'], min_samples_split=gs['min_samples_split'], min_samples_leaf=gs['min_samples_leaf'], random_state=0)
else:
    etr = ExtraTreeRegressor(max_depth=15, min_samples_leaf=4, min_samples_split=4, random_state=0)

# Training / evaluation of this model is currently disabled.
#etr.fit(X_train, Y_train)
#Y_pred = etr.predict(X_test)
#eva_model(Y_test, Y_pred)

In [18]:
# Toggle: True re-runs the grid search, False uses the hard-coded hyper-parameters below.
gridsearch_svr_flag = False

# random_state=0 fixes the solver's pseudo-random number generation so runs are reproducible.
if gridsearch_svr_flag:
    params = {'C': [0.2, 0.4, 0.8, 1]}
    gs = gridsearch_func(estimator=LinearSVR(random_state=0), param_grid=params)
    svr = LinearSVR(C=gs['C'], random_state=0)
else:
    svr = LinearSVR(C=0.8, random_state=0)

# Training / evaluation of this model is currently disabled.
#svr.fit(X_train, Y_train)
#Y_pred = svr.predict(X_test)
#eva_model(Y_test, Y_pred)

In [19]:
# Toggle: True re-runs the grid search, False uses the hard-coded hyper-parameters below.
gridsearch_rfr_flag = False

# Random forests bootstrap rows and sample features; random_state=0 makes them reproducible.
if gridsearch_rfr_flag:
    params = {'n_estimators': [100, 200, 400]}
    gs = gridsearch_func(estimator=RandomForestRegressor(random_state=0), param_grid=params)
    rfr = RandomForestRegressor(n_estimators=gs['n_estimators'], random_state=0)
else:
    rfr = RandomForestRegressor(n_estimators=100, random_state=0)

# Training / evaluation of this model is currently disabled.
#rfr.fit(X_train, Y_train)
#Y_pred = rfr.predict(X_test)
#eva_model(Y_test, Y_pred)

In [20]:
# Toggle: True re-runs the grid search, False uses a default GradientBoostingRegressor.
gridsearch_gbdt_flag = False

# random_state=0 matches the seed used elsewhere in the notebook and makes the model reproducible.
if gridsearch_gbdt_flag:
    params = {'n_estimators': [100, 200, 400], 'learning_rate': [0.05, 0.1], 'max_depth': [8, 9, 10]}
    gs = gridsearch_func(estimator=GradientBoostingRegressor(random_state=0), param_grid=params)
    gbr = GradientBoostingRegressor(n_estimators=gs['n_estimators'], learning_rate=gs['learning_rate'], max_depth=gs['max_depth'], random_state=0)
else:
    gbr = GradientBoostingRegressor(random_state=0)

# Training / evaluation of this model is currently disabled.
#gbr.fit(X_train, Y_train)
#Y_pred = gbr.predict(X_test)
#eva_model(Y_test, Y_pred)

In [21]:
# Toggle: True re-runs the grid search, False uses the hard-coded hyper-parameters below.
gridsearch_xgbt_flag = False

if gridsearch_xgbt_flag:
    params = {
        'learning_rate': [0.05, 0.1], 
        'gamma': [0.7, 0.9, 1],
        'max_depth': [7, 10, 15]
    }
    gs = gridsearch_func(estimator=XGBRegressor(), param_grid=params)
    # NOTE(review): this branch does not set n_estimators, while the branch below uses 1000 - confirm intent.
    xgbr = XGBRegressor(gamma=gs['gamma'], learning_rate=gs['learning_rate'], max_depth=gs['max_depth'])
else:
    xgbr = XGBRegressor(gamma=0.7, learning_rate=0.1, max_depth=7, n_estimators=1000)

# This is the model that is trained, evaluated on the held-out split and later passed to AntakIA.
xgbr.fit(X_train, Y_train)
Y_pred = xgbr.predict(X_test)
eva_model(Y_test, Y_pred)

R2: 0.98
MAPE: 0.12


In [22]:
# Display the (rows, columns) shape of the training features.
X_train.shape

(369813, 10)

In [23]:
from antakia.antakia import AntakIA
# Create the AntakIA object from the training data, the fitted XGBoost model and the held-out split.
atk = AntakIA(X_train, Y_train, xgbr, X_test=X_test, y_test=Y_test)

In [None]:
# Start the AntakIA GUI in the notebook (the recorded output below shows widget representations).
atk.start_gui()

Layout(children=[Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\t2\x00\x00\n\xd5\x08\x06\x00\x00\x0…

Col(children=[AppBar(children=[Sheet(children=[Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x04\x…

using tree shap
