In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from datetime import datetime
from dateutil.parser import parse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [None]:
# Read the training split first, then the test split, from the working directory.
df, df_test = (pd.read_csv(csv_name) for csv_name in ("training.csv", "test.csv"))
df_test.shape

(48707, 33)

In [None]:
# auctions are weekly so we convert date string to week number
# starting from January 1, 2009 to December 31, 2010
def calendarLabeler(df):
  """Replace the ``PurchDate`` date strings with a running ISO week number.

  Dates whose ISO year is 2010 get 53 added to their ISO week so that they
  continue the 2009 numbering (ISO year 2009 has 53 weeks). Dates in any
  other ISO year keep their plain ISO week number.

  Args:
    df: DataFrame with a ``PurchDate`` column of parseable date strings.

  Returns:
    The same DataFrame object; ``PurchDate`` is overwritten in place with
    integer week labels.
  """
  # A numeric column means the labels were already computed (e.g. the cell
  # was run twice); parsing integers as dates would give meaningless weeks.
  if pd.api.types.is_numeric_dtype(df["PurchDate"]):
    return df

  # Parse the whole column at once instead of looping over rows.
  iso = pd.to_datetime(df["PurchDate"]).dt.isocalendar()
  iso_year = iso["year"].astype("int64")
  week = iso["week"].astype("int64")

  # The boolean mask multiplies to 53 for ISO-2010 rows and to 0 otherwise.
  df["PurchDate"] = week + 53 * (iso_year == 2010)
  return df

# Label the training split, then the test split; the last line echoes the test shape.
df, df_test = calendarLabeler(df), calendarLabeler(df_test)
df_test.shape

(48707, 33)

In [None]:
#Frequency Encoding for Model Trim and SubModel
def FreqEncode(df, reference=None):
  """Replace Model, Trim and SubModel with the occurrence count of each value.

  Args:
    df: DataFrame holding the 'Model', 'Trim' and 'SubModel' columns. The
      three columns are overwritten in place.
    reference: Optional DataFrame from which the counts are taken. When
      omitted, counts come from ``df`` itself. Passing the training frame
      here while encoding another split gives both splits the same encoding.

  Returns:
    The same ``df`` object. Missing values, and values that do not occur in
    the frame the counts were taken from, are encoded as 0.
  """
  source = df if reference is None else reference
  for col in ('Model', 'Trim', 'SubModel'):
    # Counts are taken before the column is overwritten, because source may be df.
    counts = source[col].value_counts().to_dict()
    # Unmapped values come back as NaN from map(); all three columns get the same 0 fill.
    df[col] = df[col].map(counts).fillna(0)
  return df

# Frequency-encode both splits; each call derives its counts from the frame it is given.
df = FreqEncode(df)
df_test = FreqEncode(df_test)
# Only the final expression of a cell is rendered, so the discarded head() call was dropped.
df_test.shape

(48707, 33)

In [None]:
#Prime Unit Mapping: Map NULL/NO to 0 and YES to 1
def primeMap(df):
  """Recode the PRIMEUNIT column to 0/1 in place and return the same frame.

  'YES' becomes 1 and 'NO' becomes 0; anything else (including missing
  values) is not in the lookup and ends up as 0.
  """
  prime_codes = {'NO': 0, 'YES': 1}
  recoded = df['PRIMEUNIT'].map(prime_codes)
  df['PRIMEUNIT'] = recoded.fillna(0)
  return df

# Recode PRIMEUNIT on the training split, then on the test split.
df, df_test = primeMap(df), primeMap(df_test)
df_test.shape

(48707, 33)

In [None]:
#Auction Guarantee Mapping: Map NULL/RED to 0 and GREEN to 1
def aucMap(df):
  """Recode the AUCGUART column to 0/1 in place and return the same frame.

  'GREEN' becomes 1 and 'RED' becomes 0; anything else (including missing
  values) is not in the lookup and ends up as 0.
  """
  guarantee_codes = {'RED': 0, 'GREEN': 1}
  recoded = df['AUCGUART'].map(guarantee_codes)
  df['AUCGUART'] = recoded.fillna(0)
  return df

# Recode AUCGUART on both splits.
df = aucMap(df)
df_test = aucMap(df_test)
# Only the final expression of a cell is rendered, so the discarded shape check was dropped.
df.columns

Index(['RefId', 'IsBadBuy', 'PurchDate', 'Auction', 'VehYear', 'VehicleAge',
       'Make', 'Model', 'Trim', 'SubModel', 'Color', 'Transmission',
       'WheelTypeID', 'WheelType', 'VehOdo', 'Nationality', 'Size',
       'TopThreeAmericanName', 'MMRAcquisitionAuctionAveragePrice',
       'MMRAcquisitionAuctionCleanPrice', 'MMRAcquisitionRetailAveragePrice',
       'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice',
       'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice',
       'MMRCurrentRetailCleanPrice', 'PRIMEUNIT', 'AUCGUART', 'BYRNO',
       'VNZIP1', 'VNST', 'VehBCost', 'IsOnlineSale', 'WarrantyCost'],
      dtype='object')

In [None]:
# Columns routed through the one-hot encoder.
categorical_features = ["Auction", "Make", "Color","Transmission", "WheelType",
                          "Nationality", "Size", "TopThreeAmericanName", "PRIMEUNIT",
                          "AUCGUART", "VNST"]
# Every remaining column is scaled, except RefId, WheelTypeID and the IsBadBuy target.
numeric_features = [
    col for col in df.columns
    if col not in categorical_features
    and col not in ("RefId", "WheelTypeID", "IsBadBuy")
]

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

Y_train = df["IsBadBuy"]
# The transformer is fitted on the training frame only and reused for the test frame below.
X_train = preprocessor.fit_transform(df.drop(columns=['RefId','WheelTypeID']))

# Output column order is the 'num' block followed by the 'cat' block,
# so the names are assembled in that same order.
onehot_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(input_features=categorical_features)
all_feature_names = numeric_features + list(onehot_columns)

# Wrap the transformed sparse matrices in DataFrames so columns can be addressed
# by name; pandas also provides functions for exploratory data analysis.
X_train = pd.DataFrame.sparse.from_spmatrix(X_train, columns=all_feature_names)
X_test = pd.DataFrame.sparse.from_spmatrix(preprocessor.transform(df_test), columns=all_feature_names)

display(X_train, X_test)

Unnamed: 0,PurchDate,VehYear,VehicleAge,Model,Trim,SubModel,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
0,-0.152884,0.379467,-0.687212,-0.659657,-1.065980,-0.609868,1.203528,0.822953,0.901887,0.994519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.152884,-0.775775,0.480877,0.918951,-0.998734,-0.613594,1.515419,0.294516,0.370753,0.760382,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.152884,-0.198154,-0.103168,-0.060875,-0.431899,-0.496206,0.158244,-1.188846,-0.960023,-0.492365,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.152884,-0.775775,0.480877,-0.354120,-0.431899,1.899337,-0.403531,-1.720532,-1.725870,-1.216322,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.152884,-0.198154,-0.103168,0.581807,-1.111760,-0.645610,-0.146308,-0.900053,-0.852033,-0.245238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72978,-0.186123,-2.508637,2.233010,-0.693020,-1.102452,-0.649844,-1.801655,-1.678696,-1.609065,-1.850617,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72979,-0.186123,0.957087,-1.271257,1.136690,0.774147,0.117671,0.017766,0.117422,-0.017865,-0.337752,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72980,-0.186123,-0.198154,-0.103168,-0.533228,-0.987147,-0.609698,1.166076,0.981362,0.949638,0.390324,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72981,-0.186123,0.379467,-0.687212,2.479998,0.774147,0.117671,0.552446,0.118235,0.084616,-0.336801,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,PurchDate,VehYear,VehicleAge,Model,Trim,SubModel,VehOdo,MMRAcquisitionAuctionAveragePrice,MMRAcquisitionAuctionCleanPrice,MMRAcquisitionRetailAveragePrice,...,VNST_OK,VNST_OR,VNST_PA,VNST_SC,VNST_TN,VNST_TX,VNST_UT,VNST_VA,VNST_WA,VNST_WV
0,-0.186123,-0.198154,-0.103168,0.056775,0.670810,1.047803,0.951861,-0.445540,-0.362772,-0.811730,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.186123,-0.198154,-0.103168,-0.241738,0.104734,-0.172497,-0.660342,-0.660814,-0.620258,-0.993274,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.186123,0.379467,-0.687212,-0.371679,-1.125057,-0.670171,-0.152070,1.671459,2.081697,0.971708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.186123,-1.931016,1.648965,-0.826473,-1.144622,-0.669324,1.124166,-1.450424,-1.407410,-1.658301,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.186123,0.957087,-1.271257,-0.775550,-1.110810,-0.668986,0.132521,-0.453258,-0.378566,-0.818384,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48702,1.475814,-0.198154,0.480877,-0.380459,-0.979359,-0.588694,1.176022,-0.313126,-0.197481,0.155236,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48703,1.475814,0.957087,-0.687212,-0.773794,-1.125437,-0.639850,0.710758,0.292485,0.227133,0.792699,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48704,1.475814,0.379467,-0.103168,-0.078434,-0.865764,-0.560405,0.751159,-0.595826,-0.631277,-0.166347,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48705,1.475814,-0.198154,0.480877,-0.733407,-1.129616,-0.618168,0.292205,-0.071450,0.291046,0.202761,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:

def MMRmean(X):
  """Fill missing values in the eight MMR price columns with the column mean.

  Each mean is computed from the frame passed in. The columns are
  overwritten in place and the same frame is returned.
  """
  mmr_columns = (
      "MMRAcquisitionAuctionAveragePrice",
      "MMRAcquisitionAuctionCleanPrice",
      "MMRAcquisitionRetailAveragePrice",
      "MMRAcquisitonRetailCleanPrice",  # spelling matches the dataset's column name
      "MMRCurrentAuctionAveragePrice",
      "MMRCurrentAuctionCleanPrice",
      "MMRCurrentRetailAveragePrice",
      "MMRCurrentRetailCleanPrice",
  )
  for name in mmr_columns:
    prices = X[name]
    X[name] = prices.fillna(prices.mean())
  return X

X_train = MMRmean(X_train)
X_test = MMRmean(X_test)

# Print the remaining NaN count per numeric feature: training split first,
# then a separator line, then the test split.
for name in numeric_features:
  print(name, X_train[name].isna().sum())

print("--------------------------")

for name in numeric_features:
  print(name, X_test[name].isna().sum())

PurchDate 0
VehYear 0
VehicleAge 0
Model 0
Trim 0
SubModel 0
VehOdo 0
MMRAcquisitionAuctionAveragePrice 0
MMRAcquisitionAuctionCleanPrice 0
MMRAcquisitionRetailAveragePrice 0
MMRAcquisitonRetailCleanPrice 0
MMRCurrentAuctionAveragePrice 0
MMRCurrentAuctionCleanPrice 0
MMRCurrentRetailAveragePrice 0
MMRCurrentRetailCleanPrice 0
BYRNO 0
VNZIP1 0
VehBCost 0
IsOnlineSale 0
WarrantyCost 0
--------------------------
PurchDate 0
VehYear 0
VehicleAge 0
Model 0
Trim 0
SubModel 0
VehOdo 0
MMRAcquisitionAuctionAveragePrice 0
MMRAcquisitionAuctionCleanPrice 0
MMRAcquisitionRetailAveragePrice 0
MMRAcquisitonRetailCleanPrice 0
MMRCurrentAuctionAveragePrice 0
MMRCurrentAuctionCleanPrice 0
MMRCurrentRetailAveragePrice 0
MMRCurrentRetailCleanPrice 0
BYRNO 0
VNZIP1 0
VehBCost 0
IsOnlineSale 0
WarrantyCost 0


In [None]:
# Configure the gradient-boosting model; the fixed random_state makes repeated fits reproducible.
clf = GradientBoostingClassifier(
    n_estimators=250,
    learning_rate=.6,
    max_depth=5,
    random_state=0,
)
# fit() returns the estimator itself, so clf keeps pointing at the trained model.
clf = clf.fit(X_train, Y_train)


In [None]:
# predict_proba yields one column per class; keep the second column (index 1) for every test row.
# NOTE(review): presumably index 1 is the IsBadBuy == 1 class — confirm against clf.classes_.
pred = clf.predict_proba(X_test)[:,1]
pred.shape

(48707,)

In [None]:
# Pair every test RefId with its predicted score and write the result without the index column.
df_output = pd.DataFrame({'RefId': df_test['RefId'], 'IsBadBuy': pred})
df_output.to_csv("GBCout6.csv", index=False)
df_output.head()

Unnamed: 0,RefId,IsBadBuy
0,73015,0.014492
1,73016,0.035206
2,73017,0.017786
3,73018,0.084866
4,73019,0.996785


Purchase Date - condense to months and years, then one hot encode<br>
Auction Date - one hot encode<br>
Vehicle year - leave as is<br>
Vehicle age - leave as is<br>
Make - one hot encode<br>
Model - <br>
Trim - <br>
SubModel - <br>
Color - one hot encode<br>
transmission - one hot encode<br>
wheelTypeId - leave as is<br>
wheelType - DROP<br>
odometer - leave as is<br>
nationality - one hot encode<br>
size - one hot encode<br>
top3 - one hot encode<br>
MMR price columns - leave as is<br>
prime unit - map them to 0 and 1<br>
aucguart - <br>