In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImbPipeline
import numpy as np
import zipfile

# Define paths
zip_path = '/workspaces/Isabell-Joane-Eric-Final-Project/src/DontGetKicked.zip'
extract_to = '/workspaces/Isabell-Joane-Eric-Final-Project/data/processed'

# Extract the zip file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to)

In [3]:
# Load the data
total_data = pd.read_csv(f'{extract_to}/training.csv')

total_data.head()

Unnamed: 0,RefId,IsBadBuy,PurchDate,Auction,VehYear,VehicleAge,Make,Model,Trim,SubModel,...,MMRCurrentRetailAveragePrice,MMRCurrentRetailCleanPrice,PRIMEUNIT,AUCGUART,BYRNO,VNZIP1,VNST,VehBCost,IsOnlineSale,WarrantyCost
0,1,0,12/7/2009,ADESA,2006,3,MAZDA,MAZDA3,i,4D SEDAN I,...,11597.0,12409.0,,,21973,33619,FL,7100.0,0,1113
1,2,0,12/7/2009,ADESA,2004,5,DODGE,1500 RAM PICKUP 2WD,ST,QUAD CAB 4.7L SLT,...,11374.0,12791.0,,,19638,33619,FL,7600.0,0,1053
2,3,0,12/7/2009,ADESA,2005,4,DODGE,STRATUS V6,SXT,4D SEDAN SXT FFV,...,7146.0,8702.0,,,19638,33619,FL,4900.0,0,1389
3,4,0,12/7/2009,ADESA,2004,5,DODGE,NEON,SXT,4D SEDAN,...,4375.0,5518.0,,,19638,33619,FL,4100.0,0,630
4,5,0,12/7/2009,ADESA,2005,4,FORD,FOCUS,ZX3,2D COUPE ZX3,...,6739.0,7911.0,,,19638,33619,FL,4000.0,0,1020


In [4]:

# Convert WheelTypeID to 'category'
total_data['WheelTypeID'] = total_data['WheelTypeID'].astype('category')

# Group zip codes by the first two digits into a new column 'ZipRegion'
total_data['ZipRegion'] = total_data['VNZIP1'].astype(str).str[:2]

# Remove specified columns
columns_to_remove = ["AUCGUART", "PRIMEUNIT", "VNST", "VNZIP1", "BYRNO", "PurchDate", "RefId", "SubModel", "Color", "WheelType"]
total_data.drop(columns=columns_to_remove, axis=1, inplace=True)



In [5]:
# Define categorical and numerical columns
categorical_columns = [
    'Auction', 'Transmission', 'WheelTypeID', 'Nationality', 'TopThreeAmericanName', 'IsOnlineSale', 'Size', 'ZipRegion'
]

numerical_columns = [
    'VehYear', 'VehicleAge', 'VehOdo', 'MMRAcquisitionAuctionAveragePrice', 'MMRAcquisitionAuctionCleanPrice',
    'MMRAcquisitionRetailAveragePrice', 'MMRAcquisitonRetailCleanPrice', 'MMRCurrentAuctionAveragePrice',
    'MMRCurrentAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'VehBCost',
    'WarrantyCost'
]

# Ensure categorical columns are of type 'str'
for column in categorical_columns:
    total_data[column] = total_data[column].astype(str)

# Ensure numerical columns are of type 'float'
for column in numerical_columns:
    total_data[column] = total_data[column].astype(float)

# Define feature sets for PCA
currentauction_features = [
    "MMRCurrentAuctionAveragePrice", "MMRCurrentAuctionCleanPrice", "MMRCurrentRetailAveragePrice", "MMRCurrentRetailCleanPrice"
]
acquisitionauction_features = [
    "MMRAcquisitionAuctionAveragePrice", "MMRAcquisitionAuctionCleanPrice", "MMRAcquisitionRetailAveragePrice", "MMRAcquisitonRetailCleanPrice"
]
remaining_features = [
    'Auction', 'VehYear', 'VehicleAge', 'Make', 'Model', 'Trim', 'Transmission', 'WheelTypeID', 'VehOdo', 'Nationality', 
    'Size', 'TopThreeAmericanName', 'VehBCost', 'IsOnlineSale', 'WarrantyCost', 'ZipRegion'
]

In [6]:

# Split the data into training and testing sets
X = total_data[remaining_features + currentauction_features + acquisitionauction_features]
y = total_data['IsBadBuy']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)



In [8]:
# Define transformers for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine all preprocessors into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

preprocessor

In [9]:

# Create a pipeline with SMOTE and KNN
pipeline_knn_smote = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(sampling_strategy=0.2, random_state=42)),
    ('knn', KNeighborsClassifier())
])

pipeline_knn_smote

In [10]:
# Define the parameter grid for RandomizedSearchCV
param_distributions = {
    'knn__n_neighbors': range(1, 30),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'minkowski']
}

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(pipeline_knn_smote, param_distributions, n_iter=20, cv=3, scoring='accuracy', n_jobs=-1, random_state=42)
random_search.fit(X_train, y_train)


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGTERM(-15)}

In [None]:
# Print the best parameters and best cross-validation accuracy
print("Best Parameters found: ", random_search.best_params_)
print("Best cross-validation accuracy: ", random_search.best_score_)

# Predict on the test set using the best model from RandomizedSearchCV
best_knn_model = random_search.best_estimator_
y_pred_knn_smote = best_knn_model.predict(X_test)

# Generate classification report
report_knn_smote = classification_report(y_test, y_pred_knn_smote)
print("Classification Report with SMOTE and KNN:")
print(report_knn_smote)
