In [None]:
import numpy as np # linear algebra
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the CSV paths used later in
# the notebook live underneath this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Loading Dataset

In [None]:

# Read the training CSV from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/Data Science Nigeria Hackathon/archive/train.csv'
train = pd.read_csv(train_csv_path)

In [None]:

# Read the test CSV from the mounted Drive folder.
test_csv_path = '/content/drive/MyDrive/Data Science Nigeria Hackathon/archive/test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,MINI,Cooper S Base,2007,213000,Gasoline,172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel,A/T,Yellow,Gray,None reported,Yes,4200
1,1,Lincoln,LS V8,2002,143250,Gasoline,252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel,A/T,Silver,Beige,At least 1 accident or damage reported,Yes,4999
2,2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab...,A/T,Blue,Gray,None reported,Yes,13900
3,3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,45000
4,4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Black,Beige,None reported,Yes,97500


In [None]:
# Preview the first five test rows.
test.head(n=5)

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,188533,Land,Rover LR2 Base,2015,98000,Gasoline,240.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,White,Beige,None reported,Yes
1,188534,Land,Rover Defender SE,2020,9142,Hybrid,395.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,Silver,Black,None reported,Yes
2,188535,Ford,Expedition Limited,2022,28121,Gasoline,3.5L V6 24V PDI DOHC Twin Turbo,10-Speed Automatic,White,Ebony,None reported,
3,188536,Audi,A6 2.0T Sport,2016,61258,Gasoline,2.0 Liter TFSI,Automatic,Silician Yellow,Black,None reported,
4,188537,Audi,A6 2.0T Premium Plus,2018,59000,Gasoline,252.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Black,None reported,Yes


In [None]:
# Show the column labels of the training frame.
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title',
       'price'],
      dtype='object')

In [None]:
# Show the column labels of the test frame.
test.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'clean_title'],
      dtype='object')

In [None]:
# Vehicle age in years, measured against one fixed reference year for both
# the training and the test frame.
REFERENCE_YEAR = 2025
for frame in (train, test):
    frame['car_age'] = REFERENCE_YEAR - frame['model_year']

In [None]:
# Show the (rows, columns) shape of the training frame.
train.shape

(188533, 14)

In [None]:
# Summary statistics for the mileage column of the training frame.
train['milage'].describe()

Unnamed: 0,milage
count,188533.0
mean,65705.295174
std,49798.158076
min,100.0
25%,24115.0
50%,57785.0
75%,95400.0
max,405000.0


In [None]:

# Remove the clean_title column from both the training and the test frame.
train = train.drop('clean_title', axis=1)
test = test.drop('clean_title', axis=1)

In [None]:
# ------------------------------
# Feature Engineering Function
# ------------------------------
def add_features(df):
    """Add the engine × transmission interaction column to ``df``.

    The ``engine`` and ``transmission`` columns are cast to strings and
    joined with an underscore into a new ``engine_transmission`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``engine`` and ``transmission`` columns. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``engine_transmission`` added.
    """
    engine_text = df["engine"].astype(str)
    transmission_text = df["transmission"].astype(str)

    # Engine × Transmission interaction
    df["engine_transmission"] = engine_text + "_" + transmission_text
    return df

# ------------------------------
# Run the feature engineering on the training frame, then the test frame
# ------------------------------
train, test = add_features(train), add_features(test)

In [None]:
# Show the training columns again to inspect the current feature set.
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'price', 'car_age',
       'engine_transmission'],
      dtype='object')

In [None]:
# Interaction: exterior color × interior color
for frame in (train, test):
    exterior = frame["ext_col"].astype(str)
    interior = frame["int_col"].astype(str)
    frame["color_combo"] = exterior + "_" + interior

In [None]:

# Interaction: model × exterior color (model first, then color, joined by "_")
for frame in (train, test):
    model_text = frame["model"].astype(str)
    exterior = frame["ext_col"].astype(str)
    frame["model_ext_color"] = model_text + "_" + exterior

In [None]:
# Interaction: brand × model
for frame in (train, test):
    brand_text = frame["brand"].astype(str)
    model_text = frame["model"].astype(str)
    frame["brand_model"] = brand_text + "_" + model_text

In [None]:
# Brands treated as luxury, based on outlier analysis.
# 'Land' is listed as-is because that is how the brand column spells it in
# this dataset (see the test preview above).
luxury_brands = ['Porsche', 'Mercedes-Benz', 'BMW', 'Bentley', 'Lamborghini', 'Land']

# Binary luxury flag plus its exact complement (the "regular" flag),
# created the same way on the training and the test frame.
for frame in (train, test):
    brand_is_luxury = frame['brand'].isin(luxury_brands)
    frame['is_luxury_brand'] = brand_is_luxury
    frame['is_regular_brand'] = ~brand_is_luxury

In [None]:
# Models treated as luxury, based on outlier analysis.
luxury_models = [
    '911 Carrera S',
    '911 Carrera',
    'AMG G 63 Base',
    '911 Turbo S',
    '911 GT3',
    '911 Carrera 4S',
    'Corvette Stingray w/2LT',
    'Rover Range Rover P530 SE LWB 7 Seat',
    'Corvette Stingray w/3LT',
    'Urus Base',
]

# Binary luxury flag plus its exact complement (the "regular" flag),
# created the same way on the training and the test frame.
for frame in (train, test):
    model_is_luxury = frame['model'].isin(luxury_models)
    frame['is_luxury_model'] = model_is_luxury
    frame['is_regular_model'] = ~model_is_luxury

In [None]:
# Flag cars whose mileage is above 200,000, stored as 0/1 integers.
for frame in (train, test):
    over_threshold = frame['milage'] > 200000
    frame['high_milage_flag'] = over_threshold.astype(int)

In [None]:
# Decade of the model year: floor-divide by 10, then scale back up
# (e.g. 2017 -> 2010).
for frame in (train, test):
    decade_index = frame['model_year'] // 10
    frame['model_decade'] = decade_index * 10

In [None]:
# Each luxury/regular flag with the two interaction columns derived from it.
# The row order below is the order in which the columns are created.
flag_interactions = [
    ('is_luxury_model', 'luxury_model_high_milage', 'luxury_model_decade'),
    ('is_regular_model', 'regular_model_high_milage', 'regular_model_decade'),
    ('is_luxury_brand', 'luxury_brand_high_milage', 'luxury_brand_decade'),
    ('is_regular_brand', 'regular_brand_high_milage', 'regular_brand_decade'),
]

# --- Interactions with high mileage ---
# Numeric product: 1 only when both the flag and high_milage_flag are set.
for frame in (train, test):
    for flag_col, milage_col, decade_col in flag_interactions:
        frame[milage_col] = frame[flag_col].astype(int) * frame['high_milage_flag']

# --- Interactions with model_decade ---
# (Kept as categorical text combos such as "1_2010" instead of numeric products.)
for frame in (train, test):
    for flag_col, milage_col, decade_col in flag_interactions:
        flag_text = frame[flag_col].astype(int).astype(str)
        frame[decade_col] = flag_text + "_" + frame['model_decade'].astype(str)

In [None]:
import pandas as pd

# Object-dtype columns of train; test is assumed to carry the same columns.
obj_cols = train.select_dtypes(include=['object']).columns

for col in obj_cols:
    # Count every value over train and test together so that both frames are
    # encoded with one shared mapping. Raw counts are used, not proportions.
    combined = pd.concat([train[col], test[col]])
    freq_map = combined.value_counts().to_dict()

    # Write the count of each row's value into a new "<col>_freq" column.
    for frame in (train, test):
        frame[col + "_freq"] = frame[col].map(freq_map)

In [None]:
# Show the training columns again, now including the "*_freq" columns.
train.columns

Index(['id', 'brand', 'model', 'model_year', 'milage', 'fuel_type', 'engine',
       'transmission', 'ext_col', 'int_col', 'accident', 'price', 'car_age',
       'engine_transmission', 'color_combo', 'model_ext_color', 'brand_model',
       'is_luxury_brand', 'is_regular_brand', 'is_luxury_model',
       'is_regular_model', 'high_milage_flag', 'model_decade',
       'luxury_model_high_milage', 'regular_model_high_milage',
       'luxury_brand_high_milage', 'regular_brand_high_milage',
       'luxury_model_decade', 'regular_model_decade', 'luxury_brand_decade',
       'regular_brand_decade', 'brand_freq', 'model_freq', 'fuel_type_freq',
       'engine_freq', 'transmission_freq', 'ext_col_freq', 'int_col_freq',
       'accident_freq', 'engine_transmission_freq', 'color_combo_freq',
       'model_ext_color_freq', 'brand_model_freq', 'luxury_model_decade_freq',
       'regular_model_decade_freq', 'luxury_brand_decade_freq',
       'regular_brand_decade_freq'],
      dtype='object')

In [None]:

import pandas as pd

# Object-dtype columns of train; test is assumed to carry the same columns.
obj_cols = train.select_dtypes(include=['object']).columns

for col in obj_cols:
    # Derive one category list from train and test together so a given value
    # receives the same integer code in both frames.
    combined = pd.concat([train[col], test[col]], axis=0).astype('category')
    categories = combined.cat.categories

    # Overwrite the text column with its integer category codes.
    # NOTE(review): pandas documents code -1 for values outside `categories`
    # (including missing values) — confirm that is acceptable downstream.
    for frame in (train, test):
        frame[col] = pd.Categorical(frame[col], categories=categories).codes

In [None]:
# Show the dtype of every training column.
train.dtypes

Unnamed: 0,0
id,int64
brand,int8
model,int16
model_year,int64
milage,int64
fuel_type,int8
engine,int16
transmission,int8
ext_col,int16
int_col,int16


In [None]:
pip install catboost



In [None]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor

# ------------------------------
# Data Preparation
# ------------------------------
# Name of the column the models are trained to predict.
target = 'price'

# Features are every column of `train` except the row id and the target.
y = train[target]
X = train.drop(['id', target], axis=1)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ------------------------------
# LightGBM parameters and feature sets
# ------------------------------
# Shared hyper-parameters for both LightGBM models.
lgb_params = {
    'subsample': 0.8,
    # LightGBM only performs row subsampling (bagging) when the bagging
    # frequency is non-zero. Its default is 0, so without this entry the
    # 'subsample' value above is silently ignored.
    'subsample_freq': 1,
    'reg_lambda': 0.1,
    'reg_alpha': 0.1,
    # NOTE(review): with max_depth=5 a tree can hold at most 2**5 = 32 leaves,
    # so num_leaves=50 is presumably never reached — confirm intent.
    'num_leaves': 50,
    'n_estimators': 800,
    'max_depth': 5,
    'learning_rate': 0.01,
    'colsample_bytree': 0.7,
    'objective': 'regression',
    'metric': 'rmse',
    'verbose': -1,
    'random_state': 42
}

# Input columns for the first LightGBM model.
lgb_features_1 = ['milage', 'engine', 'brand_model', 'car_age', 'transmission_freq',
                  'color_combo', 'is_luxury_brand', 'regular_brand_decade']
# Input columns for the second LightGBM model.
lgb_features_2 = ['milage', 'engine', 'brand', 'car_age', 'transmission_freq',
                  'regular_brand_decade', 'ext_col', 'is_regular_brand', 'model']

# ------------------------------
# CatBoost parameters and feature set
# ------------------------------
# Keyword arguments forwarded to CatBoostRegressor.
cat_params = dict(
    loss_function='RMSE',
    learning_rate=0.05,
    depth=6,
    iterations=500,
    subsample=0.8,
    colsample_bylevel=0.8,
    random_state=42,
    verbose=False,
    early_stopping_rounds=50,
    eval_metric='RMSE',
)

# Input columns for the CatBoost model. Despite the name, the notebook only
# uses this list to select columns; it is not passed to CatBoost as a
# categorical-feature declaration.
cat_features = [
    'milage',
    'engine',
    'regular_brand_decade',
    'brand_freq',
    'transmission',
    'ext_col',
    'car_age',
]

# ------------------------------
# Train models
# ------------------------------
def _validation_rmse(predictions):
    """Return the RMSE of ``predictions`` against the held-out ``y_val``."""
    return np.sqrt(mean_squared_error(y_val, predictions))


# LightGBM models: identical hyper-parameters, different feature subsets.
lgb_model_1 = lgb.LGBMRegressor(**lgb_params)
lgb_model_1.fit(X_train[lgb_features_1], y_train)

lgb_model_2 = lgb.LGBMRegressor(**lgb_params)
lgb_model_2.fit(X_train[lgb_features_2], y_train)

pred_lgb_1 = lgb_model_1.predict(X_val[lgb_features_1])
pred_lgb_2 = lgb_model_2.predict(X_val[lgb_features_2])

# CatBoost model: the validation split is supplied as eval_set, which is what
# the early_stopping_rounds entry in cat_params monitors.
# NOTE(review): the same validation rows are scored just below, so the
# CatBoost RMSE may be optimistic — confirm whether a separate hold-out is wanted.
cat_model = CatBoostRegressor(**cat_params)
cat_model.fit(
    X_train[cat_features],
    y_train,
    eval_set=(X_val[cat_features], y_val),
)
pred_cat = cat_model.predict(X_val[cat_features])

# Validation RMSE of each individual model.
rmse_lgb_1 = _validation_rmse(pred_lgb_1)
rmse_lgb_2 = _validation_rmse(pred_lgb_2)
rmse_cat = _validation_rmse(pred_cat)

print(f"RMSE LightGBM 1: {rmse_lgb_1:.4f}")
print(f"RMSE LightGBM 2: {rmse_lgb_2:.4f}")
print(f"RMSE CatBoost : {rmse_cat:.4f}")

# ------------------------------
# Ensemble function (inverse RMSE weighting)
# ------------------------------
def weighted_ensemble(predictions, rmses, y_true=None):
    """Blend predictions with inverse-RMSE weights and score the blend.

    Each model's weight is ``1 / rmse``, normalised so the weights sum to 1.

    Parameters
    ----------
    predictions : sequence of array-like
        One prediction vector per model, all of the same length.
    rmses : sequence of float
        One strictly positive, finite RMSE per model, in the same order as
        ``predictions``.
    y_true : array-like, optional
        Ground truth used to score the blend. When omitted, the notebook-level
        ``y_val`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    tuple
        ``(ensemble_pred, ensemble_rmse, weights)``: the blended predictions,
        their RMSE against ``y_true``, and the normalised weights.

    Raises
    ------
    ValueError
        If no models are given, if ``predictions`` and ``rmses`` differ in
        length, if any RMSE is not a positive finite number, or if ``y_true``
        and the blended predictions differ in shape.
    """
    if y_true is None:
        # Backward-compatible default: score against the validation target.
        y_true = y_val

    rmse_values = np.asarray(rmses, dtype=float)
    if rmse_values.ndim != 1 or rmse_values.size == 0:
        raise ValueError("rmses must be a non-empty 1-D sequence")
    if len(predictions) != rmse_values.size:
        # zip() would otherwise silently drop the unmatched entries while the
        # weights were still normalised over all RMSEs.
        raise ValueError("predictions and rmses must have the same length")
    if not np.all(np.isfinite(rmse_values)) or np.any(rmse_values <= 0):
        # A zero RMSE would give an infinite weight and NaN after normalising.
        raise ValueError("every rmse must be a positive finite number")

    weights = 1.0 / rmse_values
    weights = weights / weights.sum()

    ensemble_pred = np.sum(
        [w * np.asarray(p, dtype=float) for w, p in zip(weights, predictions)],
        axis=0,
    )

    truth = np.asarray(y_true, dtype=float)
    if truth.shape != ensemble_pred.shape:
        raise ValueError("y_true and the predictions must have the same shape")

    ensemble_rmse = np.sqrt(np.mean((truth - ensemble_pred) ** 2))
    return ensemble_pred, ensemble_rmse, weights

# ------------------------------
# Ensemble 1: LGB1 + Cat
# ------------------------------
members_1 = [pred_lgb_1, pred_cat]
scores_1 = [rmse_lgb_1, rmse_cat]
ensemble_pred_1, ensemble_rmse_1, weights_1 = weighted_ensemble(members_1, scores_1)
print(f"\nEnsemble 1 (LGB1+Cat) RMSE: {ensemble_rmse_1:.4f}, Weights: {weights_1}")

# ------------------------------
# Ensemble 2: LGB2 + Cat
# ------------------------------
members_2 = [pred_lgb_2, pred_cat]
scores_2 = [rmse_lgb_2, rmse_cat]
ensemble_pred_2, ensemble_rmse_2, weights_2 = weighted_ensemble(members_2, scores_2)
print(f"\nEnsemble 2 (LGB2+Cat) RMSE: {ensemble_rmse_2:.4f}, Weights: {weights_2}")

# ------------------------------
# Ensemble 3: LGB1 + LGB2 + Cat
# ------------------------------
members_3 = [pred_lgb_1, pred_lgb_2, pred_cat]
scores_3 = [rmse_lgb_1, rmse_lgb_2, rmse_cat]
ensemble_pred_3, ensemble_rmse_3, weights_3 = weighted_ensemble(members_3, scores_3)
print(f"\nEnsemble 3 (LGB1+LGB2+Cat) RMSE: {ensemble_rmse_3:.4f}, Weights: {weights_3}")

RMSE LightGBM 1: 67776.3715
RMSE LightGBM 2: 67775.9463
RMSE CatBoost : 67814.9731

Ensemble 1 (LGB1+Cat) RMSE: 67716.8413, Weights: [0.50014235 0.49985765]

Ensemble 2 (LGB2+Cat) RMSE: 67718.9142, Weights: [0.50014391 0.49985609]

Ensemble 3 (LGB1+LGB2+Cat) RMSE: 67711.6133, Weights: [0.33339589 0.33339799 0.33320612]


In [None]:
import os
import numpy as np
import pandas as pd

# ------------------------------
# Predict on Test Set for each model
# ------------------------------
# Each model receives the same feature subset it was fitted on.
X_test_lgb1 = test[lgb_features_1]
test_pred_lgb1 = lgb_model_1.predict(X_test_lgb1)

X_test_lgb2 = test[lgb_features_2]
test_pred_lgb2 = lgb_model_2.predict(X_test_lgb2)

X_test_cat = test[cat_features]
test_pred_cat = cat_model.predict(X_test_cat)

# ------------------------------
# Weighted Ensemble function
# ------------------------------
def weighted_ensemble_test(predictions, rmses):
    """Blend prediction vectors using normalised inverse-RMSE weights.

    Parameters
    ----------
    predictions : sequence of array-like
        One prediction vector per model.
    rmses : sequence of float
        One RMSE per model, in the same order as ``predictions``.

    Returns
    -------
    numpy.ndarray
        The weighted sum of the prediction vectors, where each weight is
        ``1 / rmse`` divided by the sum of all such inverses.
    """
    inverse_rmse = 1 / np.array(rmses)
    normalized = inverse_rmse / inverse_rmse.sum()
    scaled = [weight * pred for weight, pred in zip(normalized, predictions)]
    return np.sum(scaled, axis=0)

# ------------------------------
# Create output directory
# ------------------------------
output_dir = '/content/drive/MyDrive/Data Science Nigeria Hackathon/submission'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): the submission header is written as 'Price' while the training
# target column is 'price' — confirm the casing the competition expects.
# NOTE(review): ensemble_pred_1..3 are rebound here to test-set predictions,
# replacing the validation-set arrays that carried the same names earlier.

# ------------------------------
# Ensemble 1: LGB1 + Cat
# ------------------------------
ensemble_pred_1 = weighted_ensemble_test(
    [test_pred_lgb1, test_pred_cat],
    [rmse_lgb_1, rmse_cat],
)
submission_1 = pd.DataFrame({'id': test['id'], 'Price': ensemble_pred_1})
submission_1.to_csv(f'{output_dir}/submission_ensemble_lgb1_cat.csv', index=False)

# ------------------------------
# Ensemble 2: LGB2 + Cat
# ------------------------------
ensemble_pred_2 = weighted_ensemble_test(
    [test_pred_lgb2, test_pred_cat],
    [rmse_lgb_2, rmse_cat],
)
submission_2 = pd.DataFrame({'id': test['id'], 'Price': ensemble_pred_2})
submission_2.to_csv(f'{output_dir}/submission_ensemble_lgb2_cat.csv', index=False)

# ------------------------------
# Ensemble 3: LGB1 + LGB2 + Cat
# ------------------------------
ensemble_pred_3 = weighted_ensemble_test(
    [test_pred_lgb1, test_pred_lgb2, test_pred_cat],
    [rmse_lgb_1, rmse_lgb_2, rmse_cat],
)
submission_3 = pd.DataFrame({'id': test['id'], 'Price': ensemble_pred_3})
submission_3.to_csv(f'{output_dir}/submission_ensemble_lgb1_lgb2_cat.csv', index=False)

print("✅ All three ensemble predictions saved successfully!")

✅ All three ensemble predictions saved successfully!
