<a href="https://www.kaggle.com/code/isseimatsuzoe/regression-used-cars-issei?scriptVersionId=200416505" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Competition data directory, defined once so both reads stay in sync.
DATA_DIR = '/kaggle/input/playground-series-s4e9'

# The 'id' column of each file becomes the DataFrame index.
full_data = pd.read_csv(f'{DATA_DIR}/train.csv', index_col='id')
full_test_data = pd.read_csv(f'{DATA_DIR}/test.csv', index_col='id')

# EDA

In [None]:
# Check that the data has been loaded correctly
full_data.head(20)

**Note**
* Null values in fuel_type can be inferred from the engine description.
* Break the engine information down into horsepower, displacement in liters, cylinder count, and fuel type.

In [None]:
full_data.info()

In [None]:
full_data['model'].nunique()

# Missing Values

In [None]:
full_data.isnull().sum()

In [None]:
# Side-by-side maps of where values are missing in the train and test frames.
# Explicit axes and titles make it clear which panel is which.
fig, (ax_train, ax_test) = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(full_data.isnull(), cbar=False, yticklabels=False, ax=ax_train)
ax_train.set_title('Missing values: train')
sns.heatmap(full_test_data.isnull(), cbar=False, yticklabels=False, ax=ax_test)
ax_test.set_title('Missing values: test')
fig.tight_layout()

**No feature is missing so much data that it needs to be dropped entirely, so let's fill in the missing values instead.**

In [None]:
# Inspect the distinct values of clean_title to work out what NaN stands for
full_data.clean_title.unique()

In [None]:
full_data.loc[full_data['clean_title'].isnull()].head(10)

NaN values in clean_title mean "No".

In [None]:
full_data.fuel_type.unique()

In [None]:
# Accident-status counts among the rows whose clean_title is missing.
# The counts are computed once and reused for both axes.
accident_counts = full_data.loc[full_data['clean_title'].isnull(), 'accident'].value_counts()

ax = sns.barplot(x=accident_counts.values,
                 y=accident_counts.index,
                 palette="viridis")
ax.set(title='Accident status where clean_title is missing',
       xlabel='count',
       ylabel='accident');

**The accident feature and NaN values in clean_title are not correlated, so fill missing accident values with 'None reported'.**

In [None]:
# Rows with no fuel_type; show the columns that describe the vehicle.
null_index = full_data[full_data['fuel_type'].isnull()]
null_index.loc[:, ['brand', 'model', 'engine']].head(15)

**We assume NaN values in fuel_type mean the vehicle is electric.**

In [None]:
full_data.loc[full_data['accident'].isnull()].head(10)

In [None]:
# Fill NaN values.
# The filled column is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained in-place form emits a FutureWarning in pandas 2.x
# and does not update the parent frame once Copy-on-Write is enabled.
fill_values = {
    'clean_title': 'No',
    'fuel_type': 'Electric',
    'accident': 'None reported',
}
for frame in (full_data, full_test_data):
    for column, value in fill_values.items():
        frame[column] = frame[column].fillna(value)

# Data Preprocessing

**Separate the engine feature into three features: horsepower, litter (displacement in liters), and cylinder_count. Ignore the engine type, since many values do not include that information.**

In [None]:
import re
def extract_engine_features(df):
    """Add numeric engine features parsed from the free-text ``engine`` column.

    Three columns are written to ``df`` in place:

    * ``horsepower``     - the number directly before ``HP`` (e.g. ``172.0HP``)
    * ``litter``         - displacement: a decimal number before ``L`` or `` Liter``
    * ``cylinder_count`` - the integer before `` Cylinder``

    A row gets a missing value for a feature when its pattern is not found,
    or when the ``engine`` entry is not a string (e.g. NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``engine`` column; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # Compile each pattern once so every row is searched a single time per feature.
    horsepower_re = re.compile(r'(\d+(?:\.\d+)?)HP')
    displacement_re = re.compile(r'(\d+\.\d+)(?:L| Liter)')
    cylinder_re = re.compile(r'(\d+) Cylinder')

    def _parse(text, pattern, cast):
        """Return cast(first captured group) or None when there is nothing to parse."""
        if not isinstance(text, str):
            # Missing engine descriptions (NaN/None) simply yield a missing feature.
            return None
        match = pattern.search(text)
        return cast(match.group(1)) if match else None

    # Extract Horsepower
    df['horsepower'] = df['engine'].apply(lambda text: _parse(text, horsepower_re, float))

    # Extract Displacement
    df['litter'] = df['engine'].apply(lambda text: _parse(text, displacement_re, float))

    # Extract Cylinder Count
    df['cylinder_count'] = df['engine'].apply(lambda text: _parse(text, cylinder_re, int))

    return df
# Add the engine features to both frames (each is modified in place), then
# preview the training frame. Only the last expression of a cell is rendered,
# so the preview goes last instead of a bare call that shows the whole test frame.
extract_engine_features(full_data)
extract_engine_features(full_test_data)
full_data.head()

**Drop unnecessary columns**

In [None]:
# Remove 'model' and 'engine' from both frames in place.
for frame in (full_data, full_test_data):
    frame.drop(columns=['model', 'engine'], inplace=True)

**Impute the missing values in the new columns**

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Use the iterative imputer since horsepower and litter might be correlated.
# It is fitted on the training data only and the fitted imputer is reused for the
# test set, so both sets are imputed by the same model (the same fit/transform
# split that is used for cylinder_count below).
engine_numeric_cols = ['horsepower', 'litter']
iter_imputer = IterativeImputer(max_iter=10, tol=1e-3, random_state=0)
full_data[engine_numeric_cols] = iter_imputer.fit_transform(full_data[engine_numeric_cols])
full_test_data[engine_numeric_cols] = iter_imputer.transform(full_test_data[engine_numeric_cols])

# Use a simple (mean) imputer for cylinder count.
simple_imputer = SimpleImputer(strategy='mean')
full_data['cylinder_count'] = simple_imputer.fit_transform(full_data[['cylinder_count']])
full_test_data['cylinder_count'] = simple_imputer.transform(full_test_data[['cylinder_count']])

# Handling Categorical Columns

**Standardise transmission column**

In [None]:
def map_transmission(transmission):
    """Collapse a free-text transmission description into a coarse category.

    The text is stripped and lower-cased, then tested against keyword groups
    in priority order; the first group with a keyword contained in the text
    wins. Missing or non-string values (None, NaN, ...) map to 'Other'.

    Parameters
    ----------
    transmission : str
        Raw transmission description, e.g. ``'6-Speed A/T'``.

    Returns
    -------
    str
        One of 'Automatic', 'Manual', 'Variator', 'Tiptronic' or 'Other'.
    """
    if not isinstance(transmission, str):
        # A missing value carries no keyword, so it belongs with the unmatched texts.
        return 'Other'

    text = transmission.strip().lower()

    # Order matters: a text such as "Automatic CVT" is classified as 'Automatic'
    # because the automatic keywords are checked before the CVT ones.
    keyword_groups = (
        ('Automatic', ('a/t', 'automatic')),
        ('Manual', ('m/t', 'manual')),
        ('Variator', ('cvt', 'variator')),
        ('Tiptronic', ('tiptronic',)),
    )
    for label, keywords in keyword_groups:
        if any(keyword in text for keyword in keywords):
            return label
    return 'Other'

# Map every raw transmission string in both frames to its coarse category.
for frame in (full_data, full_test_data):
    frame['transmission'] = frame['transmission'].apply(map_transmission)

# Category counts for train, a separator line, then the counts for test.
print(full_data['transmission'].value_counts(),
      '=' * 20,
      full_test_data['transmission'].value_counts(),
      sep='\n')

In [None]:
full_data['accident'].unique()

In [None]:
def custom_imputer(df):
    """Encode the two binary text columns of ``df`` as 0/1, in place.

    * ``accident``: 'At least 1 accident or damage reported' -> 1,
      'None reported' -> 0
    * ``clean_title``: 'Yes' -> 1, 'No' -> 0

    Values that are not keys of a mapping are left unchanged, so calling the
    function again on an already-encoded frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``accident`` and ``clean_title`` columns; modified
        in place.

    Returns
    -------
    None
    """
    binary_encodings = {
        'accident': {
            'At least 1 accident or damage reported': 1,
            'None reported': 0,
        },
        'clean_title': {
            'Yes': 1,
            'No': 0,
        },
    }
    for column, mapping in binary_encodings.items():
        # replace() downcasting an object column to int is deprecated in pandas;
        # infer_objects() makes the numeric dtype explicit so the column does not
        # stay `object` (which downstream estimators reject).
        df[column] = df[column].replace(mapping).infer_objects()
#Apply the function
custom_imputer(full_data)
custom_imputer(full_test_data)

In [None]:
full_data.head()

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Ordinal Encoding categorical variables.
# OrdinalEncoder is the feature-side counterpart of LabelEncoder (which is meant
# for target labels). With handle_unknown='use_encoded_value', a category that
# appears only in the test set is encoded as -1 instead of raising an error.
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ['brand', 'fuel_type', 'ext_col', 'int_col']
present_columns = [col for col in categorical_columns if col in full_data.columns]

ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
full_data[present_columns] = ordinal_encoder.fit_transform(full_data[present_columns])
full_test_data[present_columns] = ordinal_encoder.transform(full_test_data[present_columns])

# OneHot Encoding.
# `sparse_output` replaces the `sparse` argument, which was deprecated in
# scikit-learn 1.2 and removed in a later release.
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
OH_encoder.fit(full_data[['transmission']])

# Readable, string-typed column names such as 'transmission_Automatic'.
onehot_columns = OH_encoder.get_feature_names_out()

# Build the one-hot frames on the original indices so they align in concat().
encoded_data = pd.DataFrame(OH_encoder.transform(full_data[['transmission']]),
                            index=full_data.index,
                            columns=onehot_columns)
encoded_test_data = pd.DataFrame(OH_encoder.transform(full_test_data[['transmission']]),
                                 index=full_test_data.index,
                                 columns=onehot_columns)

# Swap the raw transmission column for its one-hot columns.
processed_full_data = pd.concat([full_data, encoded_data], axis=1).drop(['transmission'], axis=1)
processed_test_full_data = pd.concat([full_test_data, encoded_test_data], axis=1).drop(['transmission'], axis=1)

In [None]:
full_test_data.head()

# Fit a model (XGBRegressor)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 20% of the rows for validation.
y = processed_full_data['price']
X = processed_full_data.drop(columns='price')

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, train_size=0.8, random_state=0
)

# Baseline: XGBoost with its default hyperparameters.
primal_model = XGBRegressor()
primal_model.fit(X_train, y_train)

baseline_mae = mean_absolute_error(primal_model.predict(X_valid), y_valid)
print('MAE: ', baseline_mae)


**Hyper Parameter Tuning**

In [None]:
# early_stopping_rounds is configured on the estimator: passing it to fit() was
# deprecated in XGBoost 1.6 and is rejected by newer releases.
model_1 = XGBRegressor(n_estimators=200,
                       n_jobs=4,
                       learning_rate=0.1,
                       max_depth=4,
                       early_stopping_rounds=5)

# Early stopping monitors the last entry of eval_set, i.e. the validation split.
evaluation_set = [(X_train, y_train), (X_valid, y_valid)]
model_1.fit(X_train, y_train,
            eval_set=evaluation_set,
            verbose=False)

# scikit-learn metrics take (y_true, y_pred).
print('MAE: ', mean_absolute_error(y_valid, model_1.predict(X_valid)))


In [None]:
#Use hyperopt to tune
from sklearn.metrics import accuracy_score, mean_squared_error
from xgboost import XGBRegressor
from hyperopt import hp, Trials, tpe, fmin, STATUS_OK

# Initialize the search space. quniform draws are floats and are cast to int
# inside objective(); the plain numbers are fixed settings shared by all trials.
space = {'max_depth': hp.quniform("max_depth", 6, 10, 1),
         'gamma': hp.uniform('gamma', 0, 5),
         'reg_alpha': hp.quniform('reg_alpha', 0, 5, 1),
         # colsample_bytree must lie in (0, 1]; a lower bound of 0 is not valid.
         'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
         'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
         'n_estimators': 200,
         'seed': 0,
         'n_jobs': 4,
         'learning_rate': 0.08
         }

# Validation RMSE of every evaluated trial, in evaluation order.
rmse_list = []


def objective(space):
    """Train one XGBRegressor for a sampled configuration and score it.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space (same keys as ``space`` above).

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}``, where ``rmse`` is the root
        mean squared error on the validation split. hyperopt minimises
        ``loss``. The RMSE is also appended to ``rmse_list``.
    """
    clf = XGBRegressor(n_estimators=int(space['n_estimators']),
                       max_depth=int(space['max_depth']),
                       gamma=space['gamma'],
                       reg_alpha=int(space['reg_alpha']),
                       min_child_weight=int(space['min_child_weight']),
                       # Must stay a float: int() would truncate any fraction below 1 to 0.
                       colsample_bytree=space['colsample_bytree'],
                       learning_rate=space['learning_rate'],
                       random_state=space['seed'],
                       n_jobs=space['n_jobs'],
                       # Set on the estimator; fit() no longer accepts it in newer XGBoost.
                       early_stopping_rounds=5)
    evaluation_set = [(X_train, y_train), (X_valid, y_valid)]

    clf.fit(X_train, y_train,
            eval_set=evaluation_set,
            verbose=False)

    pred = clf.predict(X_valid)

    # np.sqrt(MSE) avoids the `squared=False` flag, which scikit-learn deprecated.
    rmse = float(np.sqrt(mean_squared_error(y_valid, pred)))
    rmse_list.append(rmse)

    # This is a regression task, so the search minimises RMSE directly;
    # classification accuracy is not a meaningful score for predicted prices.
    return {'loss': rmse, 'status': STATUS_OK}


trials = Trials()


best_hyperparams = fmin(fn=objective,
                        space=space,
                        algo=tpe.suggest,
                        max_evals=1,  # 100 is usual, to save time on saving the file, the value is set to 1.
                        trials=trials)

print('lowest RMSE is: ', min(rmse_list))

In [None]:
# Report the hyperparameters returned by fmin, followed by the smallest RMSE
# recorded in rmse_list across the evaluated trials.
print("The best hyperparameters are : ","\n")
print(best_hyperparams)
print('with RMSE: ', min(rmse_list))

In [None]:
# Hyperparameters are hard-coded rather than read from best_hyperparams
# (presumably copied from an earlier, longer tuning run -- TODO confirm).
# early_stopping_rounds is set on the estimator because passing it to fit()
# was deprecated in XGBoost 1.6 and is rejected by newer releases.
model_2 = XGBRegressor(colsample_bytree=0.359317272602521,
                       gamma=3.8173416444223367,
                       max_depth=7,
                       min_child_weight=6,
                       reg_alpha=2,
                       n_estimators=200,
                       n_jobs=4,
                       learning_rate=0.08,
                       random_state=0,
                       early_stopping_rounds=5)

# Define the evaluation sets here instead of relying on a variable left over
# from an earlier cell; early stopping monitors the last entry (validation).
evaluation_set = [(X_train, y_train), (X_valid, y_valid)]
model_2.fit(X_train, y_train,
            eval_set=evaluation_set,
            verbose=False)

# scikit-learn metrics take (y_true, y_pred).
print('MAE: ', mean_absolute_error(y_valid, model_2.predict(X_valid)))

# Submit Final model

In [None]:
# Predict prices for the preprocessed test features with model_2.
X_test = processed_test_full_data
preds = model_2.predict(X_test)

# Save test predictions to file: one 'id'/'price' row per test record,
# written without the frame's own positional index.
submission_columns = {'id': X_test.index, 'price': preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)