In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import re
import lightgbm as lgb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/used-car-dataset/sample_submission.csv
/kaggle/input/used-car-dataset/train.csv
/kaggle/input/used-car-dataset/test.csv


In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df_train = pd.read_csv('/kaggle/input/used-car-dataset/train.csv')
df_test = pd.read_csv('/kaggle/input/used-car-dataset/test.csv')

In [3]:
df_train['fuel_type'].fillna('special', inplace=True)
         

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['fuel_type'].fillna('special', inplace=True)


In [None]:
# Transform model year to car_age
df_train['car_age'] = 2024 - df_train['model_year'] 
df_test['car_age'] = 2024 - df_test['model_year'] 

df_train.drop(columns='model_year', inplace=True)
df_test.drop(columns='model_year', inplace=True)


### Drop ID column
df_train.drop(columns='id', inplace=True)
df_test.drop(columns='id', inplace=True)


### Clean the 'Accident' column
df_train.loc[df_train['accident'] == 'None reported', 'accidentbool'] = False
df_train.loc[df_train['accident'] == 'At least 1 accident or damage reported', 'accidentbool'] = True
df_train.drop(columns='accident', inplace = True)

df_test.loc[df_test['accident'] == 'None reported', 'accidentbool'] = False
df_test.loc[df_test['accident'] == 'At least 1 accident or damage reported', 'accidentbool'] = True
df_test.drop(columns='accident', inplace = True)


### Clean the 'clean_title' column
df_train.loc[df_train['clean_title'] == 'Yes', 'cleanbool'] = True
df_train.drop(columns='clean_title', inplace = True)

df_test.loc[df_test['clean_title'] == 'Yes', 'cleanbool'] = True
df_test.drop(columns='clean_title', inplace = True)



############ Clean the 'Engine' column
def hp_extract(row):
    if 'HP' in row:
        hpower, rest = row.split('HP', 1)
        return pd.Series([hpower.strip(), rest.strip()])
    else:
        return pd.Series([np.nan, row])

def litters_extract(row):
    pattern = r'(\d+(?:\.\d+)?L)'  # Regex to capture only the liter value (integer or decimal)
    match = re.search(pattern, row)  # Search for the pattern in the row
    
    if match:  # If a match is found
        litrage = match.group(1).strip()  # Extract the matched liter value (e.g., '4.75L' or '2L')
        rest = row.replace(litrage, '').strip()  # Remove the matched liter value from the row
        return pd.Series([litrage, rest])  # Return the extracted liter and remaining text
    else:
        return pd.Series([np.nan, row])  # If no match, return NaN for litrage

def litters_extract2(row):
    if 'Liter' in row:
        hpower, rest = row.split('Liter', 1)

        return pd.Series([hpower.strip(), rest.strip()])
    else:
        return pd.Series([np.nan, row])
    
def get_fuel(fuel, row):
    if fuel in row:
        eng = row.replace(fuel, '').strip()
        return eng
    else:
        return row  
                      
    
# Keep this in order! Bc working in order stripping the texts
df_train[['horse_power', 'engine']] = df_train['engine'].apply(hp_extract)
df_train[['litrage', 'engine']] = df_train['engine'].apply(litters_extract)
df_train[['litrage2', 'engine']] = df_train['engine'].apply(litters_extract2)

df_train['litrage'] = df_train['litrage'].astype(str).str.rstrip('L')
df_train['litrage'] = df_train['litrage'].astype(float)
df_train['litrage2'] = df_train['litrage2'].astype(float)

df_train['litrage'] = df_train['litrage'].fillna(df_train['litrage2'])
df_train.drop(columns='litrage2', inplace = True)
#############################################################################

# Now, stripping the fuel type. There is already a 'fuel' column!

fueltp_list = ['Gasoline Fuel', 'Gasoline/Mild Electric Hybrid', 'Diesel Fuel', 'Gas/Electric Hybrid', 'Flex Fuel Capability',
              'Flexible Fuel', 'Diesel', 'Plug-In Electric/Gas', 'Hybrid']

for fuel in fueltp_list:
    df_train['engine'] = df_train['engine'].apply(lambda row: get_fuel(fuel, row))

## Change type of horse_power and litrage
df_train['horse_power'] = df_train['horse_power'].astype(float)
df_train['litrage'] = df_train['litrage'].astype(float)



# Keep this in order! Bc working in order stripping the texts
df_test[['horse_power', 'engine']] = df_test['engine'].apply(hp_extract)
df_test[['litrage', 'engine']] = df_test['engine'].apply(litters_extract)
df_test[['litrage2', 'engine']] = df_test['engine'].apply(litters_extract2)

df_test['litrage'] = df_test['litrage'].astype(str).str.rstrip('L')
df_test['litrage'] = df_test['litrage'].astype(float)
df_test['litrage2'] = df_test['litrage2'].astype(float)

df_test['litrage'] = df_test['litrage'].fillna(df_test['litrage2'])
df_test.drop(columns='litrage2', inplace = True)
#############################################################################

# Now, stripping the fuel type. There is already a 'fuel' column!

fueltp_list = ['Gasoline Fuel', 'Gasoline/Mild Electric Hybrid', 'Diesel Fuel', 'Gas/Electric Hybrid', 'Flex Fuel Capability',
              'Flexible Fuel', 'Diesel', 'Plug-In Electric/Gas', 'Hybrid']

for fuel in fueltp_list:
    df_test['engine'] = df_test['engine'].apply(lambda row: get_fuel(fuel, row))

## Change type of horse_power and litrage
df_test['horse_power'] = df_test['horse_power'].astype(float)
df_test['litrage'] = df_test['litrage'].astype(float)


brands_list = df_train['brand'].unique()
results = []
for brand in brands_list:

    bavgprice = df_train.loc[df_train['brand'] == brand, 'price'].mean()
    
    results.append({'brand': brand, 'avg_price': bavgprice})

avg_price_df = pd.DataFrame(results)

avg_price_df.loc[avg_price_df['avg_price'] > 0, 'brand_range'] = '1'
avg_price_df.loc[avg_price_df['avg_price'] > 26000, 'brand_range'] = '2'
avg_price_df.loc[avg_price_df['avg_price'] > 40000, 'brand_range'] = '3'
avg_price_df.loc[avg_price_df['avg_price'] > 50000, 'brand_range'] = '4'
avg_price_df.loc[avg_price_df['avg_price'] > 70000, 'brand_range'] = '5'



avg_price_df.sort_values(by='avg_price', ascending=False)

def funct(row):
    aux = avg_price_df.loc[avg_price_df['brand'] == row, 'brand_range'].values[0]
    return aux

df_train['brand_type'] = df_train['brand'].apply(funct)
df_test['brand_type'] = df_test['brand'].apply(funct)


############# NEW CODE##################################################################################################################################

def extract_transmission(row):
    split_patterns = ['-Speed', '-SPEED', ' Speed']
    
    for pattern in split_patterns:
        if pattern in row:
            outp = row.split(pattern)
            return outp
    return [np.nan, row]

df_test[['speed', 'transrest']] = df_test['transmission'].apply(extract_transmission).apply(pd.Series)
df_train[['speed', 'transrest']] = df_train['transmission'].apply(extract_transmission).apply(pd.Series)
df_train.loc[df_train['speed'] == 'Single', 'speed'] = 1 ## "Single Speed -> 1 "
df_test.loc[df_train['speed'] == 'Single', 'speed'] = 1 ## "Single Speed -> 1 "


automatic = ['A/T', ' A/T',' Automatic', 'Automatic', ' AT', ' Automatic with Overdrive', ' DCT Automatic',]
manual = [' Manual', ' M/T', 'M/T', ' Mt', 'Manual']
nicer = ['Transmission w/Dual Shift Mode', '',
       ' Automatic with Auto-Shift', 'CVT Transmission',
       'Automatic CVT',
       'Transmission Overdrive Switch',
       ' Electronically Controlled Automatic with O', 'F', 'CVT-F', 
         '–', '2', ' At/Mt', '', 'Variable', ' Fixed Gear', 'SCHEDULED FOR OR IN PRODUCTION']
def classify_transmission(row):
    if row in automatic:
        return 3
    elif row in manual:
        return 2
    elif row in nicer:
        return 1
df_train['transrest'] = df_train['transrest'].apply(classify_transmission)
df_test['transrest'] = df_test['transrest'].apply(classify_transmission)


############################## Adjust the variable types
df_train['accidentbool'] = df_train['accidentbool'].astype('bool')
df_train['cleanbool'] = df_train['cleanbool'].astype('bool')
df_train['brand_type'] = df_train['brand_type'].astype('int')

df_test['accidentbool'] = df_test['accidentbool'].astype('bool')
df_test['cleanbool'] = df_test['cleanbool'].astype('bool')
df_test['brand_type'] = df_test['brand_type'].astype('int')

In [None]:
############################## Dropping columns not to use now
df_train.drop(columns=['fuel_type', 'brand', 'model', 'engine', 'transmission', 'ext_col', 'int_col', 'speed'], inplace=True)
df_test.drop(columns=['fuel_type', 'brand', 'model', 'engine', 'transmission', 'ext_col', 'int_col', 'speed'], inplace=True)

In [None]:
import optuna

import lightgbm as lgb

from sklearn.linear_model import Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Define the Tables
X = df_train
X = X.drop(columns='price')
y = df_train.price

# Separate test and validation of the df_train dataframe
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)



def evaluate_model(model, X_train, X_valid, y_train, y_valid, model_name):
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    model_scores.append((model_name, rmse))
    print(f"{model_name} RMSE: {rmse:.4f}")
    return rmse


model_scores = []
best_hyperparams = {}

def objective_lightgbm(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    return rmse

# Define the models and their respective objective functions
model_objectives = {
    "LightGBM": objective_lightgbm,
}

# Optimize each model with Optuna and store the results
for model_name, objective in model_objectives.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=100, show_progress_bar=True)
    
    #Store the best result
    model_scores.append((model_name, study.best_value))
    best_hyperparams[model_name] = study.best_params

# Print all model scores
for model_name, rmse in model_scores:
    print(f"{model_name}: RMSE = {rmse:.4f}")

# Optional: Convert model_scores to a DataFrame for better readability
scores_df = pd.DataFrame(model_scores, columns=['Model', 'RMSE']).sort_values(by='RMSE')
print(scores_df)

# Print the best hyperparameters for each model
for model_name, params in best_hyperparams.items():
    print(f"\nBest hyperparameters for {model_name}:")
    print(params)

In [None]:
params = {'n_estimators': 127, 'max_depth': 10, 'learning_rate': 0.04974279965108004, 
          'subsample': 0.9761778392845826, 'colsample_bytree': 0.5397069795902089, 
          'lambda_l2': 0.8140077440135021, 'lambda_l1': 0.2809474022561914, 'min_child_weight': 5}
model = lgb.LGBMRegressor(**params)
model.fit(X_train, y_train)


X = df_test
prediccion = model.predict(X)

submission = pd.DataFrame()
temp = pd.read_csv('/kaggle/input/used-car-dataset/sample_submission.csv')
submission['id'] = temp['id']
submission['price'] = prediccion
submission.to_csv('submission.csv',index=False)