In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import re
import lightgbm as lgb
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/used-car-dataset/sample_submission.csv
/kaggle/input/used-car-dataset/train.csv
/kaggle/input/used-car-dataset/test.csv


In [3]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

df_train = pd.read_csv('/kaggle/input/used-car-dataset/train.csv')
df_test = pd.read_csv('/kaggle/input/used-car-dataset/test.csv')

In [4]:
### Drop ID column
df_train.drop(columns='id', inplace=True)
df_test.drop(columns='id', inplace=True)


### Clean the 'Accident' column
df_train.loc[df_train['accident'] == 'None reported', 'accidentbool'] = False
df_train.loc[df_train['accident'] == 'At least 1 accident or damage reported', 'accidentbool'] = True
df_train.drop(columns='accident', inplace = True)

df_test.loc[df_test['accident'] == 'None reported', 'accidentbool'] = False
df_test.loc[df_test['accident'] == 'At least 1 accident or damage reported', 'accidentbool'] = True
df_test.drop(columns='accident', inplace = True)


### Clean the 'clean_title' column
df_train.loc[df_train['clean_title'] == 'Yes', 'cleanbool'] = True
df_train.drop(columns='clean_title', inplace = True)

df_test.loc[df_test['clean_title'] == 'Yes', 'cleanbool'] = True
df_test.drop(columns='clean_title', inplace = True)



############ Clean the 'Engine' column
def hp_extract(row):
    if 'HP' in row:
        hpower, rest = row.split('HP', 1)
        return pd.Series([hpower.strip(), rest.strip()])
    else:
        return pd.Series([np.nan, row])

def litters_extract(row):
    pattern = r'(\d+(?:\.\d+)?L)'  # Regex to capture only the liter value (integer or decimal)
    match = re.search(pattern, row)  # Search for the pattern in the row
    
    if match:  # If a match is found
        litrage = match.group(1).strip()  # Extract the matched liter value (e.g., '4.75L' or '2L')
        rest = row.replace(litrage, '').strip()  # Remove the matched liter value from the row
        return pd.Series([litrage, rest])  # Return the extracted liter and remaining text
    else:
        return pd.Series([np.nan, row])  # If no match, return NaN for litrage

def litters_extract2(row):
    if 'Liter' in row:
        hpower, rest = row.split('Liter', 1)

        return pd.Series([hpower.strip(), rest.strip()])
    else:
        return pd.Series([np.nan, row])
    
def get_fuel(fuel, row):
    if fuel in row:
        eng = row.replace(fuel, '').strip()
        return eng
    else:
        return row  
                      
    
# Keep this in order! Bc working in order stripping the texts
df_train[['horse_power', 'engine']] = df_train['engine'].apply(hp_extract)
df_train[['litrage', 'engine']] = df_train['engine'].apply(litters_extract)
df_train[['litrage2', 'engine']] = df_train['engine'].apply(litters_extract2)

df_train['litrage'] = df_train['litrage'].astype(str).str.rstrip('L')
df_train['litrage'] = df_train['litrage'].astype(float)
df_train['litrage2'] = df_train['litrage2'].astype(float)

df_train['litrage'] = df_train['litrage'].fillna(df_train['litrage2'])
df_train.drop(columns='litrage2', inplace = True)
#############################################################################

# Now, stripping the fuel type. There is already a 'fuel' column!

fueltp_list = ['Gasoline Fuel', 'Gasoline/Mild Electric Hybrid', 'Diesel Fuel', 'Gas/Electric Hybrid', 'Flex Fuel Capability',
              'Flexible Fuel', 'Diesel', 'Plug-In Electric/Gas', 'Hybrid']

for fuel in fueltp_list:
    df_train['engine'] = df_train['engine'].apply(lambda row: get_fuel(fuel, row))

## Change type of horse_power and litrage
df_train['horse_power'] = df_train['horse_power'].astype(float)
df_train['litrage'] = df_train['litrage'].astype(float)



# Keep this in order! Bc working in order stripping the texts
df_test[['horse_power', 'engine']] = df_test['engine'].apply(hp_extract)
df_test[['litrage', 'engine']] = df_test['engine'].apply(litters_extract)
df_test[['litrage2', 'engine']] = df_test['engine'].apply(litters_extract2)

df_test['litrage'] = df_test['litrage'].astype(str).str.rstrip('L')
df_test['litrage'] = df_test['litrage'].astype(float)
df_test['litrage2'] = df_test['litrage2'].astype(float)

df_test['litrage'] = df_test['litrage'].fillna(df_test['litrage2'])
df_test.drop(columns='litrage2', inplace = True)
#############################################################################

# Now, stripping the fuel type. There is already a 'fuel' column!

fueltp_list = ['Gasoline Fuel', 'Gasoline/Mild Electric Hybrid', 'Diesel Fuel', 'Gas/Electric Hybrid', 'Flex Fuel Capability',
              'Flexible Fuel', 'Diesel', 'Plug-In Electric/Gas', 'Hybrid']

for fuel in fueltp_list:
    df_test['engine'] = df_test['engine'].apply(lambda row: get_fuel(fuel, row))

## Change type of horse_power and litrage
df_test['horse_power'] = df_test['horse_power'].astype(float)
df_test['litrage'] = df_test['litrage'].astype(float)


In [5]:
brands_list = df_train['brand'].unique()
results = []
for brand in brands_list:

    bavgprice = df_train.loc[df_train['brand'] == brand, 'price'].mean()
    
    results.append({'brand': brand, 'avg_price': bavgprice})

avg_price_df = pd.DataFrame(results)

avg_price_df.loc[avg_price_df['avg_price'] > 0, 'brand_range'] = '1'
avg_price_df.loc[avg_price_df['avg_price'] > 26000, 'brand_range'] = '2'
avg_price_df.loc[avg_price_df['avg_price'] > 40000, 'brand_range'] = '3'
avg_price_df.loc[avg_price_df['avg_price'] > 50000, 'brand_range'] = '4'
avg_price_df.loc[avg_price_df['avg_price'] > 70000, 'brand_range'] = '5'



avg_price_df.sort_values(by='avg_price', ascending=False)

def funct(row):
    aux = avg_price_df.loc[avg_price_df['brand'] == row, 'brand_range'].values[0]
    return aux

df_train['brand_type'] = df_train['brand'].apply(funct)
df_test['brand_type'] = df_test['brand'].apply(funct)

In [6]:
df_train

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,price,accidentbool,cleanbool,horse_power,litrage,brand_type
0,MINI,Cooper S Base,2007,213000,Gasoline,4 Cylinder Engine,A/T,Yellow,Gray,4200,False,True,172.0,1.6,1
1,Lincoln,LS V8,2002,143250,Gasoline,8 Cylinder Engine,A/T,Silver,Beige,4999,True,True,252.0,3.9,2
2,Chevrolet,Silverado 2500 LT,2002,136731,E85 Flex Fuel,8 Cylinder Engine,A/T,Blue,Gray,13900,False,True,320.0,5.3,3
3,Genesis,G90 5.0 Ultimate,2017,19500,Gasoline,8 Cylinder Engine,Transmission w/Dual Shift Mode,Black,Black,45000,False,True,420.0,5.0,4
4,Mercedes-Benz,Metris Base,2021,7388,Gasoline,4 Cylinder Engine,7-Speed A/T,Black,Beige,97500,False,True,208.0,2.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,Cadillac,Escalade ESV Platinum,2017,49000,Gasoline,8 Cylinder Engine,Transmission w/Dual Shift Mode,White,Beige,27500,False,True,420.0,6.2,3
188529,Mercedes-Benz,AMG C 43 AMG C 43 4MATIC,2018,28600,Gasoline,V6 Cylinder Engine,8-Speed A/T,White,Black,30000,True,True,385.0,3.0,4
188530,Mercedes-Benz,AMG GLC 63 Base 4MATIC,2021,13650,Gasoline,8 Cylinder Engine,7-Speed A/T,White,Black,86900,False,True,469.0,4.0,4
188531,Audi,S5 3.0T Prestige,2022,13895,Gasoline,,1-Speed Automatic,Daytona Gray Pearl Effect,Black,84900,False,,,3.0,3


In [15]:
# Define the Tables
X = df_train
X = X.drop(columns='price')
y = df_train.price

# Define the columns that are going to be used
numerical_cols = ['model_year', 'milage', 'horse_power', 'litrage']
ordinal_cols  = ['brand_type']
onehot_cols  = ['fuel_type', 'cleanbool', 'accidentbool']

# Separate test and validation of the df_train dataframe
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Defining strategies for numerical and categorical columns
numerical_transformer = SimpleImputer(strategy='mean')

onehot_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

ordinal_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder())
])

# Bundle the preprocessor

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('onehot', onehot_transformer, onehot_cols),
        ('ordinal', ordinal_transformer, ordinal_cols)
    ])

# Modeling



# Hiperparametre optimizasyonu için Grid Search


param_grid = {
    'regressor__num_leaves': [550],  # İlk num_leaves değeri 550 idi
    'regressor__max_depth': [12],  # İlk max_depth değeri 12 idi
    'regressor__learning_rate': [0.025],  # İlk learning_rate değeri 0.025 idi
    'regressor__n_estimators': [500],  # İlk n_estimators değeri 500 idi
    'regressor__min_child_samples': [130],  # İlk min_data_in_leaf değeri 130 idi
    'regressor__subsample': [1.0],  # İlk subsample değeri 1.0 idi
    'regressor__colsample_bytree': [1.0],  # İlk colsample_bytree değeri 1.0 idi
    'regressor__min_split_gain': [0.0],  # min_split_gain, ilk parametrelerde yoktu, varsayılan değeri kullanabiliriz
    'regressor__reg_alpha': [10.0],  # İlk reg_alpha değeri 10.0 idi
    'regressor__reg_lambda': [0.021544346900318832],  # İlk reg_lambda değeri
    'regressor__scale_pos_weight': [1],  # scale_pos_weight, ilk parametrelerde yoktu, varsayılan değeri kullanabiliriz
    'regressor__max_bin': [255]  # max_bin, ilk parametrelerde yoktu, varsayılan değeri kullanabiliriz
}


model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', lgb.LGBMRegressor(random_state=42))
])

In [16]:
model.fit(X_train, y_train)
preds = model.predict(X_valid)
print(np.sqrt(mean_squared_error(preds, y_valid)))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.017602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 593
[LightGBM] [Info] Number of data points in the train set: 150826, number of used features: 13
[LightGBM] [Info] Start training from score 43831.242989
69348.01008273374


In [17]:
## Run model and make it spit the submissioN!
X = df_test
prediccion = model.predict(X)

submission = pd.DataFrame()
temp = pd.read_csv('/kaggle/input/used-car-dataset/sample_submission.csv')
submission['id'] = temp['id']
submission['price'] = prediccion
submission.to_csv('submission4.csv',index=False)


In [1]:
df_train

NameError: name 'df_train' is not defined