In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import *
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import RFE, RFECV
import warnings
import re
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('car_prices.csv', index_col = 0)
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


### clean_title ustunini tashlab yuboramiz chunki bu ustunda bizga ma'no beradigan jihatlar yo'q

In [4]:
df = df.drop('clean_title', axis = 1)

### Avtomobillarning yillik bosib o'tgan masofalari uchun ham ustun yaratib olamiz

In [5]:
df['milage_per_year'] = df['milage'] / (2024 - df['model_year'])

### milage va price uchun qoshimcha ustunlar yaratib olamiz (bins)

In [6]:
milage_bins = [df['milage'].min(), df['milage'].quantile(1/3), df['milage'].quantile(2/3), df['milage'].max()]
price_bins = [df['price'].min(), df['price'].quantile(1/3), df['price'].quantile(2/3), df['price'].max()]

bin_labels = ['Low', 'Medium', 'High']

df['milage_bins'] = pd.cut(df['milage'], bins = milage_bins, labels=bin_labels, include_lowest=True)

In [7]:
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,milage_per_year,milage_bins
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,12391.5,Medium
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,4705.882353,Medium
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,6099.4,High
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500,1218.5,Low
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850,4826.086957,High


### HP (Horse-Power) ustunini yasab olamiz

In [8]:
df['horse_power'] = df['engine'].apply(lambda x: int(x.split('.0HP')[0]) if x.split('.0HP')[0].isdigit() else np.nan)
df['horse_power'].isna().sum()

np.int64(4057)

In [9]:
mini2 = df.sample(1000)
mini2.to_csv('cars2.csv')

In [10]:
df['Car_Age'] = 2024 - df['model_year']

### Monotonic usul bilan brand ustunini numerical qilamiz

In [11]:
brand_monotonic = df.groupby('brand')['price'].mean()

df['brand_monotonic'] = df['brand'].map(brand_monotonic)

In [12]:
df.head(3)

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,milage_per_year,milage_bins,horse_power,Car_Age,brand_monotonic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,12391.5,Medium,375.0,6,38154.063227
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,4705.882353,Medium,300.0,17,40276.029448
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,6099.4,High,300.0,15,34840.403933


In [13]:
for i in df['transmission'].unique():
    print(i)

10-Speed A/T
6-Speed M/T
6-Speed A/T
Transmission w/Dual Shift Mode
A/T
5-Speed M/T
7-Speed A/T
5-Speed A/T
8-Speed A/T
Transmission Overdrive Switch
9-Speed Automatic
7-Speed M/T
10-Speed Automatic
6-Speed Automatic
M/T
5-Speed Automatic
CVT Transmission
9-Speed A/T
8-Speed Automatic
4-Speed A/T
Automatic
1-Speed A/T
8-Speed Automatic with Auto-Shift
7-Speed DCT Automatic
Automatic CVT
7-Speed Automatic
7-Speed Automatic with Auto-Shift
4-Speed Automatic
6-Speed Automatic with Auto-Shift
6-Speed Manual
7-Speed Manual
6-Speed Electronically Controlled Automatic with O
1-Speed Automatic
10-Speed Automatic with Overdrive
8-Speed Manual
2-Speed A/T
CVT-F
–
F
9-Speed Automatic with Auto-Shift
7-Speed
Variable
SCHEDULED FOR OR IN PRODUCTION
6-Speed
6 Speed At/Mt
6 Speed Mt


In [14]:
automatic = df['transmission'].str.contains('Automatic|A/T')
df['Automatic'] = automatic
df['Manual'] = ~automatic

In [15]:
df['horse_power'] = df['horse_power'].fillna(df.groupby('brand')['horse_power'].transform('median'))

In [16]:
df['horse_power'].isna().sum()

np.int64(1)

In [17]:
df['brand'].isna().sum()

np.int64(0)

In [18]:
global_median = df['horse_power'].median()
df['horse_power'] = df['horse_power'].fillna(global_median)

In [19]:
print(df['horse_power'].isna().sum())

0


In [20]:
df.head()

Unnamed: 0_level_0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,price,milage_per_year,milage_bins,horse_power,Car_Age,brand_monotonic,Automatic,Manual
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,11000,12391.5,Medium,375.0,6,38154.063227,True,False
1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,8250,4705.882353,Medium,300.0,17,40276.029448,False,True
2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,15000,6099.4,High,300.0,15,34840.403933,True,False
3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,63500,1218.5,Low,335.0,2,40276.029448,False,True
4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,7850,4826.086957,High,200.0,23,17526.060403,True,False


In [25]:
cols_for_encode = ['fuel_type', 'ext_col', 'int_col', 'accident', 'Automatic', 'Manual']
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

for col in df[cols_for_encode]:
    df[col] = label_encoder.fit_transform(df[col])

In [26]:
df.select_dtypes('number')

Unnamed: 0_level_0,model_year,milage,fuel_type,ext_col,int_col,accident,price,milage_per_year,horse_power,Car_Age,brand_monotonic,Automatic,Manual
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2018,74349,2,26,57,1,11000,12391.500000,375.0,6,38154.063227,1,0
1,2007,80000,2,17,9,1,8250,4705.882353,300.0,17,40276.029448,0,1
2,2009,91491,2,181,6,1,15000,6099.400000,300.0,15,34840.403933,1,0
3,2022,2437,3,100,24,1,63500,1218.500000,335.0,2,40276.029448,0,1
4,2001,111000,2,249,9,1,7850,4826.086957,200.0,23,17526.060403,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54268,2017,29000,2,249,24,1,29000,4142.857143,445.0,7,40276.029448,1,0
54269,2015,94634,1,17,9,0,6500,10514.888889,220.0,9,37091.368241,1,0
54270,2013,40989,2,249,9,0,18950,3726.272727,420.0,11,63742.154930,0,1
54271,2023,1518,2,12,24,1,194965,1518.000000,375.0,1,63742.154930,1,0


In [27]:
df.to_csv("car_prices_new.csv")
