In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import datetime
from re import findall, search, match
%matplotlib inline

In [3]:
df = pd.read_csv('./data/Dubizzle_used_car_sales.csv')
df.head(2)

Unnamed: 0,title,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,fuel_type,steering_side,year,color,emirate,motors_trim,company,model,date_posted
0,MITSUBISHI PAJERO 3.5L / 2013,26000,167390,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,Unknown,Gasoline,Left Hand Side,2013.0,Silver,Dubai,GLS,mitsubishi,pajero,13/05/2022
1,chevrolet silverado,110000,39000,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,North American Specs,400 - 500 HP,Gasoline,Left Hand Side,2018.0,White,Sharjah,1500 High Country,chevrolet,silverado,14/01/2022


In [4]:
df.info(), df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9970 entries, 0 to 9969
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   title                 9965 non-null   object 
 1   price_in_aed          9970 non-null   int64  
 2   kilometers            9970 non-null   int64  
 3   body_condition        9970 non-null   object 
 4   mechanical_condition  9970 non-null   object 
 5   seller_type           9970 non-null   object 
 6   body_type             9970 non-null   object 
 7   no_of_cylinders       9889 non-null   object 
 8   transmission_type     9970 non-null   object 
 9   regional_specs        9970 non-null   object 
 10  horsepower            9970 non-null   object 
 11  fuel_type             9970 non-null   object 
 12  steering_side         9970 non-null   object 
 13  year                  9000 non-null   float64
 14  color                 9970 non-null   object 
 15  emirate              

(None,
        price_in_aed    kilometers         year
 count  9.970000e+03  9.970000e+03  9000.000000
 mean   2.480496e+05  9.894896e+04  2015.626444
 std    4.536549e+05  1.154300e+06     4.735991
 min    6.000000e+03  0.000000e+00  1953.000000
 25%    5.100000e+04  2.314350e+04  2014.000000
 50%    1.040000e+05  7.600000e+04  2016.000000
 75%    2.340000e+05  1.330828e+05  2019.000000
 max    1.499900e+07  1.150000e+08  2021.000000)

--------------------
## 결측치 확인

In [5]:
for i in df.columns:
    print(f"{i} = {df[i].isnull().sum()}")

title = 5
price_in_aed = 0
kilometers = 0
body_condition = 0
mechanical_condition = 0
seller_type = 0
body_type = 0
no_of_cylinders = 81
transmission_type = 0
regional_specs = 0
horsepower = 0
fuel_type = 0
steering_side = 0
year = 970
color = 0
emirate = 0
motors_trim = 28
company = 0
model = 0
date_posted = 0


In [6]:
non_na_like = ['unknown', 'Unknown', 'NONE', 'none', 'null', 'Null', 'NA', 'na', 'N/A', 'n/a', 'nan', 'NaN', "NAN", '-', '', ' ']


def ch_null():
    result = {}

    for col in df.columns:
        col_data = df[col]
        # NaN 
        na_count = col_data.isnull().sum()
        # 의미상 결측치 
        unknown_count = col_data.astype(str).str.strip().isin(non_na_like).sum()
        result[col] = {
            'na_count': na_count,
            'non_na_like_count': unknown_count
        }

    summary = pd.DataFrame(result).T
    print(summary)

ch_null()

                      na_count  non_na_like_count
title                        5                  5
price_in_aed                 0                  0
kilometers                   0                  0
body_condition               0                  0
mechanical_condition         0                  0
seller_type                  0                  0
body_type                    0                  0
no_of_cylinders             81                138
transmission_type            0                  0
regional_specs               0                  0
horsepower                   0                814
fuel_type                    0                  0
steering_side                0                  0
year                       970                970
color                        0                  0
emirate                      0                  0
motors_trim                 28                 28
company                      0                  0
model                        0                  0


In [7]:
def to_none(data):
    if data in non_na_like:
        return None
    else:
        return data

In [8]:
for i in df.columns:
    df[i] = df[i].apply(to_none)

In [9]:
ch_null()

                      na_count  non_na_like_count
title                        5                  5
price_in_aed                 0                  0
kilometers                   0                  0
body_condition               0                  0
mechanical_condition         0                  0
seller_type                  0                  0
body_type                    0                  0
no_of_cylinders            138                 81
transmission_type            0                  0
regional_specs               0                  0
horsepower                 814                  0
fuel_type                    0                  0
steering_side                0                  0
year                       970                970
color                        0                  0
emirate                      0                  0
motors_trim                 28                 28
company                      0                  0
model                        0                  0


-------------
## year 결측치 처리
year 의 결측치가 제일 많기 때문에 먼저 처리.   
삭제하기 전에 title 에 해당 연도가 작성 되어있는 경우가 있으니 타이틀에서 년도를 가져와 채워 넣는다.   
그래도 안되는 부분은 삭제.

In [10]:
def find_year(row):
    existing_year = row['year']
    title_text = str(row['title'])

    if pd.isna(existing_year) or existing_year in non_na_like:
        found_years = findall(r'(19[5-9][0-9]|20[0-1][0-9]|202[0-4]|2025)', title_text)
        
        if found_years:
            return float(found_years[0])
        else:
            return None
    else:
        return existing_year

df["year"] = df.apply(find_year, axis=1)

In [11]:
df[df["year"].isna()]

Unnamed: 0,title,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,fuel_type,steering_side,year,color,emirate,motors_trim,company,model,date_posted
394,Toyota Land Cruiser GXR 3.3 Twin Turbo,340000,0,Perfect inside and out,Perfect inside and out,Owner,SUV,6,Automatic Transmission,GCC Specs,150 - 200 HP,Diesel,Left Hand Side,,White,Dubai,GXR,toyota,land-cruiser,13/05/2022
441,A4 Advanced 35 TFSI 150hp (Ref#06147) *AVAILAB...,175000,5000,Perfect inside and out,Perfect inside and out,Dealership/Certified Pre-Owned,Sedan,4,Automatic Transmission,GCC Specs,150 - 200 HP,Gasoline,Left Hand Side,,Grey,Abu Dhabi,Other,audi,a4,02/03/2022
466,volkswagen id4,115000,0,Perfect inside and out,Perfect inside and out,Dealer,Crossover,,Automatic Transmission,Other,,Electric,Left Hand Side,,Grey,Dubai,Crozz,volkswagen,id4,11/05/2022
508,Brand New BMW 760Li M Class Dealer warranty GCC,539000,10,Perfect inside and out,Perfect inside and out,Dealer,Sedan,12,Automatic Transmission,GCC Specs,500 - 600 HP,Gasoline,Left Hand Side,,Blue,Dubai,760Li xDrive,bmw,7-series,15/01/2022
536,A4 Advanced 35 TFSI 150hp (Ref#06062) *AVAILAB...,175000,5000,Perfect inside and out,Perfect inside and out,Dealership/Certified Pre-Owned,Sedan,4,Automatic Transmission,GCC Specs,150 - 200 HP,Gasoline,Left Hand Side,,White,Abu Dhabi,Other,audi,a4,17/03/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9390,LEXUS LX 600 F SPORT LAUNCH EDITION 1/100,650000,100,Perfect inside and out,Perfect inside and out,Owner,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,,Silver,Dubai,,lexus,lx600,13/05/2022
9473,Panamera 4 EHybrid. Aventurine Green/ Satin wh...,675000,2000,Perfect inside and out,Perfect inside and out,Owner,Sedan,6,Automatic Transmission,GCC Specs,400 - 500 HP,Hybrid,Left Hand Side,,Green,Dubai,Other,porsche,panamera,07/05/2022
9682,Lotus Evora GT,399000,5345,Perfect inside and out,Perfect inside and out,Dealership/Certified Pre-Owned,Coupe,6,Automatic Transmission,GCC Specs,500 - 600 HP,Gasoline,Left Hand Side,,Green,Dubai,Other,lotus,evora,14/03/2022
9735,Mercedes G63 Amg Gcc Gergash Warranty And serv...,999000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,GCC Specs,500 - 600 HP,Gasoline,Left Hand Side,,Green,Dubai,G 63 AMG,mercedes-benz,g-class,08/04/2022


------------
## moters_trim 결측치 처리
### title에서 moters_trim 의 값이 포함된 경우도 존재하기 때문에 title에서 가져와 결측치 채우기

### 확인 결과:   렉서스 LX600 모델이 제일 많다 하지만 LX600 의 트림은 타이틀에 존재하기 때문에 채워넣을 수 있다.

In [12]:
df[df["motors_trim"].isnull()]

Unnamed: 0,title,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,fuel_type,steering_side,year,color,emirate,motors_trim,company,model,date_posted
494,Lexus LX600 F Sport 2022 0km GCC,739000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,,Gasoline,Left Hand Side,2022.0,Silver,Dubai,,lexus,lx600,04/04/2022
657,2022 LX600 VIP Launch Edition DEALER WARRANTY,729000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,2022.0,Grey,Dubai,,lexus,lx600,05/05/2022
671,(LHD) Lexus LX600 F Sport 3.5P AT MY2022 Blac...,800000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,Other,400 - 500 HP,Gasoline,Left Hand Side,2022.0,Black,Dubai,,lexus,lx600,22/04/2022
937,Brand new Lexus LX600 signature,650000,100,Perfect inside and out,Perfect inside and out,Owner,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,,Other Color,Sharjah,,lexus,lx600,27/04/2022
956,Lexus LX600 VIP Launch Edition 2022 - For Export,769000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,GCC Specs,500 - 600 HP,Gasoline,Left Hand Side,2022.0,Gold,Dubai,,lexus,lx600,25/04/2022
1025,2022 Mercedes Benz GLE 53 AMG 4Matic+ | Brand ...,589000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,2022.0,White,Dubai,,mercedes-benz,gle-class,25/01/2022
1200,BRAND NEW - LEXUS LX 600 - F SPORT - 2022- GCC...,735000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,2022.0,White,Dubai,,lexus,lx600,25/04/2022
1374,LX600 3.5L VIP KURO (GRANDEUR+) 4-SEATER*EXPOR...,950000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,,Black,Dubai,,lexus,lx600,30/03/2022
2378,2022 Lexus LX600 Signature/Sonic Titanium / GC...,680000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,2022.0,Silver,Dubai,,lexus,lx600,15/04/2022
2494,"2022 Model,Gcc , Under Warranty and Contract S...",695000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,5,Automatic Transmission,GCC Specs,150 - 200 HP,Gasoline,Left Hand Side,2022.0,Grey,Dubai,,lexus,lx600,19/03/2022


In [13]:
def lx(row):
    if not row["model"] == "lx600":
        return row["motors_trim"]
    
    if not type(row["title"]) == str:
        return row["motors_trim"]
    
    title_text = row["title"].lower()
    if pd.isna(row["motors_trim"]):
        m_trim = findall(r'(standard|premium|sport|vip|signature|f-sport)', title_text)

        if m_trim:
            return "".join(m_trim)
        else:
            return None
    else:
        return row["motors_trim"]

In [14]:
df["motors_trim"] = df.apply(lx, axis=1)

In [15]:
ch_null()

                      na_count  non_na_like_count
title                        5                  5
price_in_aed                 0                  0
kilometers                   0                  0
body_condition               0                  0
mechanical_condition         0                  0
seller_type                  0                  0
body_type                    0                  0
no_of_cylinders            138                 81
transmission_type            0                  0
regional_specs               0                  0
horsepower                 814                  0
fuel_type                    0                  0
steering_side                0                  0
year                       164                164
color                        0                  0
emirate                      0                  0
motors_trim                  7                  4
company                      0                  0
model                        0                  0


----------
## 실린더와 마력

In [16]:
ch_null()

                      na_count  non_na_like_count
title                        5                  5
price_in_aed                 0                  0
kilometers                   0                  0
body_condition               0                  0
mechanical_condition         0                  0
seller_type                  0                  0
body_type                    0                  0
no_of_cylinders            138                 81
transmission_type            0                  0
regional_specs               0                  0
horsepower                 814                  0
fuel_type                    0                  0
steering_side                0                  0
year                       164                164
color                        0                  0
emirate                      0                  0
motors_trim                  7                  4
company                      0                  0
model                        0                  0


In [None]:
df[["no_of_cylinders", "horsepower"]]

Unnamed: 0,title,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,fuel_type,steering_side,year,color,emirate,motors_trim,company,model,date_posted
164,Brand New Renault Zoe - Full Electric - REG/I...,120000,0,Perfect inside and out,Perfect inside and out,Dealer,Hatchback,,Automatic Transmission,GCC Specs,Less than 150 HP,Electric,Left Hand Side,2020.0,White,Dubai,Other,renault,other,14/12/2021
204,Honda pilot,25000,220000,Perfect inside and out,Perfect inside and out,Owner,SUV,,Automatic Transmission,GCC Specs,,Gasoline,Left Hand Side,2011.0,Burgundy,Dubai,Other,honda,pilot,07/05/2022
337,Polestar 2 SRSM Barley Colored Nappa Leather P...,159650,12,Perfect inside and out,Perfect inside and out,Dealer,SUV,,Automatic Transmission,Other,300 - 400 HP,Electric,Left Hand Side,2022.0,White,Dubai,Other,polestar,2,12/05/2022
417,2022 VOLKSWAGEN ID.6 CROZZ PRO LONG RANGE RWD ...,185999,9,Perfect inside and out,Perfect inside and out,Dealer,SUV,,Automatic Transmission,European Specs,,Electric,Left Hand Side,2022.0,Blue,Dubai,Other,volkswagen,id6,12/05/2022
437,Toyota Coaster 2014 Gcc Engine Diesel 30 Seats,77000,87000,Perfect inside and out,Perfect inside and out,Dealer,Van,,Manual Transmission,GCC Specs,,Diesel,Left Hand Side,2014.0,White,Dubai,Other,toyota,other,23/03/2022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8978,Audi Q5 - Low Milage - S-LINE - METALLIC GREY ...,100000,66000,"A bit of wear & tear, all repaired","Minor faults, all fixed",Owner,SUV,,Automatic Transmission,GCC Specs,,Gasoline,Left Hand Side,2016.0,Grey,Dubai,S-line,audi,q5,11/05/2022
9295,Volkswagen ID.4 X - PURE PLUS - WHT - 2yr Gold...,139000,41,Perfect inside and out,Perfect inside and out,Dealer,Crossover,,Automatic Transmission,Other,200 - 300 HP,Electric,Left Hand Side,,White,Dubai,Other,volkswagen,id4,28/04/2022
9392,PURE+CROZZ SKY VIEW ROOF LONG RANGE 555KMS,112000,0,Perfect inside and out,Perfect inside and out,Dealer,Crossover,,Automatic Transmission,GCC Specs,200 - 300 HP,Electric,Left Hand Side,2021.0,Blue,Dubai,Crozz,volkswagen,id4,06/04/2022
9465,S560 Coupe,380000,69000,"No accidents, very few faults",Perfect inside and out,Owner,Coupe,,Automatic Transmission,GCC Specs,,Gasoline,Left Hand Side,2018.0,White,Dubai,S 560 Coupe,mercedes-benz,s-class-coupe,26/04/2022


--------------
### 타이틀이 너무 난잡하고 뒤의 정보와 겹치는 부분이 많기 때문에 제거

In [None]:
df = df.drop("title", axis=1)

--------------
## date_post 타입변경
### date_posted 가 string 형식이기 때문에 date 형식으로 변경   

In [None]:
def chdt(data):
    return datetime.datetime.strptime(data.strftime("%Y-%m-%d"), "%Y-%m-%d")

In [None]:
df["date_posted"] = pd.to_datetime(df["date_posted"])
df["date_posted"] = df["date_posted"].apply(chdt)

  df["date_posted"] = pd.to_datetime(df["date_posted"])


In [None]:
df.head(2)

Unnamed: 0,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,fuel_type,steering_side,year,color,emirate,motors_trim,company,model,date_posted
0,26000,167390,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,,Gasoline,Left Hand Side,2013.0,Silver,Dubai,GLS,mitsubishi,pajero,2022-05-13
1,110000,39000,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,North American Specs,400 - 500 HP,Gasoline,Left Hand Side,2018.0,White,Sharjah,1500 High Country,chevrolet,silverado,2022-01-14


### 이상치 확인

1. 중고 판매 게시일 보다 차량 제조년도가 더 늦은 경우 존재 -> 삭제

In [None]:
def asd(data):
    if str(data)[:-2].isnumeric():
        data = str(data)[:-2] if len(str(data)) > 4 else str(data)
        return datetime.datetime.strptime(data, "%Y")
    else:
        return None


df[df["year"].apply(asd).dt.year > df["date_posted"].dt.year]

Unnamed: 0,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,fuel_type,steering_side,year,color,emirate,motors_trim,company,model,date_posted
30,1090000,0,Perfect inside and out,Perfect inside and out,Dealer,Wagon,8,Automatic Transmission,GCC Specs,500 - 600 HP,Gasoline,Left Hand Side,2022.0,White,Dubai,G 63 AMG,mercedes-benz,g-class,2021-12-18
282,1595000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,12,Automatic Transmission,European Specs,600 - 700 HP,Gasoline,Left Hand Side,2022.0,Other Color,Dubai,Other,maybach,other,2021-11-27
325,520000,70,Perfect inside and out,Perfect inside and out,Dealer,Sedan,8,Automatic Transmission,GCC Specs,400 - 500 HP,Gasoline,Left Hand Side,2022.0,White,Dubai,750Li xDrive,bmw,7-series,2021-12-08
769,36500,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,4,Automatic Transmission,Other,Less than 150 HP,Gasoline,Left Hand Side,2022.0,Grey,Dubai,GLX,suzuki,dzire,2021-08-22
794,1990000,100,Perfect inside and out,Perfect inside and out,Dealer,Sedan,12,Automatic Transmission,GCC Specs,500 - 600 HP,Gasoline,Left Hand Side,2025.0,Black,Dubai,Standard,rolls-royce,ghost,2022-04-23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9452,357000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,Other,400 - 500 HP,Diesel,Left Hand Side,2022.0,White,Dubai,Other,toyota,land-cruiser,2021-12-11
9716,1395000,0,Perfect inside and out,Perfect inside and out,Dealer,Sedan,12,Automatic Transmission,GCC Specs,600 - 700 HP,Gasoline,Left Hand Side,2022.0,White,Dubai,Other,mercedes-benz,s-class,2021-12-27
9807,729000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,European Specs,500 - 600 HP,Gasoline,Left Hand Side,2023.0,Grey,Abu Dhabi,110,land-rover,defender,2022-04-19
9963,79000,25,Perfect inside and out,Perfect inside and out,Dealer,SUV,4,Automatic Transmission,GCC Specs,150 - 200 HP,Gasoline,Left Hand Side,2022.0,Brown,Dubai,Comfort,jetour,x70,2021-10-20


In [None]:
df = df.drop(index = df[df["year"].apply(asd).dt.year > df["date_posted"].dt.year].index, axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9905 entries, 0 to 9969
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   price_in_aed          9905 non-null   int64         
 1   kilometers            9905 non-null   int64         
 2   body_condition        9905 non-null   object        
 3   mechanical_condition  9905 non-null   object        
 4   seller_type           9905 non-null   object        
 5   body_type             9905 non-null   object        
 6   no_of_cylinders       9767 non-null   object        
 7   transmission_type     9905 non-null   object        
 8   regional_specs        9905 non-null   object        
 9   horsepower            9091 non-null   object        
 10  fuel_type             9905 non-null   object        
 11  steering_side         9905 non-null   object        
 12  year                  9741 non-null   float64       
 13  color                 9

### 그리고 년월일 컬럼으로 각각 분리

In [None]:
df["year_posted"] = df["date_posted"].dt.year
df["month_posted"] = df["date_posted"].dt.month
df["day_posted"] = df["date_posted"].dt.day
df = df.drop("date_posted", axis=1)
df

Unnamed: 0,price_in_aed,kilometers,body_condition,mechanical_condition,seller_type,body_type,no_of_cylinders,transmission_type,regional_specs,horsepower,...,steering_side,year,color,emirate,motors_trim,company,model,year_posted,month_posted,day_posted
0,26000,167390,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,,...,Left Hand Side,2013.0,Silver,Dubai,GLS,mitsubishi,pajero,2022,5,13
1,110000,39000,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,North American Specs,400 - 500 HP,...,Left Hand Side,2018.0,White,Sharjah,1500 High Country,chevrolet,silverado,2022,1,14
2,78000,200000,Perfect inside and out,Perfect inside and out,Dealer,Sedan,6,Automatic Transmission,GCC Specs,400 - 500 HP,...,Left Hand Side,2014.0,Blue,Sharjah,E 300,mercedes-benz,e-class,2022,5,5
3,899000,27000,Perfect inside and out,Perfect inside and out,Dealer,Hard Top Convertible,8,Automatic Transmission,GCC Specs,600 - 700 HP,...,Left Hand Side,2018.0,Red,Dubai,Standard,ferrari,488-spider,2022,4,30
4,33000,69000,Perfect inside and out,Perfect inside and out,Owner,Wagon,4,Manual Transmission,GCC Specs,Less than 150 HP,...,Left Hand Side,2020.0,White,Dubai,Standard,renault,dokker,2022,5,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9964,225000,0,Perfect inside and out,Perfect inside and out,Dealer,SUV,6,Automatic Transmission,GCC Specs,200 - 300 HP,...,Left Hand Side,2021.0,White,Dubai,Limited,jeep,grand-cherokee,2022,2,1
9965,215000,105777,Perfect inside and out,Perfect inside and out,Dealer,SUV,8,Automatic Transmission,GCC Specs,500 - 600 HP,...,Left Hand Side,2015.0,White,Dubai,Autobiography,land-rover,range-rover,2021,11,18
9966,90000,55640,Perfect inside and out,Perfect inside and out,Owner,SUV,8,Automatic Transmission,GCC Specs,400 - 500 HP,...,Left Hand Side,2014.0,White,Dubai,Platinum,cadillac,escalade,2022,5,11
9968,18900,140000,"No accidents, very few faults",Perfect inside and out,Owner,Sedan,4,Automatic Transmission,GCC Specs,150 - 200 HP,...,Left Hand Side,2013.0,White,Dubai,LS,chevrolet,malibu,2022,3,6


------------
## 범주형 변수 원핫 인코딩
### 범주의 변수의 값이 전부 string 으로 되어있기 때문에 int 형으로 변환

In [None]:
# import re

# complier = re.compile("[a-zA-Z]")

# le = LabelEncoder()

# for i in df.columns:
#     if complier.search(str(df[i].iloc[0])):
#         df[i] = le.fit_transform(df[i])

# df