In [1]:
import os

import joblib
import pandas as pd
from category_encoders import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [2]:
# Location of the raw car list. Set the CAR_LIST_CSV environment variable to
# point at the file on another machine; when it is unset the original
# hard-coded path is used, so existing runs behave exactly as before.
CAR_LIST_CSV = os.environ.get(
    "CAR_LIST_CSV",
    "/Users/hanhyeongu/Section3/Project/data/car_list.csv",
)

# The file is decoded as cp949; header=1 takes the column names from the
# second line of the file and ignores the line before it.
car_df = pd.read_csv(CAR_LIST_CSV, encoding='cp949', header=1)
car_df.head()

Unnamed: 0,모델명,제조사,유종,배기량,공차중량,변속형식,도심주행연비,고속도로주행연비,복합연비,등급,...,엔진형식,최고출력(ps/rpm),최고토크(kgm/rpm),연료공급방식,변속형식.1,굴림방식,타이어사이즈(전),타이어사이즈(후),연료탱크용량(ℓ),자동차유형
0,DS4 Crossback 1.5 BlueHDi,,경유,"1,499cc","1,475kg",자동8,14.9㎞/ℓ,18.1㎞/ℓ,16.2㎞/ℓ,1등급,...,YH01,"131/3,750","30.61/1,750",직접분사,자동8,FF,205/55R19,205/55R19,53.0,일반형
1,Q2 35 TDI,아우디,경유,"1,968cc","1,475kg",자동7,15.5㎞/ℓ,18.5㎞/ℓ,16.7㎞/ℓ,1등급,...,DTT,150/3000-4200,36.7/1600-2750,직접분사,자동7,FF,235/40R19,235/40R19,50.0,다목적형
2,아반떼 1.6GDI 하이브리드 (16“타이어)(22MY),현대,휘발유,"1,580cc","1,335kg",자동6,21.4㎞/ℓ,20.7㎞/ℓ,21.1㎞/ℓ,1등급,...,G4LE,105/5700,15.0/4000,직접분사,자동6,FF,205/55R16,205/55R16,42.0,일반형
3,아반떼 1.6GDI 하이브리드 (17“타이어)(22MY),현대,휘발유,"1,580cc","1,350kg",자동6,20.3㎞/ℓ,18.6㎞/ℓ,19.5㎞/ℓ,1등급,...,G4LE,105/5700,15.0/4000,직접분사,자동6,FF,225/45R17,225/45R17,42.0,일반형
4,Peugeot New 308 1.5 BlueHDi,,경유,"1,499cc","1,390kg",자동8,15.6㎞/ℓ,19.6㎞/ℓ,17.2㎞/ℓ,1등급,...,YH01,"131/3,750","30.61/1,750",직접분사,자동8,FF,225/40R18,225/40R18,53.0,일반형


In [3]:
# Display the column labels of the loaded frame.
car_df.columns

Index(['모델명', '제조사', '유종', '배기량', '공차중량', '변속형식', '도심주행연비', '고속도로주행연비', '복합연비',
       '등급', 'CO2배출량', '연비신고구분', '국산/수입', '자동차형식', '자동차종류', '승차정원', '엔진형식',
       '최고출력(ps/rpm)', '최고토크(kgm/rpm)', '연료공급방식', '변속형식.1', '굴림방식',
       '타이어사이즈(전)', '타이어사이즈(후)', '연료탱크용량(ℓ)', '자동차유형'],
      dtype='object')

In [4]:
# Fill in a missing manufacturer ('제조사') when the brand can be recognised
# from the model name ('모델명').
#
# Everything below is label-aligned boolean masking, so it no longer relies on
# car_df having a default RangeIndex (the previous loop looked rows up with
# .iloc while iterating over .index labels).
maker_missing = car_df['제조사'].isna()
na_index = car_df.index[maker_missing]  # labels of the rows that had no manufacturer

model_name = car_df['모델명']
# regex=False: plain, case-sensitive substring match, same as the `in` operator.
# na=False: a row without a model name simply does not match instead of raising.
is_peugeot = model_name.str.contains('Peugeot', regex=False, na=False)
is_citroen = model_name.str.contains('Citroen', regex=False, na=False)

car_df.loc[maker_missing & is_peugeot, '제조사'] = '푸조'
# Peugeot wins when both names appear, matching the original if/elif order.
# NOTE(review): confirm '시트로앵' matches the spelling already used in the
# '제조사' column for this brand — TODO verify against the data.
car_df.loc[maker_missing & ~is_peugeot & is_citroen, '제조사'] = '시트로앵'

# Discard every row that still has a missing value in any column, then keep
# only the first occurrence of each model name ('모델명').
car_df = (
    car_df
    .dropna(axis=0)
    .drop_duplicates(subset=['모델명'], keep='first')
)

def hasNumber(stringVal):
    """Return True when at least one character of ``stringVal`` is a digit.

    Each character is checked with ``str.isdigit``; an empty string yields
    False.
    """
    for character in stringVal:
        if character.isdigit():
            return True
    return False

# Keep only the rows whose CO2 value ('CO2배출량') contains at least one digit.
# A single boolean mask replaces the previous row-by-row drop, which rebuilt
# the whole frame once for every removed row.
has_co2_value = car_df['CO2배출량'].apply(hasNumber).astype(bool)
car_df = car_df[has_co2_value].reset_index(drop=True)

def cc(data):
    """Turn a Series of strings such as ``'1,499cc'`` into integers.

    Commas and the ``cc`` suffix are removed before the values are cast
    with ``astype(int)``.
    """
    return data.str.replace(',', '').str.replace('cc', '').astype(int)

def kg(data):
    """Turn a Series of strings such as ``'1,475kg'`` into integers.

    Commas and the ``kg`` suffix are removed before the values are cast
    with ``astype(int)``.
    """
    without_commas = data.str.replace(',', '')
    digits_only = without_commas.str.replace('kg', '')
    return digits_only.astype(int)

def co2(data):
    """Turn a Series of strings ending in ``g/km`` into floats.

    The ``g/km`` text is removed and the remainder is cast with
    ``astype(float)``.
    """
    return data.str.replace('g/km', '').astype(float)

def km_l(data):
    """Convert one fuel-economy cell such as ``'16.2㎞/ℓ'`` to a float.

    Parameters
    ----------
    data : object
        A single cell value. Strings containing ``'㎞/ℓ'`` or ``'km/kWh'``
        are converted; anything else is passed through.

    Returns
    -------
    float or object
        The numeric part as ``float`` when a known unit is present,
        otherwise ``data`` unchanged.

    Raises
    ------
    ValueError
        If a known unit is present but the remaining text is not a number.
    """
    # Non-string cells (NaN, None, already-numeric values) used to raise
    # TypeError on the `in` test; pass them through untouched instead.
    if not isinstance(data, str):
        return data

    # '㎞/ℓ' is checked first, as before.
    for unit in ('㎞/ℓ', 'km/kWh'):
        if unit in data:
            return float(data.replace(unit, ''))

    # No recognised unit: leave the value as it is.
    return data

# Replace the unit-suffixed text columns with numeric values.
# cc / kg / co2 each take the whole column; km_l works on one cell at a time.
for column_name, parse_column in (
    ('배기량', cc),
    ('공차중량', kg),
    ('CO2배출량', co2),
):
    car_df[column_name] = parse_column(car_df[column_name])

car_df['복합연비'] = car_df['복합연비'].apply(km_l)

In [5]:
# Columns removed from the frame before the feature matrix is built.
not_use_features = [
    '모델명',
    '변속형식',
    '도심주행연비',
    '고속도로주행연비',
    '연비신고구분',
    '승차정원',
    '엔진형식',
    '최고출력(ps/rpm)',
    '최고토크(kgm/rpm)',
    '연료공급방식',
    '변속형식.1',
    '굴림방식',
    '타이어사이즈(전)',
    '타이어사이즈(후)',
    '연료탱크용량(ℓ)',
    '공차중량',
]
car_df = car_df.drop(columns=not_use_features)

In [6]:
# The CO2 column is the regression target; every remaining column is a feature.
target = 'CO2배출량'
y = car_df[target]
X = car_df.drop(columns=[target])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Names of the columns whose dtype is object; these are passed to the encoder.
cat_features = [
    column_name
    for column_name in car_df.columns
    if car_df[column_name].dtype == 'object'
]

In [8]:
# Step 1: ordinal-encode the object-dtype columns.
ordinal_step = OrdinalEncoder(cols=cat_features)
# Step 2: random-forest regressor with fixed size, depth and seed.
forest_step = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    random_state=42,
)

model = Pipeline(steps=[
    ('ordinal', ordinal_step),
    ('rf', forest_step),
])

# Fit on the training split, then predict the held-out rows.
model.fit(X_train, y_train)
pred = model.predict(X_test)


In [9]:
# Score the held-out predictions: mean absolute error and R².
mae, r2 = (
    mean_absolute_error(y_test, pred),
    r2_score(y_test, pred),
)
print(mae, r2)

2.3908038456011353 0.9689255617006504


In [10]:
# Persist the fitted pipeline (encoder + forest) next to the notebook.
joblib.dump(value=model, filename='./model.pkl')

['model.pkl']