In [234]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import pandas_profiling
import seaborn as sns
import random as rn
import os
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from collections import Counter
from pycaret.regression import *

In [235]:
print("numpy version: {}". format(np.__version__))
print("pandas version: {}". format(pd.__version__))
print("matplotlib version: {}". format(matplotlib.__version__))
print("scikit-learn version: {}". format(sklearn.__version__))

numpy version: 1.19.5
pandas version: 1.1.5
matplotlib version: 3.3.4
scikit-learn version: 0.23.2


In [236]:
# reproducibility
seed_num = 42 
np.random.seed(seed_num)
rn.seed(seed_num)
os.environ['PYTHONHASHSEED']=str(seed_num)


In [237]:
train = pd.read_csv('../data/user_data/train.csv')
test = pd.read_csv('../data/user_data/test.csv')

print(train.shape)  # (1015, 11)
print(train.head())


(1015, 11)
   id                          title  odometer location    isimported  \
0   0                   Toyota RAV 4     18277   Lagos   Foreign Used   
1   1            Toyota Land Cruiser        10    Lagos          New    
2   2  Land Rover Range Rover Evoque     83091    Lagos  Foreign Used   
3   3                   Lexus ES 350     91524    Lagos  Foreign Used   
4   4                   Toyota Venza     94177    Lagos  Foreign Used   

           engine transmission    fuel  paint  year    target  
0  4-cylinder(I4)    automatic  petrol    Red  2016  13665000  
1  4-cylinder(I4)    automatic  petrol  Black  2019  33015000  
2  6-cylinder(V6)    automatic  petrol    Red  2012   9915000  
3  4-cylinder(I4)    automatic  petrol   Gray  2007   3815000  
4  6-cylinder(V6)    automatic  petrol    Red  2010   7385000  


In [238]:
pr = train.profile_report()
pr.to_file('../data/user_data/pr_report.html')
print(pr)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]




In [239]:
train['company'] = train['title'].apply(lambda x:x.split()[0])
test['company'] = test['title'].apply(lambda x:x.split()[0])

In [240]:
train = train.replace({
    'Mercedes-Benz/52' : 'Mercedes-Benz',
    })

In [241]:
# brand_list = train[['brand', 'target']].groupby(['brand'], as_index=False).mean().sort_values(by='target', ascending=True,ignore_index=True)
# brand_list

In [242]:
# for i,br in enumerate(brand_list.brand):
#     train = train.replace({
#     br : 10*i,
#     })
    
#     test = test.replace({
#     br : 10*i,
#     'Fiat' : 0,
#     })

In [243]:
# train[['brand', 'target']].groupby(['brand'], as_index=False).mean().sort_values(by='target', ascending=True,ignore_index=True)

In [244]:
import re 

def clean_text(texts): 
    corpus = [] 
    for i in range(0, len(texts)): 
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>\<]', '',texts[i]) #@%*=()/+ 와 같은 문장부호 제거
        review = re.sub(r'\d+','',review)#숫자 제거
        review = review.lower() #소문자 변환
        review = re.sub(r'\s+', ' ', review) #extra space 제거
        review = re.sub(r'<[^>]+>','',review) #Html tags 제거
        review = re.sub(r'\s+', ' ', review) #spaces 제거
        review = re.sub(r"^\s+", '', review) #space from start 제거
        review = re.sub(r'\s+$', '', review) #space from the end 제거
        review = re.sub(r'_', ' ', review) #space from the end 제거
        #review = re.sub(r'l', '', review)
        corpus.append(review) 
        
    return corpus


print('정제 전 paint의 unique 카테고리 개수 : ', len(train['paint'].unique()))
temp = clean_text(train['paint']) #메소드 적용
train['paint'] = temp
print('정제 후 paint의 unique 카테고리 개수 : ', len(train['paint'].unique()))

map_list = {i : i for i in np.unique(temp)}

tmp_map = {'off white l':'off white',
            'redl': 'red',
            'gray': 'grey',
            'gery': 'grey',
            'skye blue':'sky blue',
            'sliver':'silver',
            'whine':'white'}

for k in tmp_map.keys():
  v = tmp_map[k]
  map_list[k] = v

train['paint'] = train['paint'].map(map_list)
print(np.unique(train['paint']))


print('정제 전 paint의 unique 카테고리 개수 : ', len(test['paint'].unique()))

temp2 = clean_text(test['paint']) #메소드 적용
test['paint'] = temp2

print('정제 후 paint의 unique 카테고리 개수 : ', len(test['paint'].unique()))

정제 전 paint의 unique 카테고리 개수 :  76
정제 후 paint의 unique 카테고리 개수 :  51
['ash' 'ash and black' 'beige' 'black' 'black and silver'
 'black sand pearl' 'blackred' 'blue' 'brown' 'cream' 'dark ash'
 'dark blue' 'dark gray' 'dark green' 'dark grey' 'dark silver'
 'deep blue' 'gold' 'green' 'grey' 'ink blue' 'light blue' 'light gold'
 'light silver' 'magnetic gray' 'magnetic gray metallic' 'maroon'
 'midnight black metal' 'milk' 'navy blue' 'off white' 'orange'
 'pale brown' 'purple' 'red' 'silver' 'silvergrey' 'sky blue'
 'super white' 'white' 'white orchild pearl' 'whiteblue' 'wine' 'yellow']
정제 전 paint의 unique 카테고리 개수 :  46
정제 후 paint의 unique 카테고리 개수 :  30


In [245]:

def clean_text2(texts): 
    corpus = [] 
    for i in range(0, len(texts)): 
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>\<]', '',texts[i]) #@%*=()/+ 와 같은 문장부호 제거
        review = review.lower() #소문자 변환
        review = re.sub(r'\s+', ' ', review) #extra space 제거
        review = re.sub(r'<[^>]+>','',review) #Html tags 제거
        review = re.sub(r'\s+', ' ', review) #spaces 제거
        review = re.sub(r"^\s+", '', review) #space from start 제거
        review = re.sub(r'\s+$', '', review) #space from the end 제거
        review = re.sub(r'_', ' ', review) #space from the end 제거
        #review = re.sub(r'l', '', review)
        corpus.append(review) 
        
    return corpus

temp_t = clean_text2(train['title']) #메소드 적용
train['title'] = temp_t

temp_t2 = clean_text2(test['title']) #메소드 적용
test['title'] = temp_t2

In [246]:
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'blue' if x.find('blue') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'red' if x.find('red') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'green' if x.find('green') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'white' if x.find('white') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'grey' if x.find('grey') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'grey' if x.find('gery') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'grey' if x.find('gray') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'ash' if x.find('ash') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'brown' if x.find('brown') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'silver' if x.find('silver') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'silver' if x.find('sliver') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'black' if x.find('black') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'gold' if x.find('gold') >= 0 else x)
train['paint'] = train['paint'] = train['paint'].apply(lambda x : 'wine' if x.find('whine') >= 0 else x)

train['paint'].value_counts()
print('paint의 unique 카테고리 개수 : ', len(train['paint'].value_counts()))
print(train.paint.unique())


paint의 unique 카테고리 개수 :  18
['red' 'black' 'grey' 'white' 'blue' 'silver' 'brown' 'green' 'purple'
 'gold' 'milk' 'beige' 'ash' 'orange' 'cream' 'yellow' 'wine' 'maroon']


In [247]:
test_paint = clean_text(test['paint'])
test['paint'] = test_paint
print('test data에서 paint의 unique 카테고리 개수 : ', len(test['paint'].unique()))

test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'blue' if x.find('blue') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'red' if x.find('red') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'green' if x.find('green') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'white' if x.find('white') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'grey' if x.find('grey') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'grey' if x.find('gery') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'grey' if x.find('gray') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'ash' if x.find('ash') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'brown' if x.find('brown') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'silver' if x.find('silver') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'silver' if x.find('sliver') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'black' if x.find('black') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'black' if x.find('blac') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'gold' if x.find('gold') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'gold' if x.find('golf') >= 0 else x)
test['paint'] = test['paint'] = test['paint'].apply(lambda x : 'wine' if x.find('whine') >= 0 else x)

print(test.paint.unique())


test data에서 paint의 unique 카테고리 개수 :  30
['white' 'black' 'grey' 'red' 'silver' 'blue' 'gold' 'green' 'cream'
 'brown' 'yellow' 'maroon' 'wine' 'ash' 'indigo ink pearl' 'beige']


In [248]:
train = train.replace({
    'milk' : 'cream',
    'maroon' : 'red',
    'wine' : 'red',
})

test = test.replace({
    'indigo ink pearl' : 'blue',
    'golf' : 'green',
    'maroon' : 'red',
    'wine' : 'red',
})

In [249]:
train['location'] = train['location'].replace({
    'Abia State' : 'Abia',
    'Abuja ' : 'Abuja',
    'Lagos ' : 'Lagos',
    'Lagos State' : 'Lagos',
    'Ogun State' : 'Ogun'
    })

test['location'] = test['location'].replace({
    'Abuja ' : 'Abuja',
    'Lagos ' : 'Lagos',
    'Lagos State' : 'Lagos',
    'Ogun State' : 'Ogun',
    'Arepo ogun state ' : 'Ogun'
    # Arepo is a populated place located in Ogun State, Nigeria. 출처. 위키백과
})

train['location'] = train['location'].replace({
    'Accra' : 'other',
    'Adamawa ' : 'other',
    'FCT' : 'other',
    'Mushin' : 'other'
})

print(train.location.unique())

test['location'] = test['location'].replace({
    'Accra' : 'other',
    'Adamawa ' : 'other',
    'FCT' : 'other',
    'Mushin' : 'other'
})

print(test.location.unique())


['Lagos' 'Abuja' 'Ogun' 'other' 'Abia']
['Abuja' 'Lagos' 'Ogun' 'other' 'Abia']


In [250]:
test[(test.engine == '12-cylinder(V12)')]

Unnamed: 0,id,title,odometer,location,isimported,engine,transmission,fuel,paint,year,company
142,142,mercedesbenz maybach,6758,Lagos,New,12-cylinder(V12),automatic,petrol,black,2019,Mercedes-Benz


In [251]:
engine_re = train[['engine', 'target']].groupby(['engine'], as_index=False).mean().sort_values(by='target', ascending=False,ignore_index=True)
engine_re

Unnamed: 0,engine,target
0,8-cylinder(V8),22800100.0
1,4-cylinder(H4),11390000.0
2,6-cylinder(V6),7989856.0
3,6-cylinder(I6),7841907.0
4,4-cylinder(I4),7294036.0
5,5-cylinder(I5),3815000.0
6,3-cylinder(I3),3138333.0
7,2-cylinder(I2),3015000.0


In [252]:
train = train.replace({
    '2-cylinder(I2)' : 10,
    '3-cylinder(I3)' : 20,
    '5-cylinder(I5)' : 30,
    '4-cylinder(I4)' : 40,
    '6-cylinder(I6)' : 50,
    '6-cylinder(V6)' : 60,
    '4-cylinder(H4)' : 70,
    '8-cylinder(V8)' : 80,
    '12-cylinder(V12)' : 90,
})

test = test.replace({
    '2-cylinder(I2)' : 10,
    '3-cylinder(I3)' : 20,
    '5-cylinder(I5)' : 30,
    '4-cylinder(I4)' : 40,
    '6-cylinder(I6)' : 50,
    '6-cylinder(V6)' : 60,
    '4-cylinder(H4)' : 70,
    '8-cylinder(V8)' : 80,
    '12-cylinder(V12)' : 90,
})

In [253]:
df_train = train.copy()
df_test = test.copy()

In [254]:
fig, ax = plt.subplots(1, 2, figsize=(18,5))
g = sns.histplot(df_train['odometer'], color='b', label='Skewness : {:.2f}'.format(df_train['odometer'].skew()), ax=ax[0])
g.legend(loc='best', prop={'size': 16})
g.set_xlabel("Odometer", fontsize = 16)
g.set_ylabel("Count", fontsize = 16)

g = sns.histplot(df_train['year'], color='b', label='Skewness : {:.2f}'.format(df_train['year'].skew()), ax=ax[1])
g.legend(loc='best', prop={'size': 16})
g.set_xlabel("Year", fontsize = 16)
g.set_ylabel("Count", fontsize = 16)
plt.show()

In [255]:
numeric_fts = ['odometer', 'year']
outlier_ind = []
for i in numeric_fts:
  Q1 = np.percentile(df_train[i],25)
  Q3 = np.percentile(df_train[i],75)
  IQR = Q3-Q1
  outlier_list = df_train[(df_train[i] < Q1 - IQR * 1.5) | (df_train[i] > Q3 + IQR * 1.5)].index
  outlier_ind.extend(outlier_list)
  
# Drop outliers
train_df = df_train.drop(outlier_ind, axis = 0).reset_index(drop = True)
print(train_df)


       id                          title  odometer location    isimported  \
0       0                   toyota rav 4     18277    Lagos  Foreign Used   
1       1            toyota land cruiser        10    Lagos          New    
2       2  land rover range rover evoque     83091    Lagos  Foreign Used   
3       3                   lexus es 350     91524    Lagos  Foreign Used   
4       4                   toyota venza     94177    Lagos  Foreign Used   
..    ...                            ...       ...      ...           ...   
970  1010                 toyota corolla     46768    Lagos  Foreign Used   
971  1011                   toyota camry     31600    Abuja  Foreign Used   
972  1012                   toyota camry     96802    Abuja  Foreign Used   
973  1013                   lexus gx 460    146275    Lagos  Foreign Used   
974  1014                         daf cf         0    Lagos  Locally used   

     engine transmission    fuel   paint  year    target company  
0       

In [256]:
fig, ax = plt.subplots(1, 2, figsize=(18,5))
g = sns.histplot(train_df['odometer'], color='b', label='Skewness : {:.2f}'.format(train_df['odometer'].skew()), ax=ax[0])
g.legend(loc='best', prop={'size': 16})
g.set_xlabel("Odometer", fontsize = 16)
g.set_ylabel("Count", fontsize = 16)

g = sns.histplot(train_df['year'], color='b', label='Skewness : {:.2f}'.format(train_df['year'].skew()), ax=ax[1])
g.legend(loc='best', prop={'size': 16})
g.set_xlabel("Year", fontsize = 16)
g.set_ylabel("Count", fontsize = 16)
plt.show()

print("# outliers to drop :", len(outlier_ind))


# outliers to drop : 44


In [257]:
cat_fts = ['title', 'location', 'isimported', 'transmission', 'fuel', 'paint']

la_train = train_df.copy()

for i in range(len(cat_fts)):
  encoder = LabelEncoder()
  la_train[cat_fts[i]] = encoder.fit_transform(la_train[cat_fts[i]])

plt.figure(figsize = (10,8))
sns.heatmap(la_train[['odometer', 'year', 'paint', 'fuel', 'transmission', 'target']].corr(), annot=True)
plt.show()

print(train_df['title'].unique()[:20])


['toyota rav 4' 'toyota land cruiser' 'land rover range rover evoque'
 'lexus es 350' 'toyota venza' 'toyota corolla'
 'land rover range rover sport' 'pontiac vibe' 'toyota tacoma'
 'lexus rx 350' 'ford escape' 'honda civic' 'volvo xc90' 'bmw 750'
 'infiniti jx' 'honda accord' 'mercedesbenz ml 350' 'toyota camry'
 'hyundai azera' 'lexus gx 460']


In [258]:
train_df['company'] = train_df['title'].apply(lambda x : x.split(" ")[0])
df_test['company'] = df_test['title'].apply(lambda x : x.split(" ")[0])

print(train_df['company'].unique())
print("#fts :", len(train_df['company'].unique()), '\n')
print(df_test['company'].unique())
print("#fts :", len(df_test['company'].unique()), '\n')


['toyota' 'land' 'lexus' 'pontiac' 'ford' 'honda' 'volvo' 'bmw' 'infiniti'
 'mercedesbenz' 'hyundai' 'jaguar' 'mitsubishi' 'nissan' 'chevrolet'
 'mazda' 'lincoln' 'kia' 'acura' 'daf' 'man' 'isuzu' 'ivm' 'porsche'
 'mini' 'gmc' 'iveco' 'scania' 'volkswagen' 'gac' 'mack' 'peugeot'
 'rollsroyce' 'manvolkswagen' 'jeep' 'alpina' 'bentley' 'jmc']
#fts : 38 

['mercedesbenz' 'honda' 'toyota' 'iveco' 'lexus' 'nissan' 'volkswagen'
 'jeep' 'ford' 'bmw' 'mack' 'land' 'hyundai' 'peugeot' 'volvo' 'infiniti'
 'acura' 'man' 'fiat' 'mini' 'daf' 'mazda' 'porsche' 'mitsubishi'
 'chevrolet' 'kia' 'pontiac' 'rollsroyce']
#fts : 28 



In [259]:
cat_fts2 = ['title', 'location', 'isimported', 'transmission', 'fuel', 'paint', 'company']

for i in range(len(cat_fts2)):
    print(cat_fts2[i], ":")
    print(train_df[cat_fts2[i]].unique())
    print("#fts :", len(train_df[cat_fts2[i]].unique()), '\n')
    
for i in range(len(cat_fts2)):
    print(cat_fts2[i], ":")
    print(df_test[cat_fts2[i]].unique())
    print("#fts :", len(df_test[cat_fts2[i]].unique()), '\n')
    

title :
['toyota rav 4' 'toyota land cruiser' 'land rover range rover evoque'
 'lexus es 350' 'toyota venza' 'toyota corolla'
 'land rover range rover sport' 'pontiac vibe' 'toyota tacoma'
 'lexus rx 350' 'ford escape' 'honda civic' 'volvo xc90' 'bmw 750'
 'infiniti jx' 'honda accord' 'mercedesbenz ml 350' 'toyota camry'
 'hyundai azera' 'lexus gx 460' 'bmw 325' 'toyota sienna' 'honda fit'
 'honda crv' 'hyundai tucson' 'jaguar xj8' 'bmw x6' 'mercedesbenz c 300'
 'mitsubishi galant' 'mercedesbenz gl 450' 'lexus rx 300'
 'toyota highlander' 'mitsubishi canter pick up' 'nissan titan'
 'lexus is 250' 'mercedesbenz 200' 'toyota sequoia' 'ford explorer'
 'hyundai ix35' 'lexus ct 200h' 'lexus lx 570' 'toyota avensis'
 'toyota 4runner' 'mercedesbenz gle 350' 'mercedesbenz e 300'
 'toyota avalon' 'chevrolet camaro' 'land rover range rover' 'mazda cx9'
 'lexus rx 330' 'lincoln mark' 'kia optima' 'lexus gs 300' 'jaguar xtype'
 'nissan altima' 'acura mdx' 'daf 95xf tractor head' 'man tga 18360'
 '

In [260]:
train_data = train_df.copy()
test_data = df_test.copy()


In [261]:
for i in range(len(cat_fts2)):
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse = False)

    transformed = onehot_encoder.fit_transform(train_data[cat_fts2[i]].to_numpy().reshape(-1, 1))
    onehot_df = pd.DataFrame(transformed, columns=onehot_encoder.get_feature_names())
    train_data = pd.concat([train_data, onehot_df], axis=1).drop(cat_fts2[i], axis=1)

    test_transformed = onehot_encoder.transform(test_data[cat_fts2[i]].to_numpy().reshape(-1, 1))
    test_onehot_df = pd.DataFrame(test_transformed, columns=onehot_encoder.get_feature_names())
    test_data = pd.concat([test_data, test_onehot_df], axis=1).drop(cat_fts2[i], axis=1)
    
print(train_data.columns)
print(test_data.columns)

Index(['id', 'odometer', 'engine', 'year', 'target', 'x0_acura mdx',
       'x0_acura rdx', 'x0_acura tl', 'x0_acura zdx', 'x0_alpina b3',
       ...
       'x0_mitsubishi', 'x0_nissan', 'x0_peugeot', 'x0_pontiac', 'x0_porsche',
       'x0_rollsroyce', 'x0_scania', 'x0_toyota', 'x0_volkswagen', 'x0_volvo'],
      dtype='object', length=254)
Index(['id', 'odometer', 'engine', 'year', 'x0_acura mdx', 'x0_acura rdx',
       'x0_acura tl', 'x0_acura zdx', 'x0_alpina b3', 'x0_bentley arnage',
       ...
       'x0_mitsubishi', 'x0_nissan', 'x0_peugeot', 'x0_pontiac', 'x0_porsche',
       'x0_rollsroyce', 'x0_scania', 'x0_toyota', 'x0_volkswagen', 'x0_volvo'],
      dtype='object', length=253)


In [262]:
train_x = train_data.drop('id', axis = 1)
test_x = test_data.drop('id', axis = 1)

print(train_x.shape)
print(test_x.shape)

(975, 253)
(436, 252)


In [263]:
py_reg = setup(train_x, target = 'target', train_size= 0.8, normalize=True, fold=10, session_id = seed_num, silent = True)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Original Data,"(975, 253)"
3,Missing Values,False
4,Numeric Features,25
5,Categorical Features,227
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(780, 240)"


In [264]:
# reg_test_1 = setup(data=train_x,
#                    target='target',
#                    train_size= 0.85,
#                    #test_data=test,
#                    use_gpu =True,
#                    normalize=True,
#                    normalize_method ='minmax',
#                    remove_perfect_collinearity=False,
#                    numeric_features = ['engine','company'],
#                    fold=10,
#                    session_id=42
#                    )

In [265]:
best= compare_models(sort='mae',n_select=4)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,2065021.8698,27866044154600.81,4819178.3346,0.7714,0.4424,0.3008,4.102
xgboost,Extreme Gradient Boosting,2075287.1638,25589205625306.395,4578278.5338,0.7816,0.4331,0.3081,0.465
et,Extra Trees Regressor,2159531.5,32443907940899.105,5360210.1272,0.7148,0.396,0.2906,0.417
rf,Random Forest Regressor,2170781.7244,29569806042433.17,4992257.4094,0.7491,0.4003,0.2981,0.311
gbr,Gradient Boosting Regressor,2264247.7742,28733254765533.008,4904813.4699,0.7528,0.467,0.3818,0.161
knn,K Neighbors Regressor,2682300.2564,38883200685743.59,5616391.2336,0.7046,0.4563,0.395,0.04
dt,Decision Tree Regressor,2778325.641,47819206559615.375,6664276.6638,0.5191,0.5316,0.3763,0.031
lightgbm,Light Gradient Boosting Machine,3262469.5474,44127454521627.35,6363350.5957,0.582,0.6074,0.5194,0.221
omp,Orthogonal Matching Pursuit,3369356.5097,49352127669590.08,6589013.782,0.5777,0.7079,0.7196,0.021
ridge,Ridge Regression,3510284.8256,46262098448965.95,6363157.9979,0.6088,0.7912,0.8869,0.032


In [275]:
best

[<catboost.core.CatBoostRegressor at 0x23efa896b70>,
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=-1, num_parallel_tree=1,
              objective='reg:squarederror', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='auto',
              validate_parameters=1, verbosity=0),
 ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     max_samples=None, min_impurity_decrease=0.0,
                     min_impurity_split=None, min_samples_leaf=1,
                     min_samples_split=2,

In [267]:
top5 = [rank for rank in best]
top5_tune = [tune_model(i) for i in top5]

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2453338.3857,21817371552361.4,4670906.9304,0.83,0.5052,0.5208
1,4150759.393,117044542234803.84,10818712.596,0.5861,0.5882,0.5712
2,2130055.7552,14205953265007.982,3769078.5698,0.7177,0.4988,0.5
3,2125262.9463,11973261912086.283,3460240.1524,0.729,0.5223,0.4526
4,2412922.2616,24589030754043.414,4958732.777,0.5082,0.5381,0.4627
5,3157443.8979,33476134640203.71,5785856.431,0.4971,0.5281,0.5323
6,3508301.1746,73685302110120.0,8584014.3354,0.3493,0.5407,0.4503
7,3187728.711,35260724753345.14,5938074.162,0.821,0.5472,0.5569
8,2461450.6405,20267776648896.816,4501974.7499,0.8495,0.4688,0.467
9,2470304.9658,15056861758560.21,3880317.224,0.8513,0.5771,0.6277


In [276]:
blended = blend_models(estimator_list = best,
                    #    fold = 10,
                       #method = 'soft',
                       optimize='mae',
                       )

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1997608.0504,16943420459683.623,4116238.6301,0.8679,0.3776,0.2892
1,3179158.8,95946084724359.97,9795207.2323,0.6607,0.4755,0.3757
2,1467349.735,8609112015364.326,2934128.8341,0.8289,0.3273,0.2659
3,1602662.3911,8471799469998.691,2910635.5784,0.8083,0.3718,0.2425
4,1613716.5396,15100785386939.68,3885972.901,0.698,0.3883,0.2503
5,2009678.582,20455804334522.285,4522809.3409,0.6927,0.3663,0.289
6,2594601.4785,51721588990092.305,7191772.3122,0.5433,0.3912,0.2598
7,2346899.6523,26235484639893.27,5122058.633,0.8668,0.353,0.2946
8,1750532.1292,12474582920240.414,3531937.5589,0.9074,0.3243,0.2321
9,1443337.4471,8563386406715.963,2926326.4354,0.9154,0.3617,0.285


In [277]:
final_model = finalize_model(blended)

In [270]:
# y_test = predict_model(final_model, data=test_x)
# y_test = np.exp(y_test['Label'])

In [272]:
# submission = pd.read_csv('../data/user_data/sample_submission.csv')
# submission['target'] = y_test
# submission.to_csv('../data/user_data/sub9.csv', index=False)

In [None]:
# compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,2167384.2352,32577863855704.1,5081371.3741,0.7422,0.4002,0.3132,3.933
rf,Random Forest Regressor,2224798.7929,31618152450649.53,5087524.5041,0.7281,0.389,0.3067,0.309
gbr,Gradient Boosting Regressor,2332614.6473,32772895531610.043,5140230.2258,0.7255,0.4297,0.3758,0.122
xgboost,Extreme Gradient Boosting,2224390.9716,33762363681125.44,5358558.7114,0.7067,0.4219,0.3068,0.427
et,Extra Trees Regressor,2217790.9859,35529568210110.016,5491148.9518,0.6999,0.3849,0.297,0.364
dt,Decision Tree Regressor,2617365.9527,34293081871531.332,5571575.6984,0.6631,0.4626,0.3648,0.027
ridge,Ridge Regression,3476651.7406,45274931951304.82,6213684.3402,0.6085,0.7348,0.8923,0.036
omp,Orthogonal Matching Pursuit,3370353.5537,46939994435857.93,6401461.1078,0.5687,0.7491,0.762,0.03
lasso,Lasso Regression,3560571.2962,52003048488354.38,6666893.4122,0.5355,0.8436,0.8952,0.033
lr,Linear Regression,3822091.1177,51663679033205.24,6727104.3767,0.5298,0.8887,0.9953,0.916


<catboost.core.CatBoostRegressor at 0x23eff1c67b8>

In [None]:
# catboost = create_model('catboost', verbose = False)
# rf = create_model('rf', verbose = False)
# gbr = create_model('gbr', verbose = False)


In [None]:
# blended_model = blend_models(estimator_list = [catboost, rf, gbr])

In [None]:
# final_model = finalize_model(blended_model)

In [278]:
prediction = predict_model(final_model, data = test_x)

In [279]:
pred = prediction['Label']

In [281]:
# 제출용 sample 파일을 불러옵니다.
submission = pd.read_csv('../data/user_data/sample_submission.csv')
submission.head()

# 위에서 구한 예측값을 그대로 넣어줍니다.
submission['target'] = pred

# 데이터가 잘 들어갔는지 확인합니다.
submission.head()

Unnamed: 0,id,target
0,0,16814200.0
1,1,5261335.0
2,2,7337631.0
3,3,1306946.0
4,4,2879170.0


In [283]:

submission.to_csv('../data/user_data/submit13.csv', index=False)