In [83]:
import pandas as pd
import numpy as np
import regex as re
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostRegressor
from sklearn import metrics
from hyperopt import fmin,tpe,hp,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
pd.set_option('display.max_columns', None)

In [84]:
df = pd.read_csv('data/train.csv')

In [85]:
df.sample(10)

Unnamed: 0,id,laptop_ID,Company,Product,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price_euros
301,979,993,Asus,Rog GL753VE-DS74,Gaming,17.3,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,16GB,256GB SSD + 1TB HDD,Nvidia GeForce GTX 1050 Ti,Windows 10,2.99kg,1749.0
367,355,361,Dell,Inspiron 5570,Notebook,15.6,Full HD 1920x1080,Intel Core i7 8550U 1.8GHz,8GB,128GB SSD + 2TB HDD,AMD Radeon 530,Windows 10,2.02kg,970.9
39,372,378,Asus,Rog Strix,Gaming,17.3,IPS Panel Full HD 1920x1080,AMD Ryzen 1700 3GHz,16GB,256GB SSD + 1TB HDD,AMD Radeon RX 580,Windows 10,3.25kg,2199.0
41,151,154,Dell,Inspiron 7567,Gaming,15.6,Full HD 1920x1080,Intel Core i7 7700HQ 2.8GHz,8GB,1.0TB Hybrid,Nvidia GeForce GTX 1050,Windows 10,2.62kg,899.0
62,984,998,Toshiba,Tecra A40-C-1KF,Notebook,14.0,1366x768,Intel Core i5 6200U 2.3GHz,4GB,500GB HDD,Intel HD Graphics 520,Windows 10,1.75kg,915.0
410,1180,1198,Lenovo,ThinkPad X1,2 in 1 Convertible,14.0,IPS Panel Touchscreen 2560x1440,Intel Core i5 6200U 2.3GHz,8GB,256GB SSD,Intel HD Graphics 520,Windows 10,1.36kg,1637.0
85,218,223,Dell,Latitude 5490,Ultrabook,14.0,Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.6kg,1149.0
451,554,561,HP,ProBook 470,Notebook,17.3,Full HD 1920x1080,Intel Core i7 7500U 2.7GHz,8GB,1TB HDD,Nvidia GeForce 930MX,Windows 10,2.63kg,1280.0
296,1115,1130,Dell,XPS 13,Ultrabook,13.3,Full HD 1920x1080,Intel Core i7 7560U 2.4GHz,8GB,256GB SSD,Intel Iris Plus Graphics 640,Windows 10,1.23kg,1379.0
746,221,226,Lenovo,Yoga 520-14IKB,2 in 1 Convertible,14.0,IPS Panel Full HD 1920x1080,Intel Core i5 8250U 1.6GHz,8GB,256GB SSD,Intel UHD Graphics 620,Windows 10,1.74kg,899.0


# Data cleaning

In [86]:
def transfo(df=df):    
    # Screen 
    screen_type = []
    for res in df.ScreenResolution:
        if (re.search(r'Ultra HD', res)) or (re.search(r'2560x1440', res)) or (re.search(r'Quad HD+', res)) != None:
            screen_type.append('uhd')
        elif (re.search(r'Touchscreen', res)) != None:
            screen_type.append('touchscreen')
        elif (re.search(r'Retina', res)) != None:
            screen_type.append('ips_retina')
        elif (re.search(r'IPS', res)) != None:
            screen_type.append('ips')
        elif (re.search(r'Full HD', res)) or (re.search(r'1920x1080', res)) != None:
            screen_type.append('full_hd')
        elif (re.search(r'1366x768', res)) or (re.search(r'1600x900', res)) or (re.search(r'1440x900', res)) != None:
            screen_type.append('hd')
    
    df['screen_type'] = screen_type
    
    Resolution = []
    for res in df.ScreenResolution:
        Resolution.append(re.search(r'([0-9]{4})x([0-9]{3,4})',res).group())      # Separo resolución 
    df['Resolution'] = Resolution
    
    df['Inches'] = df.Inches.apply(lambda x: str(x))                              #Funciona mejor como var categórica                                                 #Columnas binarias con tipo pantalla
    
    # Separating first word of the product
    df['product_name'] = df.Product.str.split(' ').str[0]
    df['product_name'] = df.product_name.str.split('-').str[0]
    
    # CPU
    df['cpu_brand'] = df.Cpu.str.split(' ').str[0]
    df['cpu_ghz'] = df.Cpu.str.split(' ').str[-1]
    
    intel_gen = []
    for cpu in df.Cpu: 
        intel_gen.append(re.search(r"i[0-9]", cpu)[0]) if (re.search(r"i[0-9]", cpu))!= None else intel_gen.append(0)
        
    df['intel_gen'] = intel_gen
    df['Cpu'] = df.Cpu.str.rsplit(' ',1).str[0]       #Quito los Ghz
    df['cpu_model'] = df.Cpu.str.split(' ').str[-1]   #Modelo de la CPU
    df['cpu_2'] = df.Cpu.str.rsplit(' ').str[1]       #
    
    # MEMORY
    
    df['memory_1'] = df.Memory.apply(lambda string: string.split('+')[0])
            # Separo la segunda memoria si la string es larga y reemplazo si no tiene.
    df['memory_2'] = df.Memory.apply(lambda string: string.split('+')[-1] if len(string) > 19 else 0)
            # Separo tamaño disco y tipo de memoria
    df['memory_1_type'] = df.memory_1.apply(lambda string: string.strip().split(' ')[-1])
    df['memory_2_type'] = df.memory_2.str.strip().str.split(' ').str[-1].fillna(0)
    df['memory_1'] = df.memory_1.apply(lambda string: string.split(' ')[0]).str.replace('1.0TB','1TB').str.replace('TB','000').str.replace('GB','')
    df['memory_2'] = df.memory_2.str.lstrip(' ').apply(lambda x: x.split(' ')[0] if (type(x) == str) else 0).apply(lambda x: x.replace('1.0TB','1TB')if (type(x) == str) else 0).apply(lambda x: x.replace('TB','000') if (type(x) == str) else 0).apply(lambda x:x.replace('GB','') if (type(x) == str) else 0)
    
    # GPU Cleaning
    df.loc[(df.Gpu == 'Nvidia GeForce GTX 960<U+039C>'),'Gpu'] = 'Nvidia GeForce GTX 960'
    
    df.Gpu.replace(re.compile("(GTX)(\d+)").pattern, r"\1 \2", regex=True,inplace=True)
    df.Gpu = df.Gpu.str.replace('Nvidia GeForce GT 940MX', 'Nvidia GeForce 940MX').str.replace(
                                 'Nvidia GeForce GTX1050 Ti','Nvidia GeForce GTX 1050 Ti').apply(
                                lambda x: x.replace(x,'Nvidia Geforce GTX 980') if '980' in x else x).str.strip()
    
    # Cleaning values to digits
    df.Ram = df.Ram.str.replace('GB','')
    df.Weight = df.Weight.str.replace('kg','')
    df.cpu_ghz = df.cpu_ghz.str.replace('GHz','')
    # Values to numeric
    df['Weight'] = df.Weight.apply(lambda x: float(x))
    df['cpu_ghz'] = df.cpu_ghz.apply(lambda x: float(x))
    
    # Dropping unnecessary cols
    df.drop(columns=['laptop_ID','id','ScreenResolution', 'Product','ScreenResolution','Cpu','Memory'],inplace=True)

transfo()

  df['memory_1'] = df.memory_1.apply(lambda string: string.split(' ')[0]).str.replace('1.0TB','1TB').str.replace('TB','000').str.replace('GB','')


### X-y

In [87]:
y = df.Price_euros
X = df.copy()
X.drop(columns='Price_euros',inplace=True)
X.select_dtypes(include=['float64']).columns
X.dtypes
# Unimportant columns.
X.drop(columns=['cpu_brand','memory_2'],inplace=True)

In [88]:
X.columns

Index(['Company', 'TypeName', 'Inches', 'Ram', 'Gpu', 'OpSys', 'Weight',
       'screen_type', 'Resolution', 'product_name', 'cpu_ghz', 'intel_gen',
       'cpu_model', 'cpu_2', 'memory_1', 'memory_1_type', 'memory_2_type'],
      dtype='object')

### Scaling numeric variables + cat conversion

In [89]:
#Convierto variables categóricas a categoría

def categ_func(X = X):
    global cat_features
    cat_features = ['Ram','product_name','Inches','Company', 'TypeName','OpSys','Resolution','screen_type','intel_gen', 'cpu_model','cpu_2','memory_1_type', 'memory_2_type','memory_1','Gpu']
    X[cat_features] = X[cat_features].astype('category')

categ_func() 

#Escalo variables numéricas

numeric_features = X.select_dtypes(include=['float64']).columns
sc = StandardScaler()
X[numeric_features] = sc.fit_transform(X[numeric_features])  

## Train/test - Feature Selection 

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)

In [91]:
from catboost import Pool, cv

#Feature Selection with Catboost

cat = CatBoostRegressor(random_seed=10)
cat.fit(X_train,y_train,cat_features=cat_features)

cat.get_feature_importance(
Pool(X_train, y_train, cat_features= cat_features))

pd.DataFrame({'feature_importance': cat.get_feature_importance(Pool(X_train, y_train, cat_features=cat_features)), 
              'feature_names': X.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

Learning rate set to 0.038948
0:	learn: 699.2975119	total: 12.5ms	remaining: 12.5s
1:	learn: 683.8937961	total: 26.4ms	remaining: 13.2s
2:	learn: 668.7305864	total: 40.6ms	remaining: 13.5s
3:	learn: 655.0507608	total: 53.6ms	remaining: 13.3s
4:	learn: 641.2074145	total: 66.1ms	remaining: 13.2s
5:	learn: 628.2277771	total: 78.6ms	remaining: 13s
6:	learn: 616.5460569	total: 91.8ms	remaining: 13s
7:	learn: 604.9672172	total: 104ms	remaining: 13s
8:	learn: 594.7860766	total: 117ms	remaining: 12.9s
9:	learn: 585.3025532	total: 130ms	remaining: 12.9s
10:	learn: 574.8684924	total: 143ms	remaining: 12.8s
11:	learn: 565.7211562	total: 155ms	remaining: 12.8s
12:	learn: 556.1738803	total: 168ms	remaining: 12.7s
13:	learn: 547.3907654	total: 181ms	remaining: 12.7s
14:	learn: 538.6917262	total: 195ms	remaining: 12.8s
15:	learn: 530.1027940	total: 211ms	remaining: 13s
16:	learn: 522.2527065	total: 224ms	remaining: 12.9s
17:	learn: 515.5152351	total: 237ms	remaining: 12.9s
18:	learn: 508.1954261	tota

158:	learn: 240.5914245	total: 2.12s	remaining: 11.2s
159:	learn: 240.4381073	total: 2.13s	remaining: 11.2s
160:	learn: 239.3461613	total: 2.14s	remaining: 11.2s
161:	learn: 238.5010792	total: 2.16s	remaining: 11.2s
162:	learn: 237.9769408	total: 2.17s	remaining: 11.1s
163:	learn: 237.8503452	total: 2.18s	remaining: 11.1s
164:	learn: 237.6449223	total: 2.2s	remaining: 11.1s
165:	learn: 237.2386252	total: 2.21s	remaining: 11.1s
166:	learn: 236.6103191	total: 2.23s	remaining: 11.1s
167:	learn: 236.5703184	total: 2.23s	remaining: 11.1s
168:	learn: 235.9087490	total: 2.25s	remaining: 11.1s
169:	learn: 235.5248657	total: 2.26s	remaining: 11s
170:	learn: 235.1701307	total: 2.27s	remaining: 11s
171:	learn: 234.5155729	total: 2.29s	remaining: 11s
172:	learn: 234.0744594	total: 2.3s	remaining: 11s
173:	learn: 233.1371395	total: 2.31s	remaining: 11s
174:	learn: 232.7957721	total: 2.33s	remaining: 11s
175:	learn: 232.4401530	total: 2.34s	remaining: 11s
176:	learn: 231.9874545	total: 2.35s	remaini

318:	learn: 182.8768082	total: 4.25s	remaining: 9.06s
319:	learn: 182.7921332	total: 4.26s	remaining: 9.05s
320:	learn: 182.5256877	total: 4.27s	remaining: 9.04s
321:	learn: 182.1862758	total: 4.29s	remaining: 9.03s
322:	learn: 182.1276069	total: 4.3s	remaining: 9.01s
323:	learn: 181.6661286	total: 4.31s	remaining: 9s
324:	learn: 181.6326489	total: 4.33s	remaining: 8.98s
325:	learn: 181.5210069	total: 4.34s	remaining: 8.97s
326:	learn: 181.1125045	total: 4.35s	remaining: 8.96s
327:	learn: 180.8603509	total: 4.36s	remaining: 8.94s
328:	learn: 180.7803424	total: 4.38s	remaining: 8.93s
329:	learn: 180.7437408	total: 4.39s	remaining: 8.91s
330:	learn: 180.7048116	total: 4.4s	remaining: 8.9s
331:	learn: 180.3438565	total: 4.42s	remaining: 8.89s
332:	learn: 180.2083425	total: 4.43s	remaining: 8.88s
333:	learn: 180.0934715	total: 4.44s	remaining: 8.86s
334:	learn: 179.9527635	total: 4.45s	remaining: 8.84s
335:	learn: 179.9219978	total: 4.47s	remaining: 8.83s
336:	learn: 179.5506182	total: 4.4

485:	learn: 148.1358219	total: 6.61s	remaining: 6.99s
486:	learn: 147.7606032	total: 6.63s	remaining: 6.98s
487:	learn: 147.4905075	total: 6.64s	remaining: 6.97s
488:	learn: 147.4052630	total: 6.66s	remaining: 6.96s
489:	learn: 147.2890509	total: 6.67s	remaining: 6.94s
490:	learn: 147.1568859	total: 6.68s	remaining: 6.93s
491:	learn: 146.7746258	total: 6.69s	remaining: 6.91s
492:	learn: 146.4564380	total: 6.71s	remaining: 6.9s
493:	learn: 146.4302897	total: 6.72s	remaining: 6.88s
494:	learn: 146.2176009	total: 6.73s	remaining: 6.87s
495:	learn: 145.8759896	total: 6.75s	remaining: 6.85s
496:	learn: 145.7369549	total: 6.76s	remaining: 6.84s
497:	learn: 145.4579909	total: 6.77s	remaining: 6.83s
498:	learn: 145.3727282	total: 6.79s	remaining: 6.81s
499:	learn: 145.1486498	total: 6.8s	remaining: 6.8s
500:	learn: 145.0564530	total: 6.81s	remaining: 6.79s
501:	learn: 145.0308854	total: 6.83s	remaining: 6.77s
502:	learn: 144.7139087	total: 6.84s	remaining: 6.76s
503:	learn: 144.4635160	total: 

640:	learn: 120.7613827	total: 8.74s	remaining: 4.89s
641:	learn: 120.4490048	total: 8.76s	remaining: 4.88s
642:	learn: 120.1195869	total: 8.77s	remaining: 4.87s
643:	learn: 119.8371517	total: 8.78s	remaining: 4.85s
644:	learn: 119.7710373	total: 8.79s	remaining: 4.84s
645:	learn: 119.6979690	total: 8.81s	remaining: 4.83s
646:	learn: 119.5576872	total: 8.82s	remaining: 4.81s
647:	learn: 119.2976230	total: 8.83s	remaining: 4.8s
648:	learn: 119.2219464	total: 8.85s	remaining: 4.78s
649:	learn: 118.9994880	total: 8.86s	remaining: 4.77s
650:	learn: 118.7398740	total: 8.87s	remaining: 4.76s
651:	learn: 118.3753474	total: 8.89s	remaining: 4.74s
652:	learn: 118.1210672	total: 8.9s	remaining: 4.73s
653:	learn: 117.9165416	total: 8.91s	remaining: 4.71s
654:	learn: 117.7139922	total: 8.93s	remaining: 4.7s
655:	learn: 117.6067786	total: 8.94s	remaining: 4.69s
656:	learn: 117.3658674	total: 8.95s	remaining: 4.67s
657:	learn: 117.2957587	total: 8.97s	remaining: 4.66s
658:	learn: 117.0953746	total: 

795:	learn: 98.8238581	total: 10.9s	remaining: 2.79s
796:	learn: 98.6535730	total: 10.9s	remaining: 2.78s
797:	learn: 98.5699795	total: 10.9s	remaining: 2.76s
798:	learn: 98.4967885	total: 10.9s	remaining: 2.75s
799:	learn: 98.4323097	total: 10.9s	remaining: 2.73s
800:	learn: 98.3401214	total: 11s	remaining: 2.72s
801:	learn: 98.2992789	total: 11s	remaining: 2.71s
802:	learn: 98.2068130	total: 11s	remaining: 2.69s
803:	learn: 98.0861876	total: 11s	remaining: 2.68s
804:	learn: 98.0005461	total: 11s	remaining: 2.67s
805:	learn: 97.9061477	total: 11s	remaining: 2.65s
806:	learn: 97.8405763	total: 11s	remaining: 2.64s
807:	learn: 97.7595282	total: 11s	remaining: 2.63s
808:	learn: 97.6086784	total: 11.1s	remaining: 2.61s
809:	learn: 97.5979006	total: 11.1s	remaining: 2.6s
810:	learn: 97.4302294	total: 11.1s	remaining: 2.58s
811:	learn: 97.4073797	total: 11.1s	remaining: 2.57s
812:	learn: 97.2773500	total: 11.1s	remaining: 2.56s
813:	learn: 97.2322085	total: 11.1s	remaining: 2.54s
814:	learn

952:	learn: 85.7130447	total: 13s	remaining: 643ms
953:	learn: 85.6635898	total: 13.1s	remaining: 629ms
954:	learn: 85.6463268	total: 13.1s	remaining: 616ms
955:	learn: 85.6132171	total: 13.1s	remaining: 602ms
956:	learn: 85.5600736	total: 13.1s	remaining: 588ms
957:	learn: 85.5280168	total: 13.1s	remaining: 575ms
958:	learn: 85.4348655	total: 13.1s	remaining: 561ms
959:	learn: 85.4232577	total: 13.1s	remaining: 547ms
960:	learn: 85.3551701	total: 13.1s	remaining: 534ms
961:	learn: 85.3363899	total: 13.2s	remaining: 520ms
962:	learn: 85.3068946	total: 13.2s	remaining: 506ms
963:	learn: 85.2223083	total: 13.2s	remaining: 493ms
964:	learn: 85.1828263	total: 13.2s	remaining: 479ms
965:	learn: 85.0945518	total: 13.2s	remaining: 465ms
966:	learn: 85.0186242	total: 13.2s	remaining: 451ms
967:	learn: 84.9364510	total: 13.2s	remaining: 438ms
968:	learn: 84.8369799	total: 13.3s	remaining: 425ms
969:	learn: 84.8143215	total: 13.3s	remaining: 411ms
970:	learn: 84.7778390	total: 13.3s	remaining: 3

Unnamed: 0,feature_importance,feature_names
3,16.680163,Ram
1,14.042693,TypeName
9,10.442026,product_name
6,8.821116,Weight
11,7.272895,intel_gen
2,6.429681,Inches
15,5.505794,memory_1_type
14,4.21242,memory_1
10,4.181806,cpu_ghz
4,4.142986,Gpu


## Model without optimization

In [92]:
cat = CatBoostRegressor(random_seed=10,cat_features=cat_features)
cat.fit(X_train,y_train,cat_features=cat_features)


Learning rate set to 0.038948
0:	learn: 699.2975119	total: 13.2ms	remaining: 13.2s
1:	learn: 683.8937961	total: 36.5ms	remaining: 18.2s
2:	learn: 668.7305864	total: 52.5ms	remaining: 17.4s
3:	learn: 655.0507608	total: 65.3ms	remaining: 16.2s
4:	learn: 641.2074145	total: 78.2ms	remaining: 15.6s
5:	learn: 628.2277771	total: 91.4ms	remaining: 15.1s
6:	learn: 616.5460569	total: 105ms	remaining: 14.9s
7:	learn: 604.9672172	total: 119ms	remaining: 14.8s
8:	learn: 594.7860766	total: 133ms	remaining: 14.7s
9:	learn: 585.3025532	total: 147ms	remaining: 14.5s
10:	learn: 574.8684924	total: 161ms	remaining: 14.5s
11:	learn: 565.7211562	total: 174ms	remaining: 14.4s
12:	learn: 556.1738803	total: 189ms	remaining: 14.3s
13:	learn: 547.3907654	total: 204ms	remaining: 14.3s
14:	learn: 538.6917262	total: 219ms	remaining: 14.4s
15:	learn: 530.1027940	total: 233ms	remaining: 14.4s
16:	learn: 522.2527065	total: 247ms	remaining: 14.3s
17:	learn: 515.5152351	total: 261ms	remaining: 14.2s
18:	learn: 508.19542

163:	learn: 237.8503452	total: 2.33s	remaining: 11.9s
164:	learn: 237.6449223	total: 2.34s	remaining: 11.9s
165:	learn: 237.2386252	total: 2.36s	remaining: 11.8s
166:	learn: 236.6103191	total: 2.37s	remaining: 11.8s
167:	learn: 236.5703184	total: 2.38s	remaining: 11.8s
168:	learn: 235.9087490	total: 2.4s	remaining: 11.8s
169:	learn: 235.5248657	total: 2.42s	remaining: 11.8s
170:	learn: 235.1701307	total: 2.44s	remaining: 11.8s
171:	learn: 234.5155729	total: 2.45s	remaining: 11.8s
172:	learn: 234.0744594	total: 2.47s	remaining: 11.8s
173:	learn: 233.1371395	total: 2.48s	remaining: 11.8s
174:	learn: 232.7957721	total: 2.5s	remaining: 11.8s
175:	learn: 232.4401530	total: 2.51s	remaining: 11.8s
176:	learn: 231.9874545	total: 2.53s	remaining: 11.7s
177:	learn: 231.3826408	total: 2.54s	remaining: 11.7s
178:	learn: 230.7035121	total: 2.56s	remaining: 11.7s
179:	learn: 230.6379280	total: 2.57s	remaining: 11.7s
180:	learn: 230.1502506	total: 2.59s	remaining: 11.7s
181:	learn: 229.4663387	total:

317:	learn: 182.9616830	total: 4.47s	remaining: 9.58s
318:	learn: 182.8768082	total: 4.48s	remaining: 9.57s
319:	learn: 182.7921332	total: 4.5s	remaining: 9.55s
320:	learn: 182.5256877	total: 4.51s	remaining: 9.54s
321:	learn: 182.1862758	total: 4.52s	remaining: 9.52s
322:	learn: 182.1276069	total: 4.54s	remaining: 9.51s
323:	learn: 181.6661286	total: 4.55s	remaining: 9.49s
324:	learn: 181.6326489	total: 4.56s	remaining: 9.48s
325:	learn: 181.5210069	total: 4.58s	remaining: 9.46s
326:	learn: 181.1125045	total: 4.59s	remaining: 9.45s
327:	learn: 180.8603509	total: 4.6s	remaining: 9.43s
328:	learn: 180.7803424	total: 4.62s	remaining: 9.41s
329:	learn: 180.7437408	total: 4.63s	remaining: 9.4s
330:	learn: 180.7048116	total: 4.64s	remaining: 9.38s
331:	learn: 180.3438565	total: 4.65s	remaining: 9.37s
332:	learn: 180.2083425	total: 4.67s	remaining: 9.35s
333:	learn: 180.0934715	total: 4.68s	remaining: 9.33s
334:	learn: 179.9527635	total: 4.69s	remaining: 9.31s
335:	learn: 179.9219978	total: 

477:	learn: 149.7100822	total: 6.61s	remaining: 7.22s
478:	learn: 149.4104981	total: 6.63s	remaining: 7.21s
479:	learn: 149.0223839	total: 6.64s	remaining: 7.19s
480:	learn: 148.8643376	total: 6.65s	remaining: 7.18s
481:	learn: 148.7230630	total: 6.67s	remaining: 7.16s
482:	learn: 148.6974937	total: 6.68s	remaining: 7.15s
483:	learn: 148.4393663	total: 6.69s	remaining: 7.13s
484:	learn: 148.3474636	total: 6.71s	remaining: 7.12s
485:	learn: 148.1358219	total: 6.72s	remaining: 7.11s
486:	learn: 147.7606032	total: 6.73s	remaining: 7.09s
487:	learn: 147.4905075	total: 6.74s	remaining: 7.08s
488:	learn: 147.4052630	total: 6.76s	remaining: 7.06s
489:	learn: 147.2890509	total: 6.77s	remaining: 7.05s
490:	learn: 147.1568859	total: 6.78s	remaining: 7.03s
491:	learn: 146.7746258	total: 6.79s	remaining: 7.02s
492:	learn: 146.4564380	total: 6.81s	remaining: 7s
493:	learn: 146.4302897	total: 6.82s	remaining: 6.99s
494:	learn: 146.2176009	total: 6.84s	remaining: 6.97s
495:	learn: 145.8759896	total: 

634:	learn: 121.7411370	total: 8.77s	remaining: 5.04s
635:	learn: 121.6918444	total: 8.79s	remaining: 5.03s
636:	learn: 121.5648431	total: 8.81s	remaining: 5.02s
637:	learn: 121.4846353	total: 8.82s	remaining: 5s
638:	learn: 121.2734704	total: 8.83s	remaining: 4.99s
639:	learn: 121.0610097	total: 8.85s	remaining: 4.98s
640:	learn: 120.7613827	total: 8.86s	remaining: 4.96s
641:	learn: 120.4490048	total: 8.87s	remaining: 4.95s
642:	learn: 120.1195869	total: 8.89s	remaining: 4.93s
643:	learn: 119.8371517	total: 8.9s	remaining: 4.92s
644:	learn: 119.7710373	total: 8.91s	remaining: 4.91s
645:	learn: 119.6979690	total: 8.93s	remaining: 4.89s
646:	learn: 119.5576872	total: 8.94s	remaining: 4.88s
647:	learn: 119.2976230	total: 8.95s	remaining: 4.86s
648:	learn: 119.2219464	total: 8.97s	remaining: 4.85s
649:	learn: 118.9994880	total: 8.98s	remaining: 4.83s
650:	learn: 118.7398740	total: 8.99s	remaining: 4.82s
651:	learn: 118.3753474	total: 9.01s	remaining: 4.81s
652:	learn: 118.1210672	total: 9

789:	learn: 99.3217881	total: 10.9s	remaining: 2.9s
790:	learn: 99.1563689	total: 10.9s	remaining: 2.89s
791:	learn: 99.0761249	total: 11s	remaining: 2.88s
792:	learn: 99.0318519	total: 11s	remaining: 2.86s
793:	learn: 98.9531237	total: 11s	remaining: 2.85s
794:	learn: 98.8360920	total: 11s	remaining: 2.83s
795:	learn: 98.8238581	total: 11s	remaining: 2.82s
796:	learn: 98.6535730	total: 11s	remaining: 2.81s
797:	learn: 98.5699795	total: 11s	remaining: 2.79s
798:	learn: 98.4967885	total: 11s	remaining: 2.78s
799:	learn: 98.4323097	total: 11.1s	remaining: 2.76s
800:	learn: 98.3401214	total: 11.1s	remaining: 2.75s
801:	learn: 98.2992789	total: 11.1s	remaining: 2.74s
802:	learn: 98.2068130	total: 11.1s	remaining: 2.72s
803:	learn: 98.0861876	total: 11.1s	remaining: 2.71s
804:	learn: 98.0005461	total: 11.1s	remaining: 2.69s
805:	learn: 97.9061477	total: 11.1s	remaining: 2.68s
806:	learn: 97.8405763	total: 11.2s	remaining: 2.67s
807:	learn: 97.7595282	total: 11.2s	remaining: 2.65s
808:	learn

947:	learn: 85.9365330	total: 13.3s	remaining: 729ms
948:	learn: 85.8497472	total: 13.3s	remaining: 715ms
949:	learn: 85.8174319	total: 13.3s	remaining: 701ms
950:	learn: 85.7473390	total: 13.3s	remaining: 687ms
951:	learn: 85.7238174	total: 13.4s	remaining: 673ms
952:	learn: 85.7130447	total: 13.4s	remaining: 659ms
953:	learn: 85.6635898	total: 13.4s	remaining: 646ms
954:	learn: 85.6463268	total: 13.4s	remaining: 632ms
955:	learn: 85.6132171	total: 13.4s	remaining: 618ms
956:	learn: 85.5600736	total: 13.4s	remaining: 604ms
957:	learn: 85.5280168	total: 13.5s	remaining: 590ms
958:	learn: 85.4348655	total: 13.5s	remaining: 576ms
959:	learn: 85.4232577	total: 13.5s	remaining: 562ms
960:	learn: 85.3551701	total: 13.5s	remaining: 548ms
961:	learn: 85.3363899	total: 13.5s	remaining: 534ms
962:	learn: 85.3068946	total: 13.5s	remaining: 520ms
963:	learn: 85.2223083	total: 13.5s	remaining: 506ms
964:	learn: 85.1828263	total: 13.6s	remaining: 492ms
965:	learn: 85.0945518	total: 13.6s	remaining:

<catboost.core.CatBoostRegressor at 0x1818397a850>

In [93]:
from sklearn import metrics

predictions = cat.predict(X_train)
pred_train = pd.Series(predictions)

print('MAE train', metrics.mean_absolute_error(y_train, pred_train))
print('MSE train', metrics.mean_squared_error(y_train, pred_train))
print('RMSE train', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
print('R2 train', cat.score(X_train,y_train))

predictions = cat.predict(X_test)
pred_test = pd.Series(predictions)
print('---------')
print('MAE test', metrics.mean_absolute_error(y_test, pred_test))
print('MSE test', metrics.mean_squared_error(y_test, pred_test))
print('RMSE test', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
print('R2 test', cat.score(X_test,y_test))

MAE train 104.25540932810134
MSE train 24091.579666061443
RMSE train 155.21462452379106
R2 train 0.9526988330021499
---------
MAE test 170.23839852555986
MSE test 63270.908809853114
RMSE test 251.53709231414183
R2 test 0.834458125909689


# Hyperoptimization

In [271]:

def hyperopt_train_test(params):
    model = CatBoostRegressor(**params,cat_features=cat_features,iterations=400)
    return cross_val_score(model,X_train,y_train,
                           cv=10,n_jobs=7).mean()

space = {
    'max_depth': hp.choice('max_depth',np.arange(1,8)),
    'loss_function': 'MAE',
    'task_type':'CPU',
    'random_state':10,
    'learning_rate': hp.uniform('learning_rate',0,1),
    'subsample': hp.uniform('subsample',0.2,1),
    'random_strength': hp.uniform('random_strength',0,1),
    'bagging_temperature':hp.uniform('bagging_temperature',0,1),
    "colsample_bylevel": hp.uniform("colsample_bylevel", 0, 1),
    'verbose': True,
    'reg_lambda': hp.uniform('reg_lambda', 0.5, 3),
    'early_stopping_rounds':993
}

def f(params):
    msee = hyperopt_train_test(params)
    return {'loss':-msee,'status': STATUS_OK}

trials = Trials()
best = fmin(f, space,algo=tpe.suggest, max_evals=270,trials=trials)
#space_eval(space, best)

100%|███████████████████████████████████████████| 270/270 [1:03:25<00:00, 14.09s/trial, best loss: -0.8405643326952953]


In [272]:
best

{'bagging_temperature': 0.5170671759562941,
 'colsample_bylevel': 0.8412167370635466,
 'learning_rate': 0.15324189403092525,
 'max_depth': 4,
 'random_strength': 0.5903333542461364,
 'reg_lambda': 0.9663295896207653,
 'subsample': 0.7683818191755833}

In [96]:
best = {'bagging_temperature': 0.7421565577099036,
 'colsample_bylevel': 0.5003107719868112,
 'learning_rate': 0.13705077055627676,
 'max_depth': 2,
 'random_strength': 0.9972462913338194,
 'reg_lambda': 0.7713789160987427,
 'subsample': 0.9850500004740638}


colsample_bylevel= best['colsample_bylevel']
learning_rate= best['learning_rate']

max_depth= best['max_depth']
subsample = best['subsample']
bagging_temperature = best['bagging_temperature']
random_strength = best['random_strength']
subsample = best['subsample']
reg_lambda = best['reg_lambda']

cat = CatBoostRegressor(iterations=10000,                  # Subir las iteraciones mejora el modelo
                       task_type='CPU',
                       loss_function= 'MAE',
                       verbose=False,
                       random_state=10,
                       random_strength = random_strength,
                       colsample_bylevel=colsample_bylevel,
                       learning_rate=learning_rate,
                       max_depth=max_depth,
                       bagging_temperature = bagging_temperature,
                       reg_lambda= reg_lambda,
                       subsample = subsample,
                       )
cat.fit(X_train,y_train,cat_features=cat_features)

<catboost.core.CatBoostRegressor at 0x1818398c0d0>

In [97]:
predictions = cat.predict(X_train)
pred_train = pd.Series(predictions)
print('MAE train', metrics.mean_absolute_error(y_train, pred_train))
print('MSE train', metrics.mean_squared_error(y_train, pred_train))
print('RMSE train', np.sqrt(metrics.mean_squared_error(y_train, pred_train)))
print('R2 train', cat.score(X_train,y_train))

predictions = cat.predict(X_test)
pred_test = pd.Series(predictions)
print('---------')
print('MAE test', metrics.mean_absolute_error(y_test, pred_test))
print('MSE test', metrics.mean_squared_error(y_test, pred_test))
print('RMSE test', np.sqrt(metrics.mean_squared_error(y_test, pred_test)))
print('R2 test', cat.score(X_test,y_test))

MAE train 121.17019118176613
MSE train 40715.50592788666
RMSE train 201.78083637423714
R2 train 0.9200595821447944
---------
MAE test 160.63065546395856
MSE test 62790.6034564207
RMSE test 250.58053287600117
R2 test 0.8357147958364917


 ## Prod

In [98]:
df2 = pd.read_csv('data/test.csv')
X1 = df2.copy()

In [99]:
transfo(X1)
X1.drop(columns=['cpu_brand','memory_2'],inplace=True)
categ_func(X1)

numeric_features = X1.select_dtypes(include=['float64']).columns
X1[numeric_features] = sc.transform(X1[numeric_features])


  df['memory_1'] = df.memory_1.apply(lambda string: string.split(' ')[0]).str.replace('1.0TB','1TB').str.replace('TB','000').str.replace('GB','')


In [2]:
pred = cat.predict(X1)
final = pd.DataFrame(pred,columns=['Price_euros'])
final.insert(0,'id',df2.id)
final.to_csv('data/predictions.csv',sep=',',float_format='%g',index=False)

NameError: name 'xgb' is not defined