In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
import datetime
from sklearn import preprocessing
from sklearn.cluster import KMeans

In [3]:
train_df = pd.read_csv("train.csv", delimiter=',')
train_df

Unnamed: 0.1,Unnamed: 0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,price
0,48298,2.0,bus,2006,auto,140,c4,150000,gasoline,citroen,0.0,49191,380.0,4267
1,81047,,,2016,,0,vito,150000,,mercedes_benz,,45896,,2457
2,92754,2.2,limousine,2010,manual,175,mondeo,125000,diesel,ford,0.0,59229,930.0,10374
3,46007,,,2000,auto,265,andere,150000,gasoline,ford,0.0,39365,680.0,7098
4,76981,,convertible,3,manual,109,2_reihe,150000,gasoline,peugeot,0.0,55271,,2365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,50429,1.4,limousine,2006,manual,75,golf,90000,gasoline,volkswagen,0.0,35745,500.0,4686
49996,64425,1.3,small car,4,manual,60,fiesta,150000,gasoline,ford,0.0,60386,,864
49997,90761,,limousine,1996,manual,150,5er,150000,gasoline,bmw,0.0,28309,130.0,2275
49998,39709,,limousine,2007,manual,122,1er,100000,diesel,bmw,0.0,83623,500.0,8144


In [4]:
train_df['registration_year'] = train_df['registration_year'].apply(lambda x: int('200' + str(x)) if len(str(x)) == 1 else x)
train_df['registration_year'] = train_df['registration_year'].apply(lambda x: int('20' + str(x)) if (len(str(x)) == 2) & (str(x)[0] in ['0', '1']) else x)
train_df['registration_year'] = train_df['registration_year'].apply(lambda x: int('19' + str(x)) if (len(str(x)) == 2) else x)

train_df

Unnamed: 0.1,Unnamed: 0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,price
0,48298,2.0,bus,2006,auto,140,c4,150000,gasoline,citroen,0.0,49191,380.0,4267
1,81047,,,2016,,0,vito,150000,,mercedes_benz,,45896,,2457
2,92754,2.2,limousine,2010,manual,175,mondeo,125000,diesel,ford,0.0,59229,930.0,10374
3,46007,,,2000,auto,265,andere,150000,gasoline,ford,0.0,39365,680.0,7098
4,76981,,convertible,2003,manual,109,2_reihe,150000,gasoline,peugeot,0.0,55271,,2365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,50429,1.4,limousine,2006,manual,75,golf,90000,gasoline,volkswagen,0.0,35745,500.0,4686
49996,64425,1.3,small car,2004,manual,60,fiesta,150000,gasoline,ford,0.0,60386,,864
49997,90761,,limousine,1996,manual,150,5er,150000,gasoline,bmw,0.0,28309,130.0,2275
49998,39709,,limousine,2007,manual,122,1er,100000,diesel,bmw,0.0,83623,500.0,8144


In [5]:
zipcodes_df = pd.read_csv("zipcodes.csv", delimiter=',')
zipcodes_df = zipcodes_df.groupby('zipcode').agg('mean')
zipcodes_df = zipcodes_df.drop(columns='Unnamed: 0')

zipcodes_df

Unnamed: 0_level_0,latitude,longitude
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1
1067,51.050000,13.750000
1069,51.050000,13.750000
1097,51.050000,13.750000
1099,51.050000,13.750000
1108,51.050000,13.750000
...,...,...
99988,51.173574,10.291829
99991,51.151364,10.554708
99994,51.238641,10.667855
99996,51.288995,10.580125


In [6]:
joined_df = pd.merge(train_df,zipcodes_df,on='zipcode',how='left')
joined_df.index = joined_df["Unnamed: 0"]
joined_df = joined_df.drop(columns="Unnamed: 0")
joined_df

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,price,latitude,longitude
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
48298,2.0,bus,2006,auto,140,c4,150000,gasoline,citroen,0.0,49191,380.0,4267,52.304760,8.128460
81047,,,2016,,0,vito,150000,,mercedes_benz,,45896,,2457,51.517500,7.085750
92754,2.2,limousine,2010,manual,175,mondeo,125000,diesel,ford,0.0,59229,930.0,10374,51.759720,7.896940
46007,,,2000,auto,265,andere,150000,gasoline,ford,0.0,39365,680.0,7098,52.150717,11.213856
76981,,convertible,2003,manual,109,2_reihe,150000,gasoline,peugeot,0.0,55271,,2365,49.912200,8.125280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50429,1.4,limousine,2006,manual,75,golf,90000,gasoline,volkswagen,0.0,35745,500.0,4686,50.683300,8.316670
64425,1.3,small car,2004,manual,60,fiesta,150000,gasoline,ford,0.0,60386,,864,50.108358,8.658331
90761,,limousine,1996,manual,150,5er,150000,gasoline,bmw,0.0,28309,130.0,2275,53.075160,8.807770
39709,,limousine,2007,manual,122,1er,100000,diesel,bmw,0.0,83623,500.0,8144,47.850000,11.600000


In [7]:


x_train, x_validation, y_train, y_validation = train_test_split(joined_df.drop('price', axis=1), 
                                                      joined_df.price, test_size=0.1, random_state = 13)


In [8]:
x_train['type'] = x_train['type'].fillna("unknown")
le_type = preprocessing.LabelEncoder()
le_type.fit(x_train["type"])
x_train["enc_type"] = le_type.transform(x_train["type"])
x_train

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,latitude,longitude,enc_type
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
9733,2.2,limousine,2004,manual,125,vectra,150000,diesel,opel,0.0,30419,100.0,52.384470,9.726930,3
8060,,unknown,2016,,90,a3,150000,diesel,audi,,58579,,51.242890,7.523670,7
62290,,limousine,2001,auto,118,3er,125000,gasoline,bmw,0.0,44339,180.0,51.504225,7.483654,3
98500,,coupé,2009,auto,245,3er,150000,diesel,bmw,0.0,60487,1260.0,50.116700,8.683330,2
81475,,small car,2005,manual,60,fiesta,150000,gasoline,ford,0.0,50767,210.0,50.933300,6.950000,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25323,,station wagon,2000,manual,136,5er,150000,diesel,bmw,,96317,120.0,50.233300,11.316700,6
66229,2.4,station wagon,2005,manual,175,156,100000,diesel,alfa_romeo,0.0,24589,260.0,54.179610,9.877133,6
88080,,unknown,2016,manual,0,3er,150000,gasoline,bmw,,27755,30.0,53.052270,8.633760,7
98814,,coupé,2011,auto,71,fortwo,40000,gasoline,smart,0.0,88634,370.0,47.850000,9.200000,2


In [9]:
x_train['gearbox'] = x_train['gearbox'].fillna("unknown")
le_gearbox = preprocessing.LabelEncoder()
le_gearbox.fit(x_train["gearbox"])
x_train["enc_gearbox"] = le_gearbox.transform(x_train["gearbox"])
x_train

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,latitude,longitude,enc_type,enc_gearbox
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9733,2.2,limousine,2004,manual,125,vectra,150000,diesel,opel,0.0,30419,100.0,52.384470,9.726930,3,1
8060,,unknown,2016,unknown,90,a3,150000,diesel,audi,,58579,,51.242890,7.523670,7,2
62290,,limousine,2001,auto,118,3er,125000,gasoline,bmw,0.0,44339,180.0,51.504225,7.483654,3,0
98500,,coupé,2009,auto,245,3er,150000,diesel,bmw,0.0,60487,1260.0,50.116700,8.683330,2,0
81475,,small car,2005,manual,60,fiesta,150000,gasoline,ford,0.0,50767,210.0,50.933300,6.950000,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25323,,station wagon,2000,manual,136,5er,150000,diesel,bmw,,96317,120.0,50.233300,11.316700,6,1
66229,2.4,station wagon,2005,manual,175,156,100000,diesel,alfa_romeo,0.0,24589,260.0,54.179610,9.877133,6,1
88080,,unknown,2016,manual,0,3er,150000,gasoline,bmw,,27755,30.0,53.052270,8.633760,7,1
98814,,coupé,2011,auto,71,fortwo,40000,gasoline,smart,0.0,88634,370.0,47.850000,9.200000,2,0


In [10]:
x_train['fuel'] = x_train['fuel'].fillna("unknown")
le_fuel = preprocessing.LabelEncoder()
le_fuel.fit(x_train["fuel"])
x_train["enc_fuel"] = le_fuel.transform(x_train["fuel"])
x_train

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,latitude,longitude,enc_type,enc_gearbox,enc_fuel
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
9733,2.2,limousine,2004,manual,125,vectra,150000,diesel,opel,0.0,30419,100.0,52.384470,9.726930,3,1,1
8060,,unknown,2016,unknown,90,a3,150000,diesel,audi,,58579,,51.242890,7.523670,7,2,1
62290,,limousine,2001,auto,118,3er,125000,gasoline,bmw,0.0,44339,180.0,51.504225,7.483654,3,0,2
98500,,coupé,2009,auto,245,3er,150000,diesel,bmw,0.0,60487,1260.0,50.116700,8.683330,2,0,1
81475,,small car,2005,manual,60,fiesta,150000,gasoline,ford,0.0,50767,210.0,50.933300,6.950000,5,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25323,,station wagon,2000,manual,136,5er,150000,diesel,bmw,,96317,120.0,50.233300,11.316700,6,1,1
66229,2.4,station wagon,2005,manual,175,156,100000,diesel,alfa_romeo,0.0,24589,260.0,54.179610,9.877133,6,1,1
88080,,unknown,2016,manual,0,3er,150000,gasoline,bmw,,27755,30.0,53.052270,8.633760,7,1,2
98814,,coupé,2011,auto,71,fortwo,40000,gasoline,smart,0.0,88634,370.0,47.850000,9.200000,2,0,2


In [11]:
le_brand = preprocessing.LabelEncoder()
le_brand.fit(x_train["brand"])
x_train["enc_brand"] = le_brand.transform(x_train["brand"])

x_train['model'] = x_train['model'].fillna("unknown")
le_model = preprocessing.LabelEncoder()
le_model.fit(x_train["model"])
x_train["enc_model"] = le_model.transform(x_train["model"])
x_train

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,latitude,longitude,enc_type,enc_gearbox,enc_fuel,enc_brand,enc_model
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
9733,2.2,limousine,2004,manual,125,vectra,150000,diesel,opel,0.0,30419,100.0,52.384470,9.726930,3,1,1,24,231
8060,,unknown,2016,unknown,90,a3,150000,diesel,audi,,58579,,51.242890,7.523670,7,2,1,1,28
62290,,limousine,2001,auto,118,3er,125000,gasoline,bmw,0.0,44339,180.0,51.504225,7.483654,3,0,2,2,11
98500,,coupé,2009,auto,245,3er,150000,diesel,bmw,0.0,60487,1260.0,50.116700,8.683330,2,0,1,2,11
81475,,small car,2005,manual,60,fiesta,150000,gasoline,ford,0.0,50767,210.0,50.933300,6.950000,5,1,2,10,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25323,,station wagon,2000,manual,136,5er,150000,diesel,bmw,,96317,120.0,50.233300,11.316700,6,1,1,2,15
66229,2.4,station wagon,2005,manual,175,156,100000,diesel,alfa_romeo,0.0,24589,260.0,54.179610,9.877133,6,1,1,0,3
88080,,unknown,2016,manual,0,3er,150000,gasoline,bmw,,27755,30.0,53.052270,8.633760,7,1,2,2,11
98814,,coupé,2011,auto,71,fortwo,40000,gasoline,smart,0.0,88634,370.0,47.850000,9.200000,2,0,2,32,107


In [12]:
# x_train['latitude'] = x_train['latitude'].fillna(x_train.latitude.mean())
# x_train['longitude'] = x_train['longitude'].fillna(x_train.longitude.mean())
x_train['latitude'] = x_train['latitude'].fillna(-1)
x_train['longitude'] = x_train['longitude'].fillna(-1)
geo_cluster = KMeans(n_clusters=100, random_state=0)
geo_cluster.fit(x_train[['longitude', 'latitude']])

x_train["cluster"] = geo_cluster.predict(x_train[['longitude', 'latitude']])
x_train

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,zipcode,insurance_price,latitude,longitude,enc_type,enc_gearbox,enc_fuel,enc_brand,enc_model,cluster
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
9733,2.2,limousine,2004,manual,125,vectra,150000,diesel,opel,0.0,30419,100.0,52.384470,9.726930,3,1,1,24,231,18
8060,,unknown,2016,unknown,90,a3,150000,diesel,audi,,58579,,51.242890,7.523670,7,2,1,1,28,92
62290,,limousine,2001,auto,118,3er,125000,gasoline,bmw,0.0,44339,180.0,51.504225,7.483654,3,0,2,2,11,50
98500,,coupé,2009,auto,245,3er,150000,diesel,bmw,0.0,60487,1260.0,50.116700,8.683330,2,0,1,2,11,54
81475,,small car,2005,manual,60,fiesta,150000,gasoline,ford,0.0,50767,210.0,50.933300,6.950000,5,1,2,10,103,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25323,,station wagon,2000,manual,136,5er,150000,diesel,bmw,,96317,120.0,50.233300,11.316700,6,1,1,2,15,26
66229,2.4,station wagon,2005,manual,175,156,100000,diesel,alfa_romeo,0.0,24589,260.0,54.179610,9.877133,6,1,1,0,3,23
88080,,unknown,2016,manual,0,3er,150000,gasoline,bmw,,27755,30.0,53.052270,8.633760,7,1,2,2,11,48
98814,,coupé,2011,auto,71,fortwo,40000,gasoline,smart,0.0,88634,370.0,47.850000,9.200000,2,0,2,32,107,97


In [13]:
x_validation['type'] = x_validation['type'].fillna("unknown")
x_validation["enc_type"] = le_type.transform(x_validation["type"])

x_validation['gearbox'] = x_validation['gearbox'].fillna("unknown")
x_validation["enc_gearbox"] = le_gearbox.transform(x_validation["gearbox"])

x_validation['fuel'] = x_validation['fuel'].fillna("unknown")
x_validation["enc_fuel"] = le_fuel.transform(x_validation["fuel"])

x_validation["enc_brand"] = le_brand.transform(x_validation["brand"])

x_validation['model'] = x_validation['model'].fillna("unknown")
x_validation.loc[~x_validation["model"].isin(x_train["model"]), "model"] = "unknown"
x_validation["enc_model"] = le_model.transform(x_validation["model"])

# x_validation['latitude'] = x_validation['latitude'].fillna(x_train.latitude.mean())
# x_validation['longitude'] = x_validation['longitude'].fillna(x_train.longitude.mean())
x_validation['latitude'] = x_validation['latitude'].fillna(-1)
x_validation['longitude'] = x_validation['longitude'].fillna(-1)
x_validation["cluster"] = geo_cluster.predict(x_validation[['longitude', 'latitude']])


In [14]:
x_train['insurance_is_null'] = x_train['insurance_price'].isna()
x_train['damage_is_null'] = x_train['damage'].isna()
x_train['mileage_small'] = x_train['mileage'] < 100000
x_train['model_is_nan'] = x_train['model'].isna()
x_validation['insurance_is_null'] = x_validation['insurance_price'].isna()
x_validation['damage_is_null'] = x_validation['damage'].isna()
x_validation['mileage_small'] = x_validation['mileage'] < 100000
x_validation['model_is_nan'] = x_validation['model'].isna()

In [15]:
x_train.columns

Index(['engine_capacity', 'type', 'registration_year', 'gearbox', 'power',
       'model', 'mileage', 'fuel', 'brand', 'damage', 'zipcode',
       'insurance_price', 'latitude', 'longitude', 'enc_type', 'enc_gearbox',
       'enc_fuel', 'enc_brand', 'enc_model', 'cluster', 'insurance_is_null',
       'damage_is_null', 'mileage_small', 'model_is_nan'],
      dtype='object')

In [16]:
categorical_feature=["enc_type", "enc_gearbox", "enc_fuel", "enc_brand", "enc_model", "insurance_is_null", "damage_is_null",  "model_is_nan", "cluster", "mileage_small",]
for c in categorical_feature:
    x_train[c] = x_train[c].astype('category')
    x_validation[c] = x_validation[c].astype('category')

In [17]:
eval_set = [(x_validation.drop(['type', "gearbox", "model", "fuel", "zipcode", "brand", "latitude", "longitude", ], 1), y_validation)]

In [30]:
model3 = lgb.LGBMRegressor(random_state=13, max_depth=10, num_leaves = 450, boosting='dart', n_estimators=3000, reg_sqrt=True, objective="mape", lambda_l2=1, lambda_l1=0, feature_fraction=0.7, max_bin=300, learning_rate=0.1, min_child_samples=30, cat_smooth=40, max_cat_to_onehot=3)
model3.fit(x_train.drop(['type', "model", "gearbox", "fuel", "zipcode", "brand", "latitude", "longitude", ], 1),y_train,
           feature_name = ['engine_capacity', 'registration_year', 'power', 'cluster',
         'damage', 'mileage', "mileage_small",
       'insurance_price', 'enc_type', 'enc_gearbox',
       'enc_fuel', 'enc_brand', 'enc_model',  'insurance_is_null', 'damage_is_null', 'model_is_nan'], verbose = -1, categorical_feature=categorical_feature, eval_set=eval_set,
    eval_metric="mape")

New categorical_feature is ['cluster', 'damage_is_null', 'enc_brand', 'enc_fuel', 'enc_gearbox', 'enc_model', 'enc_type', 'insurance_is_null', 'mileage_small', 'model_is_nan']




LGBMRegressor(boosting='dart', cat_smooth=40, feature_fraction=0.7, lambda_l1=0,
              lambda_l2=1, max_bin=300, max_cat_to_onehot=3, max_depth=10,
              min_child_samples=30, n_estimators=3000, num_leaves=450,
              objective='mape', random_state=13, reg_sqrt=True)

In [31]:
answers_train3 = model3.predict(x_validation.drop(['type', "gearbox", "model", "fuel", "zipcode", "brand", "latitude", "longitude",], 1))

In [32]:
def mean_absolute_percentage_error(y_true, y_pred): 
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [33]:
mean_absolute_percentage_error(y_validation, answers_train3)

21.80145430561156

In [34]:
test_df = pd.read_csv("test_no_target.csv", delimiter=',')

test_df['type'] = test_df['type'].fillna("unknown")
test_df["enc_type"] = le_type.transform(test_df["type"])

test_df['gearbox'] = test_df['gearbox'].fillna("unknown")
test_df["enc_gearbox"] = le_gearbox.transform(test_df["gearbox"])

test_df['fuel'] = test_df['fuel'].fillna("unknown")
test_df["enc_fuel"] = le_fuel.transform(test_df["fuel"])

test_df["enc_brand"] = le_brand.transform(test_df["brand"])

test_df['model'] = test_df['model'].fillna("unknown")
test_df.loc[~test_df["model"].isin(x_train["model"]), "model"] = "unknown"
test_df["enc_model"] = le_model.transform(test_df["model"])

test_df = pd.merge(test_df,zipcodes_df,on='zipcode',how='left')
test_df.index = test_df["Unnamed: 0"]
test_df = test_df.drop(columns="Unnamed: 0")

# test_df['latitude'] = test_df['latitude'].fillna(x_train.latitude.mean())
# test_df['longitude'] = test_df['longitude'].fillna(x_train.longitude.mean())
test_df['latitude'] = test_df['latitude'].fillna(-1)
test_df['longitude'] = test_df['longitude'].fillna(-1)
test_df["cluster"] = geo_cluster.predict(test_df[['longitude', 'latitude']])

test_df['registration_year'] = test_df['registration_year'].apply(lambda x: int('200' + str(x)) if len(str(x)) == 1 else x)
test_df['registration_year'] = test_df['registration_year'].apply(lambda x: int('20' + str(x)) if (len(str(x)) == 2) & (str(x)[0] in ['0', '1']) else x)
test_df['registration_year'] = test_df['registration_year'].apply(lambda x: int('19' + str(x)) if (len(str(x)) == 2) else x)

test_df['insurance_is_null'] = test_df['insurance_price'].isna()
test_df['damage_is_null'] = test_df['damage'].isna()
test_df['mileage_small'] = test_df['mileage'] < 100000
test_df['model_is_nan'] = test_df['model'].isna()

for c in categorical_feature:
    test_df[c] = test_df[c].astype('category')
    
test_df

Unnamed: 0_level_0,engine_capacity,type,registration_year,gearbox,power,model,mileage,fuel,brand,damage,...,enc_fuel,enc_brand,enc_model,latitude,longitude,cluster,insurance_is_null,damage_is_null,mileage_small,model_is_nan
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60314,1.6,small car,2013,manual,136,swift,40000,gasoline,suzuki,0.0,...,2,35,212,52.384470,9.726930,18,False,False,True,False
12566,,coupé,2004,auto,333,6er,150000,gasoline,bmw,0.0,...,2,2,18,51.462488,7.008645,62,False,False,False,False
17760,,station wagon,2006,auto,170,e_klasse,150000,diesel,mercedes_benz,0.0,...,1,20,97,51.566980,8.110620,15,False,False,False,False
8876,,limousine,1999,manual,101,astra,150000,gasoline,opel,,...,2,24,43,53.919857,9.518576,44,True,True,False,False
80392,,limousine,1975,manual,54,andere,150000,diesel,mercedes_benz,0.0,...,1,20,40,48.666700,9.216670,49,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93878,1.4,limousine,1999,manual,86,corolla,150000,gasoline,toyota,0.0,...,2,36,83,51.504225,7.483654,50,False,False,False,False
99783,,station wagon,2002,auto,184,3er,150000,diesel,bmw,0.0,...,1,2,11,51.679880,6.156480,32,False,False,False,False
57399,,small car,2005,manual,52,fox,100000,gasoline,volkswagen,0.0,...,2,38,108,50.833300,6.983330,80,False,False,False,False
97106,,bus,2001,manual,151,transporter,150000,diesel,volkswagen,0.0,...,1,38,220,52.484160,13.317215,5,False,False,False,False


In [35]:
answers_test = model3.predict(test_df.drop(['type', "gearbox", "model", "fuel", "zipcode", "brand", "latitude", "longitude",], 1))

In [36]:
res = pd.DataFrame(data=test_df.index, index=np.arange(len(test_df)),columns=['Id'])
res['Id'] = test_df.index
res['Predicted'] = answers_test
res.to_csv("submission.csv", index=False)

In [37]:
model3.booster_.save_model('lgbr_base_final.txt')

<lightgbm.basic.Booster at 0x1c6cc0a1190>

In [28]:
import pandas_profiling
train_df.profile_report()

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

