In [16]:
import pandas as pd
# Display floats with thousands separators and two decimals, padded to a
# 20-character field (affects how DataFrames are rendered, not the stored values).
pd.options.display.float_format = '{:20,.2f}'.format
import matplotlib.pyplot as plt
import numpy as np
# for preprocessing the data
from sklearn.preprocessing import StandardScaler

# the model
from sklearn.ensemble import RandomForestClassifier

# for combining the preprocess with model training
from sklearn.pipeline import make_pipeline

# for optimizing the hyperparameters of the pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import f1_score
from sklearn.tree import export_graphviz

from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

# Limit how many items pandas prints when pretty-printing a long sequence.
pd.options.display.max_seq_items = 20
from sklearn.linear_model import LogisticRegression

In [4]:
# Column dtypes shared by train_values.csv and test_values.csv.
# Declared once so both files are always parsed with the same schema
# (previously the mapping was duplicated for each read_csv call).
VALUES_DTYPES = {
    # geographic identifiers and numeric building attributes
    'geo_level_1_id': 'uint8',
    'geo_level_2_id': 'uint16',
    'geo_level_3_id': 'uint16',
    'count_floors_pre_eq': 'uint8',
    'age': 'uint16',
    'area_percentage': 'uint16',
    'height_percentage': 'uint16',
    'count_families': 'uint16',
    # categorical attributes
    'land_surface_condition': 'category',
    'foundation_type': 'category',
    'roof_type': 'category',
    'ground_floor_type': 'category',
    'other_floor_type': 'category',
    'position': 'category',
    'plan_configuration': 'category',
    'legal_ownership_status': 'category',
    # superstructure flags
    'has_superstructure_adobe_mud': 'bool',
    'has_superstructure_mud_mortar_stone': 'bool',
    'has_superstructure_stone_flag': 'bool',
    'has_superstructure_cement_mortar_stone': 'bool',
    'has_superstructure_mud_mortar_brick': 'bool',
    'has_superstructure_cement_mortar_brick': 'bool',
    'has_superstructure_timber': 'bool',
    'has_superstructure_bamboo': 'bool',
    'has_superstructure_rc_non_engineered': 'bool',
    'has_superstructure_rc_engineered': 'bool',
    'has_superstructure_other': 'bool',
    # secondary-use flags
    'has_secondary_use': 'bool',
    'has_secondary_use_agriculture': 'bool',
    'has_secondary_use_hotel': 'bool',
    'has_secondary_use_rental': 'bool',
    'has_secondary_use_institution': 'bool',
    'has_secondary_use_school': 'bool',
    'has_secondary_use_industry': 'bool',
    'has_secondary_use_health_post': 'bool',
    'has_secondary_use_gov_office': 'bool',
    # the doubled "use_use" is kept exactly as in the original mapping
    'has_secondary_use_use_police': 'bool',
    'has_secondary_use_other': 'bool',
}

# Labels and feature tables, all indexed by building_id.
df_train_labels_original = pd.read_csv(
    'train_labels.csv', low_memory=False, index_col='building_id')
df_train_values_original = pd.read_csv(
    'train_values.csv', low_memory=False, index_col='building_id',
    dtype=VALUES_DTYPES)
df_test_values_original = pd.read_csv(
    'test_values.csv', low_memory=False, index_col='building_id',
    dtype=VALUES_DTYPES)

In [5]:
# One-hot encode the training features with pandas' default get_dummies behaviour.
train_values_subset = pd.get_dummies(df_train_values_original)
# Target column to predict.
train_labels_subset = df_train_labels_original['damage_grade']

# Row count of the train table minus row count of the test table.
# NOTE(review): despite its name this looks like the position where the
# train/validation split is made, not the size of the validation part - confirm.
n_train_rows = len(df_train_values_original.index)
n_test_rows = len(df_test_values_original.index)
validation_size = n_train_rows - n_test_rows

In [6]:
# Position where the labelled data is cut into a training part and a
# validation part: the validation part is sized after the test table.
# Computed from the data instead of hard-coding the row number 173733.
split_index = len(train_values_subset) - len(df_test_values_original)

# The stop of -1 leaves the final row out of the validation part.
# NOTE(review): this may be an off-by-one, but the prediction files loaded
# later in the notebook were presumably produced from the same slice, so it
# is preserved here - confirm before changing it.
train_values = train_values_subset.iloc[:split_index]
validation_values = train_values_subset.iloc[split_index:-1]
train_labels = train_labels_subset.iloc[:split_index]
validation_labels = train_labels_subset.iloc[split_index:-1]

In [13]:
# Load the saved XGBoost probabilities; the first CSV column becomes the index.
xgBoostPredictions = pd.read_csv(
    'xgBoostPredictionProba.csv',
    index_col=0,
)
xgBoostPredictions

Unnamed: 0,xgb1,xgb2,xgb3
0,0.00,0.66,0.33
1,0.00,0.55,0.45
2,0.00,0.27,0.72
3,0.00,0.46,0.53
4,0.01,0.78,0.21
...,...,...,...
86862,0.00,0.48,0.52
86863,0.03,0.84,0.13
86864,0.00,0.06,0.94
86865,0.00,0.03,0.97


In [14]:
# Load the saved random-forest probabilities; the first CSV column becomes the index.
# NOTE(review): unlike the XGBoost file, this path has no ".csv" extension -
# confirm that this is the real file name.
rndForestPredictions = pd.read_csv(
    'randomForestPredictionProba',
    index_col=0,
)
rndForestPredictions

Unnamed: 0,rndf1,rndf2,rndf3
0,0.00,0.66,0.34
1,0.00,0.86,0.14
2,0.00,0.83,0.17
3,0.00,0.30,0.70
4,0.00,0.85,0.15
...,...,...,...
86862,0.00,0.73,0.27
86863,0.01,0.90,0.09
86864,0.00,0.01,0.99
86865,0.00,0.08,0.92


In [15]:
# Put both models' probability columns side by side (rows are aligned on the index).
# These columns are the inputs of the logistic-regression ensemble fitted below.
# NOTE(review): the name suggests test data, but the frame is later paired with
# validation_labels - presumably these are predictions for the validation rows.
base_model_outputs = [rndForestPredictions, xgBoostPredictions]
testData = pd.concat(base_model_outputs, axis='columns')
testData

Unnamed: 0,rndf1,rndf2,rndf3,xgb1,xgb2,xgb3
0,0.00,0.66,0.34,0.00,0.66,0.33
1,0.00,0.86,0.14,0.00,0.55,0.45
2,0.00,0.83,0.17,0.00,0.27,0.72
3,0.00,0.30,0.70,0.00,0.46,0.53
4,0.00,0.85,0.15,0.01,0.78,0.21
...,...,...,...,...,...,...
86862,0.00,0.73,0.27,0.00,0.48,0.52
86863,0.01,0.90,0.09,0.03,0.84,0.13
86864,0.00,0.01,0.99,0.00,0.06,0.94
86865,0.00,0.08,0.92,0.00,0.03,0.97


In [24]:
# Meta-model of the ensemble: a logistic regression with all default settings.
logisticReg = LogisticRegression()

In [32]:
# Hyperparameter grid for the logistic-regression meta-model.
# GridSearchCV needs every value to be a list of candidates (a bare string such
# as 'saga' is rejected), and not every penalty takes the same parameters, so
# the search space is expressed as a list of sub-grids:
#   * l1 / l2       -> tune C
#   * elasticnet    -> additionally needs l1_ratio, otherwise every fit fails
#   * no penalty    -> C has no effect, so it is not crossed with C
# NOTE(review): recent scikit-learn releases spell the unpenalized option as
# None instead of the string 'none' - confirm against the installed version.
C_VALUES = [0.5, 1, 2, 5]
param_grid = [
    {
        'solver': ['saga'],
        'penalty': ['l1', 'l2'],
        'C': C_VALUES,
    },
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.25, 0.5, 0.75],
        'C': C_VALUES,
    },
    {
        'solver': ['saga'],
        'penalty': ['none'],
    },
]

In [None]:
# Grid search over param_grid for the logistic-regression meta-model, using
# GridSearchCV's default cross-validation and scoring. Only constructed here.
gs = GridSearchCV(
    estimator=logisticReg,
    param_grid=param_grid,
)

In [25]:
# Fit the meta-model on the stacked base-model probabilities against the
# validation labels; the fitted estimator is the cell's displayed value.
logisticReg.fit(X=testData, y=validation_labels)

LogisticRegression()

In [27]:
# Predict labels for the same rows the meta-model was fitted on.
predictions = logisticReg.predict(testData)

In [28]:
# Micro-averaged F1 of the ensemble's predictions.
# NOTE(review): predictions come from the very rows used to fit logisticReg,
# so this is an in-sample score and is likely optimistic.
f1_score(y_true=validation_labels, y_pred=predictions, average='micro')

0.74561110663428

0.74561110663428: primer intento de ensamble, con solver = 'lbfgs' (F1 micro calculado sobre los mismos datos con los que se ajustó la regresión logística).