In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import scipy

# sklearn imports
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, make_scorer

In [47]:
# Read the training features, training labels and test features from the
# working directory; every frame is indexed by its building_id column.
train_data, train_labels, test_values = (
    pd.read_csv(csv_name, index_col='building_id')
    for csv_name in ('train_values.csv', 'train_labels.csv', 'test_values.csv')
)

In [48]:
# Alias, not a copy: train_values and train_data refer to the same DataFrame,
# so an in-place change made through one name is visible through the other.
train_values = train_data

In [49]:
# Print a summary of the training features: column names, non-null counts and dtypes.
train_values.info()

<class 'pandas.core.frame.DataFrame'>
Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
 #   Column                                  Non-Null Count   Dtype 
---  ------                                  --------------   ----- 
 0   geo_level_1_id                          260601 non-null  int64 
 1   geo_level_2_id                          260601 non-null  int64 
 2   geo_level_3_id                          260601 non-null  int64 
 3   count_floors_pre_eq                     260601 non-null  int64 
 4   age                                     260601 non-null  int64 
 5   area_percentage                         260601 non-null  int64 
 6   height_percentage                       260601 non-null  int64 
 7   land_surface_condition                  260601 non-null  object
 8   foundation_type                         260601 non-null  object
 9   roof_type                               260601 non-null  object
 10  ground_floor_type                       260601 non-null 

In [50]:
# pipeline building

def log_pipeline():
    """Build a two-step pipeline: element-wise natural log, then standardization.

    Returns:
        A new scikit-learn ``Pipeline`` consisting of a ``FunctionTransformer``
        (``np.log`` forward, ``np.exp`` inverse, one-to-one output feature
        names) followed by a ``StandardScaler``.
    """
    # NOTE(review): np.log produces -inf/NaN for zero or negative inputs;
    # confirm that any column routed through this pipeline is strictly positive.
    log_step = FunctionTransformer(
        func=np.log,
        inverse_func=np.exp,
        feature_names_out='one-to-one',
    )
    scale_step = StandardScaler()
    return make_pipeline(log_step, scale_step)
    
def cat_transformer():
    """Build the pipeline applied to categorical columns.

    Returns:
        A new scikit-learn ``Pipeline`` whose single step is a
        ``OneHotEncoder`` created with ``handle_unknown='ignore'``.
    """
    encoder = OneHotEncoder(handle_unknown='ignore')
    return make_pipeline(encoder)
    
# Object-dtype columns are treated as categorical and sent through
# cat_transformer(); every other column is passed through unchanged.
categorical_columns = train_values.select_dtypes(include=object).columns.to_list()

preprocessing = ColumnTransformer(
    transformers=[('cat', cat_transformer(), categorical_columns)],
    remainder='passthrough',
)

# Step names become the prefixes of the hyperparameter keys in param_grid.
# NOTE: the first step name is spelled 'preprocesing'; it is kept as-is because
# renaming it would change the parameter keys derived from it.
pipeline = Pipeline(steps=[
    ('preprocesing', preprocessing),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Candidate random-forest settings: 3 x 3 = 9 combinations.
param_grid = dict(
    classifier__n_estimators=[5, 50, 500],
    classifier__min_samples_leaf=[1, 5, 10],
)

# Micro-averaged F1, used as the model-selection score by the searches.
f1_scorer = make_scorer(f1_score, average='micro')

In [51]:
# Exhaustive search over param_grid with 3-fold cross-validation, scored by
# micro-averaged F1.
gs = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=3,
)
# ravel() flattens the label frame's values into a 1-D array for fitting.
gs.fit(train_values, train_labels.to_numpy().ravel())

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [67]:
# Randomized search over the same candidate space, with 3-fold cross-validation
# and micro-averaged F1 as the score.
#
# random_state is fixed so the order in which candidates are sampled (and thus
# tie-breaking between equally ranked candidates) is reproducible across runs.
#
# NOTE: param_grid holds only 9 combinations, fewer than the default n_iter=10,
# so scikit-learn evaluates every combination; the result is expected to match
# the exhaustive grid search. Pass a smaller n_iter (or distributions instead of
# lists) to get an actual random subsample.
rs = RandomizedSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring=f1_scorer,
    random_state=42,
)
# ravel() flattens the label frame's values into a 1-D array for fitting.
rs.fit(train_values, train_labels.values.ravel())

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [61]:
# Predictions of the fitted grid search for the test features; these rows
# belong to test_values and have no matching entries in train_labels.
gs_preds = gs.predict(test_values)

In [55]:
# Micro-averaged F1 of the grid-search model on the training data.
# The predictions are computed here from train_values: gs_preds holds the
# predictions for test_values, which do not line up with train_labels, so
# scoring gs_preds against train_labels is only possible when cells are run
# out of order. This is a training-set score, not a held-out estimate.
f1_score(train_labels, gs.predict(train_values), average='micro')

0.984297834620742

In [62]:
# Load the submission template; its index and columns are reused to shape the
# prediction frames that get written out.
submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

In [63]:
# Wrap the grid-search test predictions in a frame that has the same index and
# columns as the submission template.
my_submission = pd.DataFrame(
    gs_preds,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [64]:
# Write the current submission frame (index included) to the working directory.
my_submission.to_csv('submission2.csv')

'head' is not recognized as an internal or external command,
operable program or batch file.


In [69]:
# Predictions of the fitted randomized search on the training features.
rs_preds = rs.predict(train_values)

In [70]:
# Micro-averaged F1 of the randomized-search model on the data it was fitted
# on; this is a training-set score, not a held-out estimate.
f1_score(train_labels, rs_preds, average='micro')

0.984297834620742

In [71]:
# Predictions of the fitted randomized search for the test features.
rs_preds_test = rs.predict(test_values)

In [None]:
# Wrap the randomized-search test predictions in a frame that has the same
# index and columns as the submission template (rebinds my_submission).
my_submission = pd.DataFrame(
    rs_preds_test,
    index=submission_format.index,
    columns=submission_format.columns,
)

In [None]:
# Write the current submission frame (index included) to the working directory.
# NOTE(review): 'submission2.csv' is the same path an earlier cell writes, so
# this overwrites that file — confirm this is intended.
my_submission.to_csv('submission2.csv')