### 0. Import Required Libraries

In [2]:
%matplotlib inline
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



## 1. Loading Data

In [3]:
df_test = pd.read_csv('../data/test_values.csv')
df_train = pd.read_csv('../data/train_values.csv')
df_label = pd.read_csv('../data/train_labels.csv')

In [4]:
print(df_train.shape)
print(df_label.shape)
print(df_test.shape)

(260601, 39)
(260601, 2)
(86868, 39)


## 2. Data Preparation

### 2.1. Data Cleaning

In [7]:
df_train_summary = pd.DataFrame({
    "Data type": df_train.dtypes,
    "Any nulls?": df_train.isnull().any(),
    "Unique values": df_train.nunique()
})
print(df_train_summary)

                                       Data type  Any nulls?  Unique values
building_id                                int64       False         260601
geo_level_1_id                             int64       False             31
geo_level_2_id                             int64       False           1414
geo_level_3_id                             int64       False          11595
count_floors_pre_eq                        int64       False              9
age                                        int64       False             42
area_percentage                            int64       False             84
height_percentage                          int64       False             27
land_surface_condition                    object       False              3
foundation_type                           object       False              5
roof_type                                 object       False              3
ground_floor_type                         object       False              5
other_floor_

Since there is no null, no data cleaning yet. (for rows)

### 2.2. Dropping Columns

In [5]:
columns_with_objects = df_train.select_dtypes(include=['object']).columns
print(columns_with_objects)

Index(['land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'legal_ownership_status'],
      dtype='object')


In [9]:
import preprocess as pp

df_numerical = pp.drop_columns(df_train, columns_with_objects)

In [15]:
df_numerical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260601 entries, 0 to 260600
Data columns (total 31 columns):
 #   Column                                  Non-Null Count   Dtype
---  ------                                  --------------   -----
 0   building_id                             260601 non-null  int64
 1   geo_level_1_id                          260601 non-null  int64
 2   geo_level_2_id                          260601 non-null  int64
 3   geo_level_3_id                          260601 non-null  int64
 4   count_floors_pre_eq                     260601 non-null  int64
 5   age                                     260601 non-null  int64
 6   area_percentage                         260601 non-null  int64
 7   height_percentage                       260601 non-null  int64
 8   has_superstructure_adobe_mud            260601 non-null  int64
 9   has_superstructure_mud_mortar_stone     260601 non-null  int64
 10  has_superstructure_stone_flag           260601 non-null  int64
 11  

### 2.3. Encoding

NO ENCODING YET V1

In [4]:
import encoding_category as ec
train_ec, test_ec = ec.encode_cat(df_train, df_test)

In [None]:
test_ec.info()

## 3. Modeling: Selection and Implementation

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Test-train split
X_train, X_test, y_train, y_test = train_test_split(df_numerical, df_label["damage_grade"], test_size=0.2, random_state=42)

# With out numerical data frame "df_numerical" we want to implement a random forest model
rf_model = RandomForestClassifier(criterion='gini',n_estimators=150,max_depth=4,n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predictions
preds = rf_model.predict(X_test)

# Evaluate the model
score = mean_absolute_error(y_test, preds)

# We want to evaluate our model with micro average f1 score
from sklearn.metrics import f1_score
f1_score(y_test, preds, average='micro')

0.5664319564091249

In [24]:
# We want to see how many times our model predicted each class
from collections import Counter
Counter(preds)

Counter({2: 52075, 1: 46})

## 4. Evaluation

## 5. Predictions Output