# Model Training

## 1.1 Import Data and Required Packages


In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import confusion_matrix

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
import matplotlib.pyplot as plt

from imblearn.over_sampling import SMOTE, RandomOverSampler
import warnings

In [32]:
# Load the training feature values and the matching training labels from the
# local data directory.
df_features, df_labels = (
    pd.read_csv("data/train_values.csv"),
    pd.read_csv("data/train_labels.csv"),
)

In [33]:
# Join features and labels on the shared key, then discard the key itself:
# it only identifies a row and is not used as a model input.
df = (
    df_features
    .merge(df_labels, on='building_id')
    .drop(columns='building_id')
)

### Preparing X and y variables

In [34]:
# Target vector: the damage grade column.
y = df["damage_grade"]
# Feature matrix: every remaining column.
X = df.drop(columns=["damage_grade"])

In [35]:
### Sanity Check
# X and y must have the same number of rows; the second entry of X's shape is
# the number of feature columns.
print(f"Shape of X={(X.shape)}, Shape of y={(y.shape)}")

Shape of X=(260601, 38), Shape of y=(260601,)


### Convert categorical data to numerical data

In [36]:
# Columns treated as categorical; each is replaced by integer category codes.
cat_features = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]
# Label-encode every categorical column of the training features via the
# pandas `category` dtype (each distinct value maps to one integer code).
X[cat_features] = X[cat_features].apply(
    lambda column: column.astype('category').cat.codes
)

### Scaling the dataset

In [37]:
# Standardize every column of X with a default-configured StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split performed later in this notebook, so held-out rows contribute to the
# scaling statistics. Consider fitting on the training split only.
# NOTE(review): the label-encoded categorical columns are scaled as well.
scaler = StandardScaler()
# X is rebound to the transformed data (presumably a NumPy array rather than a
# DataFrame from here on — TODO confirm).
X = scaler.fit_transform(X)

### Splitting into train, validation and test sets

In [38]:
# Seed passed to train_test_split later in this notebook so the split is reproducible.
RANDOM_SEED = 42

* Number of data points in Train set : 80 % of total no. of data points
* Number of data points in held-out set (`X_test` / `y_test`, reported as the "Validation Set" below) : 20 % of total no. of data points


In [39]:
# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

### Create an evaluation function to report all metrics after model training

In [40]:
def evaluate_model(true, predicted):
    """
    Plot the confusion matrix, print the classification report and return
    the micro-averaged F1 score.

    Parameters
    ----------
    true : 1-D array-like
        Ground-truth class labels.
    predicted : 1-D array-like
        Predicted class labels, aligned element-wise with ``true``.

    Returns
    -------
    float
        ``f1_score(true, predicted, average='micro')``.
    """
    # Derive the class labels from the data instead of hard-coding 1/2/3, so
    # the row/column names always match the matrix shape (for example when a
    # class is missing from both inputs, or the label set changes).
    labels = sorted(set(true) | set(predicted))
    # Pass the labels explicitly so matrix rows/columns follow the same order
    # as the names generated below.
    cm = confusion_matrix(true, predicted, labels=labels)
    cr = classification_report(true, predicted)
    conf_matrix = pd.DataFrame(
        data=cm,
        columns=[f'Predicted:{label}' for label in labels],
        index=[f'Actual:{label}' for label in labels],
    )
    # confusion matrix in heatmap, drawn on its own explicit axes
    fig, ax = plt.subplots()
    sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='g', ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title('Confusion Matrix')
    plt.show()
    print(cr)
    f1_micro = f1_score(true, predicted, average='micro')
    return f1_micro

## Model Training

### i. Without oversampling or undersampling

In [None]:
# Baseline: fit each candidate classifier on the original (non-resampled)
# training split and record its micro-averaged F1 on the held-out split.
# Every estimator is seeded with RANDOM_SEED so the scores are reproducible.
models = {
    "Random Forest": RandomForestClassifier(random_state=RANDOM_SEED),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=RANDOM_SEED),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=RANDOM_SEED),
    "DecisionTree Classifier": DecisionTreeClassifier(random_state=RANDOM_SEED)
}

model_list = []  # model names, in evaluation order
f1_micro = []    # held-out micro-averaged F1, aligned with model_list

for model_name, model in models.items():
    model.fit(X_train, y_train)

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate on the training split and on the held-out split
    # (each call also shows a confusion matrix and a classification report)
    model_train_f1 = evaluate_model(y_train, y_train_pred)
    model_test_f1 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model Performance for Training Set")
    print("F1 Micro-Averaged Score: {:.4f}".format(model_train_f1))
    print("--------------------------------------")

    print("Model Performance for Validation Set")
    print("F1 Micro-Averaged Score: {:.4f}".format(model_test_f1))

    f1_micro.append(model_test_f1)

    print('='*35)
    print('\n')

### Results

In [None]:
# Rank the baseline models by their held-out micro-averaged F1 score, best first.
results_without_sampling = pd.DataFrame(
    list(zip(model_list, f1_micro)),
    columns=['Model Name', 'Micro Averaged F1-Score'],
).sort_values(by=["Micro Averaged F1-Score"], ascending=False)

# Show the table as the cell output.
results_without_sampling

### ii. Oversampling

In [None]:
# Oversample the minority classes of the training split only (the held-out
# split is left untouched). The sampler is seeded so the synthetic samples,
# and therefore the downstream scores, are reproducible.
X_train_resample, y_train_resample = SMOTE(random_state=RANDOM_SEED).fit_resample(X_train, y_train)

In [None]:
# Same candidate classifiers as the baseline run, now fitted on the
# SMOTE-resampled training data and scored on the untouched held-out split.
# Every estimator is seeded with RANDOM_SEED so the scores are reproducible.
models = {
    "Random Forest": RandomForestClassifier(random_state=RANDOM_SEED),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=RANDOM_SEED),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=RANDOM_SEED),
    "DecisionTree Classifier": DecisionTreeClassifier(random_state=RANDOM_SEED)
}

model_list = []  # model names, in evaluation order
f1_micro = []    # held-out micro-averaged F1, aligned with model_list

for model_name, model in models.items():
    model.fit(X_train_resample, y_train_resample)

    # Make predictions
    y_train_pred = model.predict(X_train_resample)
    y_test_pred = model.predict(X_test)

    # Evaluate on the resampled training data and on the held-out split.
    # Training predictions are compared with the resampled training labels,
    # and held-out predictions with the held-out labels.
    model_train_f1 = evaluate_model(y_train_resample, y_train_pred)
    model_test_f1 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print("Model Performance for Training Set")
    print("F1 Micro-Averaged Score: {:.4f}".format(model_train_f1))
    print("--------------------------------------")

    print("Model Performance for Validation Set")
    print("F1 Micro-Averaged Score: {:.4f}".format(model_test_f1))

    f1_micro.append(model_test_f1)

    print('='*35)
    print('\n')

In [None]:
# Collect (model name, held-out micro-averaged F1) pairs into a table ...
results_with_smote = pd.DataFrame(
    list(zip(model_list, f1_micro)),
    columns=['Model Name', 'Micro Averaged F1-Score'],
)
# ... and order it from the best score to the worst.
results_with_smote = results_with_smote.sort_values(
    by=["Micro Averaged F1-Score"],
    ascending=False,
)

In [None]:
# Display the SMOTE results table (sorted by micro-averaged F1, best first).
results_with_smote