# <center>Preprocessing</center>

## Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Raise the pandas display limit so up to 999 rows are shown in cell output.
pd.options.display.max_rows = 999

## Path to the data

In [None]:
# Directory holding the raw CSV files; every input path is built from it.
PATH = 'Dataset'


def _in_dataset(file_name):
    """Return the path of *file_name* inside the dataset directory."""
    return '/'.join((PATH, file_name))


PATH_TO_train_data = _in_dataset('train.csv')
PATH_TO_test_data = _in_dataset('test.csv')
PATH_TO_building_structure = _in_dataset('Building_Structure.csv')
PATH_TO_building_ownership = _in_dataset('Building_Ownership_Use.csv')

In [None]:
# Load the training table from the CSV path defined above.
train_data = pd.read_csv(PATH_TO_train_data)

In [None]:
# Load the building structure and building ownership/use tables.
building_structure_data = pd.read_csv(PATH_TO_building_structure)
building_ownership_data = pd.read_csv(PATH_TO_building_ownership)

## Shape of different data sets

In [None]:
# (rows, columns) of the training table.
train_data.shape

In [None]:
# (rows, columns) of the building structure table.
building_structure_data.shape

In [None]:
# (rows, columns) of the building ownership table.
building_ownership_data.shape

## Distribution of classes: 5 tier classification problem

In [None]:
%matplotlib inline

In [None]:
# Bar chart of how many training rows fall into each damage grade.
# `kind` is passed by keyword because recent pandas releases reject
# positional arguments to Series.plot().
train_data.damage_grade.value_counts().plot(kind='bar')

## Merge building structure data with train data using "building_id"

In [None]:
# Attach the building structure columns to each training row, matching
# rows on the shared "building_id" key.
train_data = train_data.merge(building_structure_data, on='building_id')

In [None]:
# (rows, columns) of the training table after the structure merge.
train_data.shape

## Merge building ownership data with train data using "building_id"

In [None]:
# Attach the building ownership columns to each training row, matching
# rows on the shared "building_id" key.
train_data = train_data.merge(building_ownership_data, on='building_id')

In [None]:
# (rows, columns) of the training table after both merges.
train_data.shape

In [None]:
# Preview the first rows of the merged table.
train_data.head()

## Data types of all columns. Change all the non-numeric columns into one hot representation.

In [None]:
# List the dtype of every column in the merged table.
train_data.dtypes

## Store the names of all non-numeric columns in a list

In [None]:
# Names of the columns that are one-hot encoded further below.
object_type_columns = [
    'area_assesed',
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'condition_post_eq',
    'legal_ownership_status',
]

In [None]:
# Preview the table before the listed columns are one-hot encoded.
train_data.head()

## Change columns into one hot vectors

In [None]:
# Replace each listed column with indicator columns named
# "<column>_<value>"; drop_first omits the first level of every column.
train_data = pd.get_dummies(
    data=train_data,
    columns=object_type_columns,
    prefix_sep='_',
    drop_first=True,
)

In [None]:
# Preview the table after one-hot encoding.
train_data.head()

## Find the columns that have missing values

In [None]:
# Number of missing entries in each column of the training table.
columns_has_NaN = train_data.isnull().sum()

In [None]:
# Keep only the columns that actually contain missing values.
# A boolean mask is used because Series.nonzero() no longer exists in
# current pandas releases.
columns_has_NaN = columns_has_NaN[columns_has_NaN != 0]

In [None]:
# Display the per-column missing-value counts that remain after filtering.
columns_has_NaN

In [None]:
# Add up the per-column counts and report the overall number of gaps.
total_NaN = columns_has_NaN.sum()
print("Total number of missing values in the training data = {}".format(total_NaN))

## Replace missing values in the columns "has repair started" and "count_families" with zero

In [None]:
# Substitute 0 for every missing value in the training table.
train_data = train_data.fillna(0)

In [None]:
# Recount the missing values over the whole table after filling.
missing_per_column = train_data.isnull().sum()
total_NaN = missing_per_column.sum()
print("Total number of missing values in the training data = {}".format(total_NaN))

## Train data after preprocessing

In [None]:
# Preview the training table after encoding and missing-value filling.
train_data.head()

## Remove target value from the train data.

In [None]:
# Feature matrix: everything except the target and the building identifier.
non_feature_columns = ['damage_grade', 'building_id']
X_train = train_data.drop(non_feature_columns, axis=1)
X_train.head()

In [None]:
# Target kept as a one-column DataFrame named "damage_grade".
y_train = train_data['damage_grade'].to_frame()
y_train.head()

## Remove duplicate values from the data.

In [None]:
# (rows, columns) of the feature matrix before duplicate removal.
X_train.shape

In [None]:
# Boolean mask of rows whose feature values also occur in another row.
# keep=False flags every copy, including the first occurrence.
duplicates = X_train.duplicated(keep=False)

In [None]:
# Keep only the feature rows that were not flagged as duplicated.
X_train = X_train.loc[~duplicates]

In [None]:
# Apply the same mask to the targets so they stay aligned with X_train.
y_train = y_train.loc[~duplicates]

In [None]:
# One feature per remaining column of the feature matrix.
total_number_of_features = X_train.columns.size
print("Total number of features after pre processing = {}".format(total_number_of_features))

In [None]:
# Bar chart of the class distribution after duplicate removal.
# `kind` is passed by keyword because recent pandas releases reject
# positional arguments to Series.plot().
y_train.damage_grade.value_counts().plot(kind='bar')

## As the data is imbalanced, the SMOTE algorithm is used to balance it.

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE using a fixed seed so the result is repeatable.
# `fit_resample` is the current imbalanced-learn method name; the older
# `fit_sample` alias has been removed from recent releases.
smote = SMOTE(random_state=0)
os_data_X, os_data_y = smote.fit_resample(X_train, y_train.damage_grade)

In [None]:
# Wrap the resampled features and labels in DataFrames that reuse the
# original feature names and the "damage_grade" label name.
feature_names = X_train.columns
X_train_os = pd.DataFrame(os_data_X, columns=feature_names)
y_train_os = pd.DataFrame(os_data_y, columns=["damage_grade"])

In [None]:
# Shapes of the oversampled features and labels.
X_train_os.shape, y_train_os.shape

In [None]:
# Bar chart of the class distribution after oversampling.
# `kind` is passed by keyword because recent pandas releases reject
# positional arguments to Series.plot().
y_train_os.damage_grade.value_counts().plot(kind='bar')

In [None]:
# Preview the first oversampled labels.
y_train_os.head()

In [None]:
# Collect every oversampled row whose feature values occur more than once
# (keep=False includes all copies).
repeated_rows = X_train_os.duplicated(keep=False)
dup = X_train_os.loc[repeated_rows]

In [None]:
# A row count of 0 here means no duplicated feature rows were found.
dup.shape

As we can see, there are no duplicates in the train data.

## The rows do not appear to be in random order, so shuffle the data to avoid overfitting.

In [None]:
# Class distribution among the rows labelled up to 900000, plotted before
# shuffling. `kind` is passed by keyword because recent pandas releases
# reject positional arguments to Series.plot().
y_train_os.damage_grade.loc[:900000].value_counts().plot(kind='bar')

In [None]:
# Row counts of the feature matrix before and after oversampling.
no_of_rows_before_os, no_of_rows_after_os = X_train.shape[0], X_train_os.shape[0]

In [None]:
# Give all four tables a fresh 0..n-1 index so that row labels match
# row positions again after filtering and oversampling.
for frame in (X_train, y_train, X_train_os, y_train_os):
    frame.index = pd.RangeIndex(len(frame))

In [None]:
# (rows, columns) of the oversampled labels.
y_train_os.shape

In [None]:
import random

# Shuffled row labels 0..n-1, applied to the oversampled features and
# labels below. A dedicated, seeded generator keeps the shuffle (and hence
# the saved CSV files) identical from run to run, in line with the fixed
# random_state used for SMOTE, and leaves the global random state untouched.
index = list(range(no_of_rows_after_os))
random.Random(0).shuffle(index)

In [None]:
# Label each feature row with its shuffled position, then sort by label
# so the rows end up in the shuffled order.
relabelled_X = X_train_os.set_index(pd.Index(index))
X_train_os = relabelled_X.sort_index()

In [None]:
# Apply the same shuffled labels to the targets so they stay aligned with
# the shuffled feature rows.
relabelled_y = y_train_os.set_index(pd.Index(index))
y_train_os = relabelled_y.sort_index()

In [None]:
# Class distribution inside one slice of rows (labels 50000 to 100000)
# after shuffling. `kind` is passed by keyword because recent pandas
# releases reject positional arguments to Series.plot().
y_train_os.damage_grade.loc[50000:100000].value_counts().plot(kind='bar')

os - over sampling <br>
X_train, y_train are imbalanced. But, X_train_os and y_train_os are balanced after over sampling and shuffling.

## Create variants of the output variable.
1. Grade 1, Grade 2,..
2. 1, 2,...
3. One hot representation.

In [None]:
# Map the labels "Grade 1" .. "Grade 5" to the integers 1 .. 5.
categories = {'Grade {}'.format(level): level for level in range(1, 6)}
y_train_categories = y_train.damage_grade.map(categories).to_frame(name='damage_grade')
y_train_categories.head()

In [None]:
# Same "Grade N" -> N mapping, applied to the oversampled labels.
categories = {'Grade {}'.format(level): level for level in range(1, 6)}
y_train_os_categories = y_train_os.damage_grade.map(categories).to_frame(name='damage_grade')
y_train_os_categories.head()

In [None]:
# One indicator column per damage grade for the original labels.
y_train_one_hot = pd.get_dummies(data=y_train)
y_train_one_hot.head()

In [None]:
# One indicator column per damage grade for the oversampled labels.
y_train_os_one_hot = pd.get_dummies(data=y_train_os)
y_train_os_one_hot.head()

## Store all the variants of train data

In [None]:
# Write the original features and every label variant to the working
# directory, one CSV file per table.
train_outputs = (
    ('X_train.csv', X_train),
    ('y_train.csv', y_train),
    ('y_train_categories.csv', y_train_categories),
    ('y_train_one_hot.csv', y_train_one_hot),
)
for csv_name, frame in train_outputs:
    frame.to_csv(csv_name)

In [None]:
# Write the oversampled features and every label variant to the working
# directory, one CSV file per table.
oversampled_outputs = (
    ('X_train_os.csv', X_train_os),
    ('y_train_os.csv', y_train_os),
    ('y_train_os_categories.csv', y_train_os_categories),
    ('y_train_os_one_hot.csv', y_train_os_one_hot),
)
for csv_name, frame in oversampled_outputs:
    frame.to_csv(csv_name)

## Test data: Preprocess test data similar to train data.

In [None]:
# Load the test table and attach the structure and ownership columns,
# exactly as was done for the training table.
test_data = pd.read_csv(PATH_TO_test_data)
test_data = pd.merge(test_data, building_structure_data, on='building_id')
test_data = pd.merge(test_data, building_ownership_data, on='building_id')

# Encode every level here (no drop_first). Which indicator columns are
# kept is decided further down by aligning with the training features, so
# a level that is missing from, or only present in, the test data cannot
# shift the encoding relative to the training set.
test_data = pd.get_dummies(test_data, columns=object_type_columns, prefix_sep='_', drop_first=False)

# Per-column missing-value counts, restricted to columns that have any.
# A boolean mask is used because Series.nonzero() no longer exists in
# current pandas releases.
columns_has_NaN = test_data.isna().sum()
columns_has_NaN = columns_has_NaN[columns_has_NaN != 0]

test_data.fillna(0, inplace=True)

# Give the test features exactly the training feature columns, in the same
# order: indicator columns the training set does not have are dropped, and
# training columns absent from the test data are added as all zeros.
X_test = test_data.drop(columns=['building_id'])
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Preview the preprocessed test features.
X_test.head()

In [None]:
# Write the test features to the working directory.
X_test.to_csv('X_test.csv')