<h1> Imports

In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MultiLabelBinarizer

from feature_engine.creation import CyclicalTransformer


ModuleNotFoundError: No module named 'geopandas'

In [2]:
# Integer code assigned to every `change_type` label, in declaration order.
change_type_map = {
    label: code
    for code, label in enumerate(
        ['Demolition', 'Road', 'Residential', 'Commercial', 'Industrial', 'Mega Projects']
    )
}

In [3]:
## Read the GeoJSON datasets

# `index_col` is a pandas.read_csv option, not a documented parameter of
# geopandas.read_file: extra keyword arguments are forwarded to the underlying
# I/O engine, which does not know it. It is therefore not passed any more.
train_df_init = gpd.read_file('train.geojson')
test_df_init = gpd.read_file('test.geojson')

In [4]:
## Copying them
train_df = train_df_init.copy(deep=True)
test_df = test_df_init.copy(deep=True)

<h1> Cleaning data

In [5]:
# Preview the first five training rows.
train_df.head(n=5)

Unnamed: 0,index,change_type,change_status_date1,change_status_date2,change_status_date3,change_status_date4,change_status_date5,date1,date2,date3,date4,date5,urban_types,geography_types,geometry
0,0,Commercial,Land Cleared,Construction Midway,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Industrial,"River,Sparse Forest,Grass Land","POLYGON ((116.97563 38.89002, 116.97590 38.890..."
1,1,Commercial,Greenland,Greenland,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Sparse Urban,"Sparse Forest,Grass Land","POLYGON ((116.97500 38.88969, 116.97524 38.889..."
2,2,Commercial,Land Cleared,Land Cleared,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Sparse Urban,"Sparse Forest,Grass Land","POLYGON ((116.97519 38.88847, 116.97568 38.888..."
3,3,Commercial,Land Cleared,Land Cleared,Construction Midway,Construction Midway,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Industrial,"River,Sparse Forest,Grass Land","POLYGON ((116.97630 38.89017, 116.97730 38.890..."
4,4,Commercial,Land Cleared,Land Cleared,Construction Started,Construction Midway,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,Industrial,"River,Sparse Forest,Grass Land","POLYGON ((116.97751 38.89037, 116.97854 38.890..."


<h3>

<h2> Area for geometry

In [6]:
## Summarise each geometry by its area (another geometric feature could be chosen instead)
# The 'geometry' column is overwritten in place with the area value.
# NOTE(review): the area is expressed in the units of the file's coordinates;
# the coordinates shown by head() look like degrees - confirm this is intended.
for frame in (train_df, test_df):
    frame['geometry'] = frame['geometry'].map(lambda shape: shape.area)



<h2> One Hot encoding Urban types

In [7]:
feature = "urban_types"
# Label -> integer position mapping, kept for inspecting the distinct values.
class_maping = {label: idx for idx, label in enumerate(np.unique(train_df[feature]))}

# Careful : 15 different classes

# One-hot encode the training column; its dummy columns are the reference layout.
X_train_urban = pd.get_dummies(train_df[[feature]])

# The test column is encoded on its own and then aligned on the training layout:
# categories missing from the test set become all-zero columns, categories only
# present in the test set are dropped, and the column order and dtypes match train.
# Without this step train and test could end up with different feature columns.
X_test_urban = (
    pd.get_dummies(test_df[[feature]])
    .reindex(columns=X_train_urban.columns, fill_value=0)
    .astype(X_train_urban.dtypes.to_dict())
)

In [8]:
## Swap the raw urban_types column for its one-hot encoded columns
feature = "urban_types"
train_without_feature = train_df.drop(columns=[feature])
test_without_feature = test_df.drop(columns=[feature])
train_df = pd.concat([train_without_feature, X_train_urban], axis=1, sort=False)
test_df = pd.concat([test_without_feature, X_test_urban], axis=1, sort=False)


<h2> One Hot encoding geography

In [9]:
feature = "geography_types"
class_maping = dict((label, idx) for idx, label in enumerate(np.unique(train_df[feature])))
## Far too many distinct values for a plain one-hot encoding, see:
## https://stats.stackexchange.com/questions/411767/encoding-of-categorical-variables-with-high-cardinality

# Every value is a comma-separated list of labels: show one example, then
# collect the set of individual labels found across all distinct values.
all_geography = np.unique(train_df[feature])
print(all_geography[4].split(','))
visited = {word for word_list in all_geography for word in word_list.split(',')}
print(visited)

['Barren Land', 'Dense Forest', 'Grass Land']
{'Hills', 'None', 'River', 'Dense Forest', 'Barren Land', 'Snow', 'Farms', 'Desert', 'Lakes', 'Coastal', 'Sparse Forest', 'Grass Land'}


In [10]:
## Creating the one-hot (multi-label) encoding for the comma-separated geography lists
feature = "geography_types"
mlb = MultiLabelBinarizer()

# Turn every comma-separated string into a list of labels.
geo_train = train_df[feature]
geo_train = geo_train.map(lambda geo_list: geo_list.split(','))
geo_test = test_df[feature]
geo_test = geo_test.map(lambda geo_list: geo_list.split(','))

# Fit the binarizer on the training labels only, then reuse the fitted classes
# for the test set. Refitting on the test set (fit_transform) would build the
# columns from the test labels, which may not match the training columns.
# With transform(), labels never seen during fit are ignored (with a warning).
geo_train = pd.DataFrame(mlb.fit_transform(geo_train), columns=mlb.classes_, index=geo_train.index)
geo_test = pd.DataFrame(mlb.transform(geo_test), columns=mlb.classes_, index=geo_test.index)

In [11]:
## Swap the raw geography_types column for its binarized label columns
feature = "geography_types"
train_without_feature = train_df.drop(columns=[feature])
test_without_feature = test_df.drop(columns=[feature])
train_df = pd.concat([train_without_feature, geo_train], axis=1, sort=False)
test_df = pd.concat([test_without_feature, geo_test], axis=1, sort=False)

<h2> Handling date columns

In [12]:
feature = "date1"
# Map every distinct value of the column to its position in the sorted unique values.
class_maping = {}
for idx, label in enumerate(np.unique(train_df[feature])):
    class_maping[label] = idx


## More than 210 distinct values for the first date
## Need to check whether there is a correlation
## Plan: separate year, month and day

In [13]:
## Collect every date column in a dictionary, one dictionary per dataset
dates = ["date1", "date2", "date3", "date4", "date5"]
# Keys are the column names prefixed with the dataset they come from.
train_date_dic = {"train_%s" % date_name: train_df[date_name] for date_name in dates}
test_date_dic = {"test_%s" % date_name: test_df[date_name] for date_name in dates}


In [14]:
## Split every date string into integer year, month and day series
train_dates = ["train_date1", "train_date2", "train_date3", "train_date4", "train_date5"]
test_dates = ["test_date1", "test_date2", "test_date3", "test_date4", "test_date5"]

train_year_dic, train_month_dic, train_day_dic = {}, {}, {}
test_year_dic, test_month_dic, test_day_dic = {}, {}, {}


def _date_part(date_strings, start, stop, new_name):
    """Return the characters [start:stop] of each date string as an int series named `new_name`."""
    return date_strings.map(lambda text: int(text[start:stop])).rename(new_name)


# Character positions used below: day = [0:2], month = [3:5], year = [6:10].
for i, (train_key, test_key) in enumerate(zip(train_dates, test_dates)):
    suffix = str(i + 1)
    train_column = train_date_dic[train_key]
    test_column = test_date_dic[test_key]

    train_year_dic["train_year%s" % suffix] = _date_part(train_column, 6, 10, "year%s" % suffix)
    train_month_dic["train_month%s" % suffix] = _date_part(train_column, 3, 5, "month%s" % suffix)
    train_day_dic["train_day%s" % suffix] = _date_part(train_column, 0, 2, "day%s" % suffix)

    test_year_dic["test_year%s" % suffix] = _date_part(test_column, 6, 10, "year%s" % suffix)
    test_month_dic["test_month%s" % suffix] = _date_part(test_column, 3, 5, "month%s" % suffix)
    test_day_dic["test_day%s" % suffix] = _date_part(test_column, 0, 2, "day%s" % suffix)

In [15]:
## Replace the raw date columns by their year / month / day columns
dates = ["date1", "date2", "date3", "date4", "date5"]
train_df = train_df.drop(columns=dates)
test_df = test_df.drop(columns=dates)

# Gather the new columns in the order year, month, day for date 1 to 5,
# then append them to each frame with a single concatenation.
train_pieces = [train_df]
test_pieces = [test_df]
for i in range(1, 6):
    suffix = str(i)
    train_pieces.extend([
        train_year_dic["train_year%s" % suffix],
        train_month_dic["train_month%s" % suffix],
        train_day_dic["train_day%s" % suffix],
    ])
    test_pieces.extend([
        test_year_dic["test_year%s" % suffix],
        test_month_dic["test_month%s" % suffix],
        test_day_dic["test_day%s" % suffix],
    ])

train_df = pd.concat(train_pieces, axis=1, sort=False)
test_df = pd.concat(test_pieces, axis=1, sort=False)


In [16]:
## Month and day should not be used as plain integers:
## they are replaced by cyclical sin / cos variables.
## https://stats.stackexchange.com/questions/311494/best-practice-for-encoding-datetime-in-machine-learning

numbers = list(range(1, 6))
time_variables = ["month", "day"]
# month1, day1, month2, day2, ... month5, day5
variables = [
    "%s%s" % (time_variable, number)
    for number in numbers
    for time_variable in time_variables
]

## transform() needs the same columns as the frame used for fit:
## the change_type column is set aside before fitting and put back afterwards.
## (presumably test_df has no change_type column - verify)
cyclical = CyclicalTransformer(variables=variables, drop_original=True)
train_change_type_column = train_df["change_type"]
train_df = train_df.drop(columns=["change_type"])

# Fit on the training frame only, then apply the fitted transformer to the test frame.
train_df = cyclical.fit_transform(train_df)
test_df = cyclical.transform(test_df)

## Put the change_type column back, as the first column
train_df = pd.concat([train_change_type_column, train_df], axis=1, sort=False)

## Finally move 'index' to the front again
first_column = train_df.pop('index')
train_df.insert(0, 'index', first_column)

<h2> Last Step : Handling change_status_date

In [17]:
feature = "change_status_date4"
# Map every distinct status label of the column to its position in the sorted unique values.
class_maping = dict(
    (label, idx) for idx, label in enumerate(np.unique(train_df[feature]))
)

## 9 class labels for each column

In [18]:
## One-hot encoding for each change_status column
change_status_columns = ["change_status_date1", "change_status_date2", "change_status_date3", "change_status_date4", "change_status_date5"]

for feature in change_status_columns :

    # The training dummies define the reference layout for this column.
    X_train_status = pd.get_dummies(train_df[[feature]])

    # Align the test dummies on the training layout: statuses missing from the
    # test set become all-zero columns, statuses only present in the test set are
    # dropped, and the column order and dtypes match train. Encoding both sets
    # independently could otherwise yield different feature columns.
    X_test_status = (
        pd.get_dummies(test_df[[feature]])
        .reindex(columns=X_train_status.columns, fill_value=0)
        .astype(X_train_status.dtypes.to_dict())
    )

    # Replace the raw status column by its dummy columns in both frames.
    train_df = pd.concat([train_df.drop([feature], axis=1), X_train_status], sort=False, axis=1)
    test_df = pd.concat([test_df.drop([feature], axis=1), X_test_status], sort=False, axis=1)


<h2> Save the preprocessed dataframes as CSV files on the hard disk

In [20]:
# Write both preprocessed frames to disk; the frame index is stored in an 'Id' column.
for frame, csv_path in (
    (train_df, "preprocessed_train.csv"),
    (test_df, "preprocessed_test.csv"),
):
    frame.to_csv(csv_path, index=True, index_label='Id')

<h1> Preparation and reduction of the data

In [None]:
## The next goal is to select the best features, using one of the many
# dimensionality-reduction methods available, such as PCA.

<h1> Building the model

In [None]:
## Build a model on the smaller dataset first. Once the model is strong
# with few features, add other features and tune the hyperparameters.