In [2]:
!pip install geopandas


Collecting geopandas
  Obtaining dependency information for geopandas from https://files.pythonhosted.org/packages/27/27/2687abaa2ac02b5814e2929a5033da1e5d132c19a904dc56f77f63fd6eb9/geopandas-0.14.0-py3-none-any.whl.metadata
  Downloading geopandas-0.14.0-py3-none-any.whl.metadata (1.5 kB)
Collecting fiona>=1.8.21 (from geopandas)
  Obtaining dependency information for fiona>=1.8.21 from https://files.pythonhosted.org/packages/b0/7f/2de46a2630f609b7520d74ffc7692d4969b1fa1dd3c82f62c7967183d365/Fiona-1.9.4.post1-cp311-cp311-win_amd64.whl.metadata
  Downloading Fiona-1.9.4.post1-cp311-cp311-win_amd64.whl.metadata (50 kB)
     ---------------------------------------- 0.0/50.6 kB ? eta -:--:--
     ---------------------------------------- 50.6/50.6 kB 2.7 MB/s eta 0:00:00
Collecting pyproj>=3.3.0 (from geopandas)
  Obtaining dependency information for pyproj>=3.3.0 from https://files.pythonhosted.org/packages/c8/5a/215a1894e50167d91b471d8fc413ca30034c48e5d3dfac78d12df4c840d5/pyproj-3.6.0-cp

In [105]:
#importing libraries
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [5]:

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4,
       'Mega Projects': 5}

In [198]:
## Read csvs

train_df = gpd.read_file('train.geojson', index_col=0)
test_df = gpd.read_file('test.geojson', index_col=0)

# Feature Engineering

In [199]:
# Create a copy of the test DataFrame to preserve the original data
test_df1 = test_df.copy()

In [200]:
# Create a copy of the train DataFrame to preserve the original data
train_df1 = train_df.copy()

In [201]:
# Count the number of occurrences for each unique value in the 'change_status_date1' column
train_df1['change_status_date1'].value_counts()

Land Cleared            98071
Prior Construction      91489
Greenland               84616
Construction Midway     15972
Construction Started     8147
Materials Dumped         8029
NA                       1577
Operational               890
Excavation                626
Construction Done         589
Name: change_status_date1, dtype: int64

In [202]:
# Count the number of occurrences for each unique value in the 'change_status_date2' column
train_df1['change_status_date2'].value_counts()

Land Cleared            86354
Prior Construction      76955
Greenland               69045
Construction Midway     31810
Construction Done       25716
Materials Dumped         9221
Construction Started     7346
NA                       1512
Operational              1167
Excavation                880
Name: change_status_date2, dtype: int64

In [203]:
# Count the number of occurrences for each unique value in the 'change_status_date3' column
train_df1['change_status_date3'].value_counts()

Land Cleared            71884
Construction Done       64616
Prior Construction      62987
Greenland               48238
Construction Midway     40710
Materials Dumped         9576
Construction Started     7203
NA                       2029
Operational              1760
Excavation               1003
Name: change_status_date3, dtype: int64

In [204]:
# Count the number of occurrences for each unique value in the 'change_status_date4' column
train_df1['change_status_date4'].value_counts()

Construction Done       112985
Land Cleared             58198
Prior Construction       47759
Construction Midway      41638
Greenland                27943
Materials Dumped         10203
Construction Started      5933
Operational               2635
NA                        1577
Excavation                1135
Name: change_status_date4, dtype: int64

In [205]:
# Count the number of occurrences for each unique value in the 'change_status_date5' column
train_df1['change_status_date5'].value_counts()

Construction Done       178240
Land Cleared             38172
Prior Construction       35432
Construction Midway      23718
Greenland                14819
Materials Dumped         10136
Operational               3720
Construction Started      3224
NA                        1512
Excavation                1033
Name: change_status_date5, dtype: int64

## train_df

In [206]:
# Filter out rows in train_df1 where 'change_status_date5' is not equal to 'NA'
train_df1 = train_df1[train_df1['change_status_date5'] != 'NA']

In [207]:
# Filter out rows in train_df1 where 'change_status_date1' is not equal to 'NA'
train_df1 = train_df1[train_df1['change_status_date1'] != 'NA']

In [208]:
# Filter out rows in train_df1 where 'change_status_date2' is not equal to 'NA'
train_df1 = train_df1[train_df1['change_status_date2'] != 'NA']

In [210]:
# Filter out rows in train_df1 where 'change_status_date4' is not equal to 'NA'
train_df1 = train_df1[train_df1['change_status_date4'] != 'NA']

In [216]:
# Filter out rows in train_df1 where 'geography_types' is not equal to 'None'
train_df1 = train_df1[train_df1['geography_types'] != 'None']

In [217]:
# Filter out rows in train_df1 where 'urban_types' is not equal to 'None'
train_df1 = train_df1[train_df1['urban_types'] != 'None']

### For the encoding of geography_types and urban_types, I did not use the one-hot encoder because the number of categories in the test_df and train_df variables for different geography_types was not the same. We have 180 categories in train_df and 159 in test_df. I opted for the LabelEncoder in order to maintain the same number of variables in both test_df and train_df

In [218]:
# Encoding the categorical column 'geography_types''
le = LabelEncoder()
train_df1['geography_types'] = le.fit_transform(train_df1.geography_types)

In [219]:
# Encoding the categorical column 'urban_types'
le = LabelEncoder()
train_df1['urban_types'] = le.fit_transform(train_df1.geography_types)

In [220]:
from shapely.geometry import shape

# Calculate the area for each polygon in train_df1

for index, row in train_df1.iterrows():
    geometry = row['geometry']
    superficie = shape(geometry).area
    train_df1.at[index, 'superficie'] = superficie

# Display the DataFrame with the new 'area' column
train_df1



Unnamed: 0,index,change_type,change_status_date1,change_status_date2,change_status_date3,change_status_date4,change_status_date5,date1,date2,date3,date4,date5,urban_types,geography_types,geometry,superficie
0,0,Commercial,Land Cleared,Construction Midway,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,135,135,"POLYGON ((116.97563 38.89002, 116.97590 38.890...",1.237159e-06
1,1,Commercial,Greenland,Greenland,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,156,156,"POLYGON ((116.97500 38.88969, 116.97524 38.889...",4.701495e-07
2,2,Commercial,Land Cleared,Land Cleared,Construction Done,Construction Done,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,156,156,"POLYGON ((116.97519 38.88847, 116.97568 38.888...",4.536881e-07
3,3,Commercial,Land Cleared,Land Cleared,Construction Midway,Construction Midway,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,135,135,"POLYGON ((116.97630 38.89017, 116.97730 38.890...",2.936349e-07
4,4,Commercial,Land Cleared,Land Cleared,Construction Started,Construction Midway,Construction Done,29-05-2014,13-09-2015,25-02-2017,10-10-2018,19-05-2020,135,135,"POLYGON ((116.97751 38.89037, 116.97854 38.890...",2.905206e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309999,309999,Commercial,Greenland,Greenland,Greenland,Greenland,Construction Done,18-11-2014,01-03-2016,01-01-2018,01-05-2019,01-04-2020,119,119,"POLYGON ((139.81493 35.60963, 139.81484 35.609...",3.080072e-08
310001,310001,Residential,Prior Construction,Prior Construction,Prior Construction,Land Cleared,Construction Done,18-11-2014,01-03-2016,01-01-2018,01-05-2019,01-04-2020,42,42,"POLYGON ((139.80642 35.60416, 139.80597 35.603...",1.428818e-07
310002,310002,Residential,Land Cleared,Land Cleared,Land Cleared,Land Cleared,Construction Done,18-11-2014,01-03-2016,01-01-2018,01-05-2019,01-04-2020,142,142,"POLYGON ((139.80857 35.60407, 139.80831 35.603...",6.767191e-08
310003,310003,Commercial,Materials Dumped,Land Cleared,Land Cleared,Land Cleared,Construction Done,18-11-2014,01-03-2016,01-01-2018,01-05-2019,01-04-2020,142,142,"POLYGON ((139.80927 35.60445, 139.80889 35.604...",8.923550e-08


In [221]:
# Encoding the categorical column 'change_status_date4'
le = LabelEncoder()
train_df1['change_status_date4'] = le.fit_transform(train_df1.change_status_date4)

In [222]:
# Encoding the categorical column 'change_status_date1'
le = LabelEncoder()
train_df1['change_status_date1'] = le.fit_transform(train_df1.change_status_date1)

In [223]:
# Encoding the categorical column 'change_status_date2'
le = LabelEncoder()
train_df1['change_status_date2'] = le.fit_transform(train_df1.change_status_date2)

In [224]:
# Encoding the categorical column 'change_status_date3'
le = LabelEncoder()
train_df1['change_status_date3'] = le.fit_transform(train_df1.change_status_date3)

In [225]:
# Encoding the categorical column 'change_status_date5'
le = LabelEncoder()
train_df1['change_status_date5'] = le.fit_transform(train_df1.change_status_date5)

In [227]:
from datetime import datetime
# Convert strings to datetime objects
train_df1['date1'] = train_df1['date1'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
train_df1['date2'] = train_df1['date2'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
train_df1['date3'] = train_df1['date3'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
train_df1['date4'] = train_df1['date4'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
train_df1['date5'] = train_df1['date5'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))

# Calculate the number of days between consecutive dates
train_df1['jours_entre_1_et_2'] = (train_df1['date2'] - train_df1['date1']).dt.days
train_df1['jours_entre_2_et_3'] = (train_df1['date3'] - train_df1['date2']).dt.days
train_df1['jours_entre_3_et_4'] = (train_df1['date4'] - train_df1['date3']).dt.days
train_df1['jours_entre_4_et_5'] = (train_df1['date5'] - train_df1['date4']).dt.days



In [228]:
# Remove the columns 'date1', 'date2', 'date3', 'date4', 'date5' from train_df1
train_df1 = train_df1.drop(columns=['date1'], axis=1)
train_df1 = train_df1.drop(columns=['date2'], axis=1)
train_df1 = train_df1.drop(columns=['date3'], axis=1)
train_df1 = train_df1.drop(columns=['date4'], axis=1)
train_df1 = train_df1.drop(columns=['date5'], axis=1)

In [229]:
# Remove the columns'geometry'from train_df1
train_df1 = train_df1.drop(columns=['geometry'], axis=1)

In [231]:
train_df1

Unnamed: 0,index,change_type,change_status_date1,change_status_date2,change_status_date3,change_status_date4,change_status_date5,urban_types,geography_types,superficie,jours_entre_1_et_2,jours_entre_2_et_3,jours_entre_3_et_4,jours_entre_4_et_5
0,0,Commercial,5,1,0,0,0,135,135,1.237159e-06,472,531,592,587
1,1,Commercial,4,4,0,0,0,156,156,4.701495e-07,472,531,592,587
2,2,Commercial,5,5,0,0,0,156,156,4.536881e-07,472,531,592,587
3,3,Commercial,5,5,1,1,0,135,135,2.936349e-07,472,531,592,587
4,4,Commercial,5,5,2,1,0,135,135,2.905206e-07,472,531,592,587
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
309999,309999,Commercial,4,4,4,4,0,119,119,3.080072e-08,469,671,485,336
310001,310001,Residential,8,8,8,5,0,42,42,1.428818e-07,469,671,485,336
310002,310002,Residential,5,5,5,5,0,142,142,6.767191e-08,469,671,485,336
310003,310003,Commercial,6,5,5,5,0,142,142,8.923550e-08,469,671,485,336


# test_df

In [None]:
# Filter out rows in test_df1 where 'change_status_date1' is not equal to 'NA'
test_df1 = test_df1[test_df1['change_status_date1'] != 'NA']

In [None]:
# Filter out rows in test_df1 where 'change_status_date2' is not equal to 'NA'
test_df1 = test_df1[test_df1['change_status_date2'] != 'NA']

In [None]:
# Filter out rows in train_df1 where 'change_status_date3' is not equal to 'NA'
train_df1 = train_df1[train_df1['change_status_date3'] != 'NA']

In [None]:
# Filter out rows in test_df1 where 'change_status_date4' is not equal to 'NA'
test_df1 = test_df1[test_df1['change_status_date4'] != 'NA']

In [None]:
# Filter out rows in test_df1 where 'change_status_date5' is not equal to 'NA'
test_df1 = test_df1[test_df1['change_status_date5'] != 'NA']

In [232]:
# Filter out rows in test_df1 where 'geography_types' is not equal to 'None'
test_df1 = test_df1[test_df1['geography_types'] != 'None']

In [233]:
# Filter out rows in test_df1 where 'urban_types' is not equal to 'None'
test_df1 = test_df1[test_df1['urban_types'] != 'None']

In [234]:
# Encoding the categorical column 'geography_types'
le = LabelEncoder()
test_df1['geography_types'] = le.fit_transform(test_df1.geography_types)

In [235]:
# Encoding the categorical column 'urban_types'
le = LabelEncoder()
test_df1['urban_types'] = le.fit_transform(test_df1.urban_types)

In [236]:
from shapely.geometry import shape
import pandas as pd

# Calculate the area for each polygon in train_df1
for index, row in test_df1.iterrows():
    geometry = row['geometry']
    superficie = shape(geometry).area
    test_df1.at[index, 'superficie'] = superficie
# Display the DataFrame with the new 'area' column
test_df1

Unnamed: 0,index,change_status_date1,change_status_date2,change_status_date3,change_status_date4,change_status_date5,date1,date2,date3,date4,date5,urban_types,geography_types,geometry,superficie
12,12,Prior Construction,Prior Construction,Prior Construction,Prior Construction,Land Cleared,20-11-2014,29-11-2015,16-05-2017,27-10-2018,19-03-2020,5,14,"POLYGON ((103.97511 36.01026, 103.97553 36.010...",1.676424e-07
13,13,Prior Construction,Prior Construction,Prior Construction,Land Cleared,Land Cleared,20-11-2014,29-11-2015,16-05-2017,27-10-2018,19-03-2020,5,14,"POLYGON ((103.97568 36.00978, 103.97568 36.009...",8.368769e-08
19,19,Greenland,Land Cleared,Land Cleared,Construction Done,Construction Done,20-11-2014,29-11-2015,16-05-2017,27-10-2018,19-03-2020,7,142,"POLYGON ((103.97320 36.00849, 103.97337 36.008...",5.485025e-08
29,29,Prior Construction,Prior Construction,Prior Construction,Prior Construction,Land Cleared,20-11-2014,29-11-2015,16-05-2017,27-10-2018,19-03-2020,7,142,"POLYGON ((103.97346 36.00801, 103.97389 36.007...",5.734509e-08
30,30,Prior Construction,Prior Construction,Prior Construction,Prior Construction,Land Cleared,20-11-2014,29-11-2015,16-05-2017,27-10-2018,19-03-2020,7,142,"POLYGON ((103.97345 36.00784, 103.97379 36.007...",9.594095e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121699,121699,Land Cleared,Land Cleared,Construction Done,Construction Done,Construction Done,30-07-2014,09-02-2015,02-03-2017,19-08-2018,22-01-2019,7,137,"POLYGON ((113.21206 29.37929, 113.21213 29.379...",1.468961e-08
121700,121700,Greenland,Construction Done,Construction Done,Construction Done,Construction Done,30-07-2014,09-02-2015,02-03-2017,19-08-2018,22-01-2019,7,141,"POLYGON ((113.21203 29.37706, 113.21212 29.376...",2.950373e-08
121701,121701,Greenland,Greenland,Greenland,Construction Midway,Construction Done,30-07-2014,09-02-2015,02-03-2017,19-08-2018,22-01-2019,7,141,"POLYGON ((113.21187 29.37609, 113.21188 29.375...",3.579027e-08
121702,121702,Land Cleared,Land Cleared,Construction Midway,Construction Done,Construction Done,30-07-2014,09-02-2015,02-03-2017,19-08-2018,22-01-2019,7,126,"POLYGON ((113.21158 29.37573, 113.21162 29.375...",2.281689e-08


In [237]:
# Encoding the categorical column 'change_status_date'
le = LabelEncoder()
test_df1['change_status_date4'] = le.fit_transform(test_df1.change_status_date4)

In [238]:
# Encoding the categorical column 'change_status_date'
le = LabelEncoder()
test_df1['change_status_date1'] = le.fit_transform(test_df1.change_status_date1)

In [239]:
# Encoding the categorical column 'change_status_date'
le = LabelEncoder()
test_df1['change_status_date2'] = le.fit_transform(test_df1.change_status_date2)

In [240]:
# Encoding the categorical column 'change_status_date'
le = LabelEncoder()
test_df1['change_status_date3'] = le.fit_transform(test_df1.change_status_date3)

In [241]:
# Encoding the categorical column 'change_status_date'
le = LabelEncoder()
test_df1['change_status_date5'] = le.fit_transform(test_df1.change_status_date5)

In [242]:
from datetime import datetime
# Convert strings to datetime objects
test_df1['date1'] = test_df1['date1'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
test_df1['date2'] = test_df1['date2'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
test_df1['date3'] = test_df1['date3'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
test_df1['date4'] = test_df1['date4'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))
test_df1['date5'] = test_df1['date5'].apply(lambda x: datetime.strptime(x, '%d-%m-%Y'))


# Calculate the number of days between consecutive dates
test_df1['jours_entre_1_et_2'] = (test_df1['date2'] - test_df1['date1']).dt.days
test_df1['jours_entre_2_et_3'] = (test_df1['date3'] - test_df1['date2']).dt.days
test_df1['jours_entre_3_et_4'] = (test_df1['date4'] - test_df1['date3']).dt.days
test_df1['jours_entre_4_et_5'] = (test_df1['date5'] - test_df1['date4']).dt.days


In [243]:
# Remove the columns 'date1', 'date2', 'date3', 'date4', 'date5' from test_df1
test_df1 = test_df1.drop(columns=['date1'], axis=1)
test_df1 = test_df1.drop(columns=['date2'], axis=1)
test_df1 = test_df1.drop(columns=['date3'], axis=1)
test_df1 = test_df1.drop(columns=['date4'], axis=1)
test_df1 = test_df1.drop(columns=['date5'], axis=1)

In [244]:
# Remove the columns 'geometry' from test_df1
test_df1 = test_df1.drop(columns=['geometry'], axis=1)

In [245]:
test_df1

Unnamed: 0,index,change_status_date1,change_status_date2,change_status_date3,change_status_date4,change_status_date5,urban_types,geography_types,superficie,jours_entre_1_et_2,jours_entre_2_et_3,jours_entre_3_et_4,jours_entre_4_et_5
12,12,8,8,8,8,5,5,14,1.676424e-07,374,534,529,509
13,13,8,8,8,5,5,5,14,8.368769e-08,374,534,529,509
19,19,4,5,5,0,0,7,142,5.485025e-08,374,534,529,509
29,29,8,8,8,8,5,7,142,5.734509e-08,374,534,529,509
30,30,8,8,8,8,5,7,142,9.594095e-08,374,534,529,509
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121699,121699,5,5,0,0,0,7,137,1.468961e-08,194,752,535,156
121700,121700,4,0,0,0,0,7,141,2.950373e-08,194,752,535,156
121701,121701,4,4,4,1,0,7,141,3.579027e-08,194,752,535,156
121702,121702,5,5,1,0,0,7,126,2.281689e-08,194,752,535,156


# prediction and Evaluation

In [248]:
# Defining the features  (train_x)  and target (train_y)
train_x =  train_df1.drop(columns = ['change_type'])
train_y = train_df1['change_type'].apply(lambda x: change_type_map[x])

In [256]:
test_x

Unnamed: 0,index,change_status_date1,change_status_date2,change_status_date3,change_status_date4,change_status_date5,urban_types,geography_types,superficie,jours_entre_1_et_2,jours_entre_2_et_3,jours_entre_3_et_4,jours_entre_4_et_5
12,12,8,8,8,8,5,5,14,1.676424e-07,374,534,529,509
13,13,8,8,8,5,5,5,14,8.368769e-08,374,534,529,509
19,19,4,5,5,0,0,7,142,5.485025e-08,374,534,529,509
29,29,8,8,8,8,5,7,142,5.734509e-08,374,534,529,509
30,30,8,8,8,8,5,7,142,9.594095e-08,374,534,529,509
...,...,...,...,...,...,...,...,...,...,...,...,...,...
121699,121699,5,5,0,0,0,7,137,1.468961e-08,194,752,535,156
121700,121700,4,0,0,0,0,7,141,2.950373e-08,194,752,535,156
121701,121701,4,4,4,1,0,7,141,3.579027e-08,194,752,535,156
121702,121702,5,5,1,0,0,7,126,2.281689e-08,194,752,535,156


In [259]:

## Train a simple OnveVsRestClassifier using featurized data
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(train_x, train_y)
pred_y = neigh.predict(test_x)
print (pred_y.shape)
pred_y

(102204,)


array([3, 3, 3, ..., 2, 2, 2], dtype=int64)

## Here are the difficulties I encountered and the solutions I found at each level.



### .For the training dataframe (train_df), I noticed that the categories in the "geography_types" variable were not all the same as in the test dataframe (test_df), in fact they were more numerous in the train_df. 

### .The data was voluminous and it took me about 4 minutes to read the JSON files.

### .The missing values were identified as "None" and "NA" and were deleted as I didn't know the meaning of None. On the other hand, I couldn't impute the NA values because I have no knowledge of the origin of these missing values, so I decided to delete them too in order to have a correct analysis. 

### .As far as adding the calculated geometric data to the dataframe is concerned, I encountered a problem when adding the table converted to dataframe, which led to the appearance of missing values (NAN). So I calculated the area of the polygons in the dataframe using the shapely.geometry library.

### .I have not been able to evaluate the performance of my model, as I am unable to extract the target labels from the test_df dataset, despite knowing the indexes.