In [None]:
import geopandas as gpd


# Convert each GeoJSON split into an Excel workbook with the same base name.
# The loop variables intentionally reuse the names `geojson_file` and `xlsx`
# so that, once the cell has run, `geojson_file`, `gdf` and `xlsx` still refer
# to the test split (train is converted first, test second).
for geojson_file, xlsx in (
    ('train.geojson', 'train.xlsx'),
    ('test.geojson', 'test.xlsx'),
):
    gdf = gpd.read_file(geojson_file)
    # index=False: the row index is not written to the sheet.
    gdf.to_excel(xlsx, index=False)

In [None]:

import import_ipynb
import processing_data
import pandas as pd
# Run the project's preprocessing on each converted split (test first, then
# train). `processed_file` lives in the local `processing_data` module, so its
# behaviour is not visible here.
# NOTE(review): presumably it writes '<name>.xlsx' — later cells read
# 'train_processed.xlsx' and 'test_processed.xlsx' — confirm in processing_data.
df_test = pd.read_excel('test.xlsx')
processing_data.processed_file(df_test,'test_processed')

df_train = pd.read_excel('train.xlsx')
processing_data.processed_file(df_train,'train_processed')

From there, I used models that I thought were good based on the F1 score metric I was using and some grid search. Using these models, I predicted the labels I needed, and in the end I came 6th out of over 130 competitors in the Kaggle competition.

Using just one classifier

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Train one XGBoost model on the full processed training set, predict the
# processed test set, and write the predictions as a submission CSV.
X = pd.read_excel('train_processed.xlsx')

# Features are every column except the target label.
X_all = X.drop(columns=['change_type'])
y_all = X['change_type']

# xgboost needs integer class ids; the fitted encoder is kept so predictions
# can be decoded back to class names below.
label_encoder = LabelEncoder()
y_all_encoded = label_encoder.fit_transform(y_all)

dall = xgb.DMatrix(X_all, label=y_all_encoded)

params = {
    # multi:softmax makes predict() return the class id directly.
    'objective': 'multi:softmax',
    'num_class': len(set(y_all_encoded)),
    'eval_metric': 'mlogloss',
    'max_depth': 8,
    'learning_rate': 0.24
}

num_rounds = 1695
bst = xgb.train(params, dall, num_rounds)

# NOTE(review): assumes test_processed.xlsx has exactly the feature columns of
# X_all (no label column) — confirm against processing_data.
new_data = pd.read_excel('test_processed.xlsx')

X_new = new_data

dnew = xgb.DMatrix(X_new)

y_pred_encoded = bst.predict(dnew)

# Predictions come back as floats; cast to int before decoding to class names.
y_pred = label_encoder.inverse_transform(y_pred_encoded.astype(int))

print("Predictions on the new data:")
print(y_pred)

# Submission id for each class name. These ids are not necessarily the ones
# LabelEncoder assigned for training, hence the decode-then-remap round trip.
correspondance = {
    'Demolition': 0,
    'Road': 1,
    'Residential': 2,
    'Commercial': 3,
    'Industrial': 4,
    'Mega Projects': 5
}

def mapper(valeur):
    """Return the submission id for the class name `valeur`.

    Raises:
        KeyError: if `valeur` has no entry in `correspondance`. An unmapped
            label must fail loudly instead of leaking a class name into the
            numeric submission column.
    """
    try:
        return correspondance[valeur]
    except KeyError:
        raise KeyError(
            f"No submission id defined for predicted label {valeur!r}"
        ) from None

# Build the integer array explicitly: the dtype does not depend on the first
# element and an empty prediction array is handled as well.
y_pred = np.array([mapper(label) for label in y_pred], dtype=int)

# NOTE(review): the frame has a single unnamed column, so the CSV header is
# "Id,0" — confirm this matches the competition's sample submission.
pred_df = pd.DataFrame(y_pred)
pred_df.to_csv("sample_submission1.csv", index=True, index_label='Id')


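Majority voting (implemented as soft voting: the predicted class probabilities of three XGBoost models are averaged)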

In [None]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# Soft-voting ensemble of three XGBoost classifiers, fitted on the full
# processed training set and used to write a submission CSV.
data = pd.read_excel('train_processed.xlsx')

X = data.drop(columns=['change_type'])
y = data['change_type']

# Integer class ids for training; the fitted encoder decodes predictions below.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# No hold-out split in this cell: the ensemble is fitted on every labelled row
# and only used to predict the unlabelled test set.

# Three closely related configurations (same depth, slightly different number
# of trees / learning rate).
model1 = XGBClassifier(max_depth=8, n_estimators=1700, learning_rate=0.25)
model2 = XGBClassifier(max_depth=8, n_estimators=1695, learning_rate=0.24)
model3 = XGBClassifier(max_depth=8, n_estimators=1705, learning_rate=0.25)

# voting='soft': the ensemble combines the models' predicted class
# probabilities rather than counting their hard votes.
voting_clf = VotingClassifier(estimators=[('model1', model1), ('model2', model2), ('model3', model3)], voting='soft')

voting_clf.fit(X, y)

# NOTE(review): assumes test_processed.xlsx has exactly the feature columns of
# X (no label column) — confirm against processing_data.
new_data = pd.read_excel('test_processed.xlsx')

X_new = new_data

y_pred_encoded = voting_clf.predict(X_new)

y_pred = label_encoder.inverse_transform(y_pred_encoded.astype(int))

print("Predictions on the new data:")
print(y_pred)

# Submission id for each class name. These ids are not necessarily the ones
# LabelEncoder assigned for training, hence the decode-then-remap round trip.
correspondance = {
    'Demolition': 0,
    'Road': 1,
    'Residential': 2,
    'Commercial': 3,
    'Industrial': 4,
    'Mega Projects': 5
}

def mapper(valeur):
    """Return the submission id for the class name `valeur`.

    Raises:
        KeyError: if `valeur` has no entry in `correspondance`. An unmapped
            label must fail loudly instead of leaking a class name into the
            numeric submission column.
    """
    try:
        return correspondance[valeur]
    except KeyError:
        raise KeyError(
            f"No submission id defined for predicted label {valeur!r}"
        ) from None

# Build the integer array explicitly: the dtype does not depend on the first
# element and an empty prediction array is handled as well.
y_pred = np.array([mapper(label) for label in y_pred], dtype=int)

# NOTE(review): the frame has a single unnamed column, so the CSV header is
# "Id,0" — confirm this matches the competition's sample submission.
pred_df = pd.DataFrame(y_pred)
pred_df.to_csv("sample_submission_voting2.csv", index=True, index_label='Id')


Grid Search (after having already narrowed down a few possibilities)

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Hyper-parameter search for the XGBoost classifier, followed by a check on a
# 10% hold-out set that the search never sees.

# Search space: only learning_rate and n_estimators take several values
# (3 x 2 = 6 candidates); the single-value entries are held fixed.
param_grid = {
    'learning_rate': [0.25, 0.24, 0.26],
    'max_depth': [8],
    'n_estimators': [1700, 1695],
    'gamma': [0],
    'alpha': [0],
    'lambda': [0]
}

data = pd.read_excel('train_processed.xlsx')
X = data.drop(columns=['change_type'])

# Encode the class names as integer ids for xgboost.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['change_type'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

xgb_classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=len(set(y)),
    seed=42,
)

# Candidates are ranked by macro-averaged F1 over 3 cross-validation folds.
grid_search = GridSearchCV(
    estimator=xgb_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='f1_macro',
)
grid_search.fit(X_train, y_train)

print("Best parameters found:", grid_search.best_params_, sep="\n")

# Evaluate the winning configuration on the hold-out rows.
best_xgb_classifier = grid_search.best_estimator_
y_pred = best_xgb_classifier.predict(X_test)

# NOTE(review): this score is micro-averaged F1 (identical to accuracy for
# single-label multiclass), whereas the search above selects on macro F1.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='micro')
print("Model accuracy with best parameters:", f1)

print("Classification report:")
print(classification_report(y_test, y_pred))
