# Classification of Geospatial Data

### CHAFIK Hala  | EL OMARI Chaimae  |  DEMRI Lina

In [1]:
#Import libraries

import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score


In [None]:
## Read both GeoJSON splits, then mirror each of them to a flat CSV file

train_df, test_df = (
    gpd.read_file(geojson_path, index_col=0)
    for geojson_path in ('train.geojson', 'test.geojson')
)

for geo_frame, csv_path in ((train_df, 'train.csv'), (test_df, 'test.csv')):
    geo_frame.to_csv(csv_path, index=False)

In [2]:
# Reload both splits from their CSV copies; these frames are kept as the untouched originals
main_train_df, main_test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


In [3]:
# Work on copies so that the frames loaded from disk are never modified
train_df, test_df = main_train_df.copy(), main_test_df.copy()


Index(['urban_type', 'geography_type', 'img_red_mean_date1',
       'img_green_mean_date1', 'img_blue_mean_date1', 'img_red_std_date1',
       'img_green_std_date1', 'img_blue_std_date1', 'img_red_mean_date2',
       'img_green_mean_date2', 'img_blue_mean_date2', 'img_red_std_date2',
       'img_green_std_date2', 'img_blue_std_date2', 'img_red_mean_date3',
       'img_green_mean_date3', 'img_blue_mean_date3', 'img_red_std_date3',
       'img_green_std_date3', 'img_blue_std_date3', 'img_red_mean_date4',
       'img_green_mean_date4', 'img_blue_mean_date4', 'img_red_std_date4',
       'img_green_std_date4', 'img_blue_std_date4', 'img_red_mean_date5',
       'img_green_mean_date5', 'img_blue_mean_date5', 'img_red_std_date5',
       'img_green_std_date5', 'img_blue_std_date5', 'date0',
       'change_status_date0', 'date1', 'change_status_date1', 'date2',
       'change_status_date2', 'date3', 'change_status_date3', 'date4',
       'change_status_date4', 'index', 'geometry'],
      dtype='

In [5]:
# Peek at the first two rows of the working training frame
train_df.head(2)

Unnamed: 0,urban_type,geography_type,change_type,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,...,date1,change_status_date1,date2,change_status_date2,date3,change_status_date3,date4,change_status_date4,index,geometry
0,Sparse Urban,"Dense Forest,Grass Land",Road,93.371775,107.291113,89.827379,29.81204,28.328368,25.324294,125.773062,...,09-12-2013,Greenland,10-09-2016,Construction Started,22-07-2019,Construction Done,24-07-2017,Construction Midway,0,"POLYGON ((112.16774086470313 32.0219772550438,..."
1,Sparse Urban,"Dense Forest,Grass Land",Road,96.071674,107.061702,90.755556,24.89624,22.27518,22.080686,133.097679,...,09-12-2013,Greenland,10-09-2016,Land Cleared,22-07-2019,Construction Done,24-07-2017,Construction Midway,1,POLYGON ((112.16848748857684 32.02047741874698...


## I. Data Preprocessing and Feature Engineering

##### Handling urban and geography type through one-hot encoding

In [4]:
# A single row may list several comma-separated labels, so each column is first
# split on ',' and every resulting label is then one-hot encoded.
split_geography_type = train_df['geography_type'].str.split(',', expand=True)
split_urban_type = train_df['urban_type'].str.split(',', expand=True)

# stack() puts one label per line, get_dummies() encodes it, and the max over the
# original row index merges the labels of a row back into a single 0/1 vector.
# The '_A' / '_N' indicators are discarded (presumably the two halves of an
# 'N,A' placeholder cut by the split -- TODO confirm against the raw data).
one_hot_encoded_geo = (
    pd.get_dummies(split_geography_type.stack(), prefix='geography_type', dtype=int)
    .groupby(level=0)
    .max()
    .drop(columns=['geography_type_A', 'geography_type_N'])
)
one_hot_encoded_urb = (
    pd.get_dummies(split_urban_type.stack(), prefix='urban_type', dtype=int)
    .groupby(level=0)
    .max()
    .drop(columns=['urban_type_A', 'urban_type_N'])
)

# Append the indicator columns and retire the two raw text columns
train_df = pd.concat([train_df, one_hot_encoded_geo, one_hot_encoded_urb], axis=1)
train_df = train_df.drop(columns=['geography_type', 'urban_type'])

train_df.head()

Unnamed: 0,change_type,img_red_mean_date1,img_green_mean_date1,img_blue_mean_date1,img_red_std_date1,img_green_std_date1,img_blue_std_date1,img_red_mean_date2,img_green_mean_date2,img_blue_mean_date2,...,geography_type_Hills,geography_type_Lakes,geography_type_River,geography_type_Snow,geography_type_Sparse Forest,urban_type_Dense Urban,urban_type_Industrial,urban_type_Rural,urban_type_Sparse Urban,urban_type_Urban Slum
0,Road,93.371775,107.291113,89.827379,29.81204,28.328368,25.324294,125.773062,139.833243,134.900701,...,0,0,0,0,0,0,0,0,1,0
1,Road,96.071674,107.061702,90.755556,24.89624,22.27518,22.080686,133.097679,145.38519,137.092518,...,0,0,0,0,0,0,0,0,1,0
2,Road,101.212148,113.462178,95.670574,24.179684,21.873401,21.285197,120.71349,131.633447,124.436492,...,0,0,0,0,0,0,0,0,1,0
3,Road,94.463311,99.995531,84.470046,26.869852,23.767679,19.351983,114.819776,127.827828,120.435373,...,0,0,0,0,0,0,0,1,0,0
4,Demolition,151.883646,191.710197,211.569244,52.465332,59.441844,52.304349,141.514462,171.079581,181.960612,...,0,0,0,0,1,1,0,0,0,0


In [5]:

# Same multi-label one-hot encoding as for the training frame: split each cell on ',',
# encode every label, then merge the labels of a row back together with a max.
split_geography_type = test_df['geography_type'].str.split(',', expand=True)
one_hot_encoded_geo = pd.get_dummies(split_geography_type.stack(), prefix='geography_type', dtype=int).groupby(level=0).max()

split_urban_type = test_df['urban_type'].str.split(',', expand=True)
one_hot_encoded_urb = pd.get_dummies(split_urban_type.stack(), prefix='urban_type', dtype=int).groupby(level=0).max()

# The models are fitted on train_df, so the test frame must expose exactly the same
# indicator columns. Reindexing on the columns already present in train_df
#   * adds (as all-zero) any label that only occurs in the training data,
#   * drops any label that only occurs in the test data,
#   * drops the '_A' / '_N' indicators, which were already removed from train_df.
train_geo_columns = [col for col in train_df.columns if col.startswith('geography_type_')]
train_urb_columns = [col for col in train_df.columns if col.startswith('urban_type_')]
assert train_geo_columns and train_urb_columns, "train_df must be one-hot encoded before test_df"

one_hot_encoded_geo = one_hot_encoded_geo.reindex(columns=train_geo_columns, fill_value=0)
one_hot_encoded_urb = one_hot_encoded_urb.reindex(columns=train_urb_columns, fill_value=0)

# Append the indicator columns and retire the two raw text columns
test_df = pd.concat([test_df, one_hot_encoded_geo, one_hot_encoded_urb], axis=1)
test_df = test_df.drop(columns=['geography_type', 'urban_type'])



##### Ordering the dates

First, we create a data frame containing the date-related columns.

In [6]:
# Empty containers filled by the following cells
date_df = pd.DataFrame()
date_status_dft = pd.DataFrame()

# Column groups that are re-ordered together in the next cells
date_columns = [f'date{k}' for k in range(5)]
change_status_columns = [f'change_status_date{k}' for k in range(5)]
mean_color_columns = train_df.columns[train_df.columns.str.contains('_mean_date', regex=False)].tolist()
std_color_columns = train_df.columns[train_df.columns.str.contains('_std_date', regex=False)].tolist()

# Parse the day-month-year strings into timestamps
date_format = "%d-%m-%Y"
train_df[date_columns] = train_df[date_columns].apply(pd.to_datetime, format=date_format)

# Training view restricted to the date-related columns
dft = train_df[date_columns + change_status_columns + mean_color_columns + std_color_columns]

dft

Unnamed: 0,date0,date1,date2,date3,date4,change_status_date0,change_status_date1,change_status_date2,change_status_date3,change_status_date4,...,img_blue_std_date2,img_red_std_date3,img_green_std_date3,img_blue_std_date3,img_red_std_date4,img_green_std_date4,img_blue_std_date4,img_red_std_date5,img_green_std_date5,img_blue_std_date5
0,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24,Construction Done,Greenland,Construction Started,Construction Done,Construction Midway,...,25.008032,55.745311,47.576383,42.723218,39.819949,30.864230,28.189604,33.813160,33.064014,34.818012
1,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24,Construction Midway,Greenland,Land Cleared,Construction Done,Construction Midway,...,20.271657,42.130924,38.138137,35.142246,37.129531,28.089549,25.901238,30.670052,33.258905,36.139281
2,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24,Construction Done,Greenland,Land Cleared,Construction Done,Land Cleared,...,22.505835,58.434034,48.106798,43.902810,42.510625,33.189102,30.522871,34.436045,33.546000,35.545531
3,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24,Construction Midway,Greenland,Construction Started,Construction Done,Construction Midway,...,23.901639,51.046935,44.444291,40.558345,39.111827,30.762028,28.337671,31.285068,32.613142,36.080342
4,2018-08-01,2013-12-09,2016-09-10,2019-07-22,2017-07-24,Prior Construction,Prior Construction,Prior Construction,Land Cleared,Prior Construction,...,41.600845,32.054249,40.833531,48.325621,27.578335,28.158172,27.306766,40.627077,37.989182,37.177376
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296141,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28,Greenland,Construction Done,Land Cleared,Construction Done,Land Cleared,...,18.072115,22.149322,21.775425,21.220923,31.735119,27.878950,22.753705,39.430303,31.233005,32.695174
296142,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28,Greenland,Construction Done,Greenland,Construction Done,Land Cleared,...,21.063869,36.244103,44.609578,49.829084,52.820758,56.492413,58.892309,57.208272,55.111372,57.120977
296143,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28,Greenland,Construction Done,Greenland,Construction Done,Greenland,...,11.146906,16.453913,13.275921,11.146906,13.538151,11.347441,11.297618,27.654293,26.528809,25.034740
296144,2014-11-19,2017-02-25,2014-01-27,2018-03-28,2015-12-28,Land Cleared,Land Cleared,Land Cleared,Construction Midway,Land Cleared,...,22.268071,29.719189,26.584532,22.268071,23.167710,22.728428,22.132447,58.078044,54.960466,53.635703


In [7]:
# Empty containers filled by the following cells
date_df = pd.DataFrame()
date_status_df = pd.DataFrame()

# Column groups that are re-ordered together in the next cells
date_columns = ['date0', 'date1', 'date2', 'date3', 'date4']
change_status_columns = ['change_status_' + name for name in date_columns]
mean_color_columns = [name for name in test_df.columns if '_mean_date' in name]
std_color_columns = [name for name in test_df.columns if '_std_date' in name]

# Parse the day-month-year strings; values that do not parse become NaT
date_format = "%d-%m-%Y"
parsed_dates = {
    name: pd.to_datetime(test_df[name], format=date_format, errors='coerce')
    for name in date_columns
}
for name, parsed in parsed_dates.items():
    test_df[name] = parsed

# Test view restricted to the date-related columns
df = test_df[date_columns + change_status_columns + mean_color_columns + std_color_columns]


Index(['date0', 'date1', 'date2', 'date3', 'date4', 'change_status_date0',
       'change_status_date1', 'change_status_date2', 'change_status_date3',
       'change_status_date4', 'img_red_mean_date1', 'img_green_mean_date1',
       'img_blue_mean_date1', 'img_red_mean_date2', 'img_green_mean_date2',
       'img_blue_mean_date2', 'img_red_mean_date3', 'img_green_mean_date3',
       'img_blue_mean_date3', 'img_red_mean_date4', 'img_green_mean_date4',
       'img_blue_mean_date4', 'img_red_mean_date5', 'img_green_mean_date5',
       'img_blue_mean_date5', 'img_red_std_date1', 'img_green_std_date1',
       'img_blue_std_date1', 'img_red_std_date2', 'img_green_std_date2',
       'img_blue_std_date2', 'img_red_std_date3', 'img_green_std_date3',
       'img_blue_std_date3', 'img_red_std_date4', 'img_green_std_date4',
       'img_blue_std_date4', 'img_red_std_date5', 'img_green_std_date5',
       'img_blue_std_date5'],
      dtype='object')


In [8]:
# For every acquisition slot, pack the date together with everything observed at that
# date (status + colour statistics) into one tuple per row, so that sorting by date
# later moves all of these values at once.
# The dates are numbered date0..date4 whereas the image statistics are numbered
# date1..date5: slot i is therefore paired with image i+1.
n_dates = len(date_columns)
for i in range(n_dates):
    image_no = str(i + 1)
    bundle = [dft[dft.columns[i]].dt.date, dft[dft.columns[i + n_dates]]]
    # Order inside the tuple: mean red/blue/green, then std red/blue/green
    bundle += [
        dft['img_' + channel + '_' + stat + '_date' + image_no]
        for stat in ('mean', 'std')
        for channel in ('red', 'blue', 'green')
    ]
    date_status_dft[f'date_status_{i}'] = tuple(zip(*bundle))
def custom_sort(row):
    """Return the tuples of `row` ordered by their first element (a date).

    Tuples whose first element is missing (NaN/NaT) are keyed with the largest
    date pandas can represent, which sends them to the end of the result.
    """
    far_future = pd.Timestamp.max.date()

    def _date_key(entry):
        first = entry[0]
        return (far_future if pd.isna(first) else first,)

    return sorted(row, key=_date_key)

# Sort the bundles of every row chronologically (bundles without a date go last)
sorted_bundles_train = date_status_dft[date_status_dft.columns].apply(custom_sort, axis=1, result_type='expand')
date_status_dft[date_status_dft.columns] = sorted_bundles_train

# Unpack every sorted bundle back into plain columns; the position inside the tuple
# determines which column it feeds.
unpacked_name_templates = [
    '{d}',
    'change_status_{d}',
    'img_red_mean_{d}',
    'img_blue_mean_{d}',
    'img_green_mean_{d}',
    'img_red_std_{d}',
    'img_blue_std_{d}',
    'img_green_std_{d}',
]
for i, date_col in enumerate(date_columns):
    for position, template in enumerate(unpacked_name_templates):
        date_status_dft[template.format(d=date_col)] = date_status_dft[f'date_status_{i}'].str[position]

# The packed tuples are no longer needed once unpacked
date_status_dft = date_status_dft.drop(columns=[f'date_status_{i}' for i in range(len(date_columns))])

# Replace the original (unordered) columns of train_df by their chronologically ordered version
replaced_train_columns = date_columns + change_status_columns + mean_color_columns + std_color_columns
train_df = pd.concat([train_df.drop(columns=replaced_train_columns), date_status_dft], axis=1)

In [9]:
# Pack, for every acquisition slot of the test frame, the date with its status and
# colour statistics into one tuple per row. Dates are numbered date0..date4 while the
# image statistics are numbered date1..date5, hence the i+1 in the image column names.
for i in range(len(date_columns)):
    date_col, status_col = df.columns[i], df.columns[i + 5]
    image_tag = f'date{i + 1}'
    packed_fields = (
        df[date_col].dt.date,
        df[status_col],
        df[f'img_red_mean_{image_tag}'],
        df[f'img_blue_mean_{image_tag}'],
        df[f'img_green_mean_{image_tag}'],
        df[f'img_red_std_{image_tag}'],
        df[f'img_blue_std_{image_tag}'],
        df[f'img_green_std_{image_tag}'],
    )
    date_status_df[f'date_status_{i}'] = tuple(zip(*packed_fields))

def custom_sort(row):
    """Order the tuples of `row` by the date stored in their first position.

    A tuple with a missing date (NaN/NaT) is treated as if it carried the largest
    date pandas can represent, so it ends up after every dated tuple.
    """
    ordered = list(row)
    ordered.sort(key=lambda entry: (pd.Timestamp.max.date() if pd.isna(entry[0]) else entry[0],))
    return ordered


# Sort the bundles of every test row chronologically (bundles without a date go last)
bundle_columns_test = date_status_df.columns
date_status_df[bundle_columns_test] = date_status_df[bundle_columns_test].apply(custom_sort, axis=1, result_type='expand')

# Unpack each sorted bundle: position 0 is the date, 1 the status, 2-4 the colour
# means (red, blue, green) and 5-7 the colour standard deviations (red, blue, green).
for i, date_col in enumerate(date_columns):
    unpacked_names = [date_col, f'change_status_{date_col}'] + [
        f'img_{channel}_{stat}_{date_col}'
        for stat in ('mean', 'std')
        for channel in ('red', 'blue', 'green')
    ]
    for position, unpacked_name in enumerate(unpacked_names):
        date_status_df[unpacked_name] = date_status_df[f'date_status_{i}'].str[position]

# The packed tuples are no longer needed once unpacked
date_status_df = date_status_df.drop(columns=[f'date_status_{i}' for i in range(len(date_columns))])

# Replace the original (unordered) columns of test_df by their chronologically ordered version
test_df = test_df.drop(columns=date_columns + change_status_columns + mean_color_columns + std_color_columns)
test_df = pd.concat([test_df, date_status_df], axis=1)

##### Creating new temporal features

We start by introducing the time elapsed between two consecutive dates (the difference in days,
divided by 365 so that it is expressed in years), acknowledging the construction pace variability
across different geographic zones.

In [10]:
# Time elapsed between consecutive (already sorted) dates of the training frame.
# NOTE: despite the 'days_between_*' column names, the value is a number of days
# divided by 365, i.e. it is expressed in years.
date_dft = pd.DataFrame()

for i in range(len(date_columns) - 1):
    interval_col = f'days_between_{i}_{i+1}'
    try:
        later = pd.to_datetime(train_df[date_columns[i + 1]], format=date_format, errors='coerce')
        earlier = pd.to_datetime(train_df[date_columns[i]], format=date_format, errors='coerce')
        date_dft[interval_col] = (later - earlier).dt.days / 365
    except Exception:
        # Best effort: keep the column (empty) so the feature set stays the same.
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit still propagate.
        date_dft[interval_col] = None



In [11]:
# Time elapsed between consecutive (already sorted) dates of the test frame, stored in date_df.
# NOTE: despite the 'days_between_*' column names, the value is a number of days
# divided by 365, i.e. it is expressed in years.
for i in range(len(date_columns) - 1):
    interval_col = f'days_between_{i}_{i+1}'
    try:
        later = pd.to_datetime(test_df[date_columns[i + 1]], format=date_format, errors='coerce')
        earlier = pd.to_datetime(test_df[date_columns[i]], format=date_format, errors='coerce')
        date_df[interval_col] = (later - earlier).dt.days / 365
    except Exception:
        # Best effort: keep the column (empty) so the feature set stays the same.
        # Only ordinary errors are swallowed; KeyboardInterrupt / SystemExit still propagate.
        date_df[interval_col] = None



We also computed, for each data point, the time elapsed between `date0` and a fixed reference date (1 January 2000), expressed in years.

In [13]:
# t0 = time between the reference date (1 January 2000) and 'date0', in years (days / 365).
# The reference series shares train_df's index so that the subtraction aligns row by row.
date_ref = pd.to_datetime(pd.Series(['01-01-2000'] * len(train_df), index=train_df.index), format='%d-%m-%Y')
try:
    train_df['t0'] = (pd.to_datetime(train_df['date0'], format=date_format, errors='coerce') - date_ref).dt.days / 365
except Exception:
    # Best effort: keep the column (empty). Only ordinary errors are swallowed;
    # KeyboardInterrupt / SystemExit still propagate.
    train_df['t0'] = None

In [14]:
# t0 = time between the reference date (1 January 2000) and 'date0', in years (days / 365).
# The reference series must be built from test_df (length and index): sizing it with
# train_df leaves NaN in 't0' for any test row beyond the length of the training frame.
date_ref = pd.to_datetime(pd.Series(['01-01-2000'] * len(test_df), index=test_df.index), format='%d-%m-%Y')
try:
    test_df['t0'] = (pd.to_datetime(test_df['date0'], format=date_format, errors='coerce') - date_ref).dt.days / 365
except Exception:
    # Best effort: keep the column (empty). Only ordinary errors are swallowed;
    # KeyboardInterrupt / SystemExit still propagate.
    test_df['t0'] = None


0    14.194521
1    14.194521
2    14.194521
3    14.194521
4    14.194521
5    14.194521
6    14.194521
7    14.194521
8    14.194521
9    14.194521
Name: t0, dtype: float64

Accordingly, we decided to drop the dates columns as they no longer retained significance for
our analysis.

In [12]:
# Swap the raw date columns of the training frame for the interval features
train_df = pd.concat(
    [train_df.drop(columns=['date' + str(i) for i in range(5)]), date_dft],
    axis=1,
)

In [13]:
# Swap the raw date columns of the test frame for the interval features
obsolete_date_columns = [f'date{k}' for k in range(5)]
test_df = test_df.drop(columns=obsolete_date_columns)
test_df = pd.concat([test_df, date_df], axis=1)

##### Introducing geometric features

In [14]:
from shapely.geometry import LineString
from shapely.geometry import Polygon , MultiPolygon
from shapely.wkt import loads

# Function to calculate the geometric features

def calculate_geometry(geometry):
    """Derive simple shape descriptors from a geometry given as WKT text.

    Parameters
    ----------
    geometry : str
        WKT representation of the geometry (the content of the 'geometry' column).

    Returns
    -------
    tuple
        (area, perimeter, num_edges, aspect_ratio, ratioAP, width, height), where
        * area, perimeter: area and outline length of the geometry,
        * num_edges: number of exterior coordinates minus one,
        * aspect_ratio: outline length of the minimum rotated rectangle divided by the
          outline length of the convex hull (not a ratio of diagonals),
        * ratioAP: area / perimeter**2,
        * width, height: extent of the bounding box along x and y.
        All seven entries are None when the text cannot be parsed (e.g. a missing
        value) or when any descriptor cannot be computed (e.g. geometry without an
        exterior ring, zero-length outline).
    """
    try:
        # Parsing is part of the guarded region: one bad cell must not abort the whole apply()
        polygon = loads(geometry)
        area = polygon.area
        perimeter = polygon.length
        # -1 presumably because the closing vertex of the ring repeats the first one
        num_edges = len(polygon.exterior.coords) - 1
        rectangle_outline = LineString(MultiPolygon([polygon]).minimum_rotated_rectangle.exterior.coords)
        hull_outline = LineString(MultiPolygon([polygon]).convex_hull.exterior.coords)
        aspect_ratio = rectangle_outline.length / hull_outline.length
        ratioAP = area / perimeter**2
        min_x, min_y, max_x, max_y = polygon.bounds
        width = max_x - min_x
        height = max_y - min_y
    except Exception:
        # Any failure invalidates the whole feature set of this row. Only ordinary
        # errors are swallowed; KeyboardInterrupt / SystemExit still propagate.
        return (None,) * 7
    return area, perimeter, num_edges, aspect_ratio, ratioAP, width, height

# Compute the geometric descriptors row by row and store them as new columns
geometry_feature_names = ['area', 'perimeter', 'edges', 'aspect_ratio', 'ratioAP', 'width', 'height']
train_df[geometry_feature_names] = train_df['geometry'].apply(lambda wkt: pd.Series(calculate_geometry(wkt)))
test_df[geometry_feature_names] = test_df['geometry'].apply(lambda wkt: pd.Series(calculate_geometry(wkt)))


In [15]:
# The WKT text is no longer needed once the geometric descriptors exist
train_df = train_df.drop(columns='geometry')
test_df = test_df.drop(columns='geometry')

##### Converting change status columns from strings to numerical values using Label Encoding

We ordered the change status values in a coherent progression, so that the integer codes assigned below preserve that order.

In [16]:
# Position in this list = integer code given to the status
changestatus_values = ['Land Cleared','Greenland','Excavation','Materials Introduced'  ,'Materials Dumped','Prior Construction','Construction Started','Construction Midway','Construction Done','Operational']
encoding_change_status_order = {status: code for code, status in enumerate(changestatus_values)}

# Statuses absent from the mapping become NaN
for status_column in ['change_status_date' + str(i) for i in range(5)]:
    train_df[status_column] = train_df[status_column].map(encoding_change_status_order)


In [17]:
# Same status -> integer mapping as for the training frame (code = position in the list)
changestatus_values = ['Land Cleared','Greenland','Excavation','Materials Introduced'  ,'Materials Dumped','Prior Construction','Construction Started','Construction Midway','Construction Done','Operational']
encoding_change_status_order = dict(zip(changestatus_values, range(len(changestatus_values))))

# Statuses absent from the mapping become NaN
for k in range(5):
    status_column = f'change_status_date{k}'
    test_df[status_column] = test_df[status_column].map(encoding_change_status_order)


In [36]:
# List the columns of the engineered training frame
train_df.columns

Index(['geography_type_Barren Land', 'geography_type_Coastal',
       'geography_type_Dense Forest', 'geography_type_Desert',
       'geography_type_Farms', 'geography_type_Grass Land',
       'geography_type_Hills', 'geography_type_Lakes', 'geography_type_River',
       'geography_type_Snow', 'geography_type_Sparse Forest',
       'urban_type_Dense Urban', 'urban_type_Industrial', 'urban_type_Rural',
       'urban_type_Sparse Urban', 'urban_type_Urban Slum',
       'change_status_date0', 'img_red_mean_date0', 'img_blue_mean_date0',
       'img_green_mean_date0', 'img_red_std_date0', 'img_blue_std_date0',
       'img_green_std_date0', 'change_status_date1', 'img_red_mean_date1',
       'img_blue_mean_date1', 'img_green_mean_date1', 'img_red_std_date1',
       'img_blue_std_date1', 'img_green_std_date1', 'change_status_date2',
       'img_red_mean_date2', 'img_blue_mean_date2', 'img_green_mean_date2',
       'img_red_std_date2', 'img_blue_std_date2', 'img_green_std_date2',
       'chang

In [18]:
# The 'index' column is only a row identifier, so it is removed from both frames
train_df, test_df = (frame.drop(columns='index') for frame in (train_df, test_df))

In [19]:
# Integer code of every target class
change_type_map = {
    'Demolition': 0,
    'Road': 1,
    'Residential': 2,
    'Commercial': 3,
    'Industrial': 4,
    'Mega Projects': 5,
}

# Target vector; a label missing from the mapping raises a KeyError
y = train_df['change_type'].apply(change_type_map.__getitem__)

# Feature matrix = training frame without the target (X and train_df are the same object)
train_df = train_df.drop(columns='change_type')
X = train_df

##### Handling missing values by imputing the corresponding mean

In [20]:
# Column means are estimated on the training features only and reused for the test frame
means_train = X.mean()
X, test_df = (frame.fillna(means_train) for frame in (train_df, test_df))

# II. Model tuning and Comparison

- ### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier fitted on the imputed training features
# NOTE(review): the features are not standardised here, unlike in the logistic
# regression section -- confirm this is intended for a distance-based model.
knn = KNeighborsClassifier(n_neighbors=5).fit(X, y)

# Predicted class codes for the test frame
predictions = knn.predict(test_df)


In [None]:
# Submission file: one row per test sample, row number written as 'Id'
pred_df = pd.DataFrame({'change_type': predictions})
pred_df.to_csv("KNN_submissions.csv", index_label='Id', index=True)

- ### Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest with a fixed seed so that the fit is reproducible
rf_params = dict(n_estimators=205, max_depth=45, random_state=42)
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X, y)

  return fit_method(estimator, *args, **kwargs)


In [None]:
# Predict the test frame with the random forest and write the submission file
X_test = test_df
y_pred = rf_classifier.predict(X_test)
pred_df = pd.Series(y_pred, name='change_type').to_frame()
pred_df.to_csv("random_forest_submissions.csv", index_label='Id', index=True)

- ### XGBoost

In [11]:
from xgboost import XGBClassifier

# Gradient-boosted trees: 4500 boosting rounds with a learning rate of 0.1
xgb_params = dict(n_estimators=4500, learning_rate=0.1)
xgb_classifier = XGBClassifier(**xgb_params)
xgb_classifier.fit(X, y)


In [None]:
# Predict the test frame with XGBoost and write the submission file
X_test = test_df
y_pred = xgb_classifier.predict(X_test)
pred_df = pd.DataFrame({'change_type': y_pred})
pred_df.to_csv("XGB_submissions.csv", index_label='Id', index=True)

##### Evaluation using f1_macro and accuracy metrics

In [12]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated macro-averaged F1 of the XGBoost configuration
scores = cross_val_score(xgb_classifier, X, y, scoring='f1_macro', cv=5)
mean_score = scores.mean()
print("Cross-Validation Scores:", scores)
print("Mean F1 Score:", mean_score)

Cross-Validation Scores: [0.54171872 0.62077698 0.53274578 0.60747269 0.63727228]
Mean F1 Score: 0.5879972928169523


In [None]:
# 5-fold cross-validated accuracy of the XGBoost configuration.
# The per-fold values printed are the accuracy scores computed here (scores2),
# not the F1 scores of the previous cell.
scores2 = cross_val_score(xgb_classifier, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores2)
mean_score2 = scores2.mean()
print("Accuracy Score:", mean_score2)

- ### Logistic Regression

In [21]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Standardise the features: the scaler is fitted on the training data only and
# the same transformation is then applied to the test data
scaler = StandardScaler().fit(X)
X_train_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(test_df)

# L2-regularised logistic regression; max_iter is raised to 2000 iterations
logreg_model = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='lbfgs',
    max_iter=2000,
    random_state=42,
)

# Train on the standardised features
logreg_model.fit(X_train_scaled, y)


  y = column_or_1d(y, warn=True)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The logistic regression was trained on standardised features, so it must be
# cross-validated the same way. Wrapping scaler + model in a pipeline re-fits the
# scaler on the training part of every fold (no leakage from the validation part).
# cross_val_score works on clones, so the already fitted logreg_model is left untouched.
logreg_cv_model = make_pipeline(StandardScaler(), logreg_model)

scores = cross_val_score(logreg_cv_model, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Scores:", scores)
mean_score = scores.mean()
print("Mean accuracy:", mean_score)
scores2 = cross_val_score(logreg_cv_model, X, y, cv=5, scoring='f1_macro')
print("Cross-Validation Scores:", scores2)
mean_score2 = scores2.mean()
print("Mean F1 Score:", mean_score2)

In [22]:
# Predict the standardised test features with the logistic regression and write the submission file
y_pred = logreg_model.predict(X_test_scaled)
pred_df = pd.DataFrame({'change_type': y_pred})
pred_df.to_csv("logreg_sumbmissions.csv", index_label='Id', index=True)