In [1]:
# Install necessary packages if not already installed
!pip install pandas numpy scikit-learn lightgbm matplotlib seaborn

# Basic imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb


Collecting lightgbm
  Using cached lightgbm-4.6.0-py3-none-win_amd64.whl.metadata (17 kB)
Using cached lightgbm-4.6.0-py3-none-win_amd64.whl (1.5 MB)
Installing collected packages: lightgbm
Successfully installed lightgbm-4.6.0


In [2]:
# Read the two CSV files from the working directory
# (edit the file names if your downloaded copies live elsewhere).
train = pd.read_csv(filepath_or_buffer='Train.csv')
test = pd.read_csv(filepath_or_buffer='Test.csv')

# Report (rows, columns) for each frame, then preview the first training rows.
for label, frame in (("Train shape:", train), ("Test shape:", test)):
    print(label, frame.shape)
train.head()

Train shape: (21454, 19)
Test shape: (7194, 18)


Unnamed: 0,ID,country,year,urban_or_rural,ghsl_water_surface,ghsl_built_pre_1975,ghsl_built_1975_to_1990,ghsl_built_1990_to_2000,ghsl_built_2000_to_2014,ghsl_not_built_up,ghsl_pop_density,landcover_crops_fraction,landcover_urban_fraction,landcover_water_permanent_10km_fraction,landcover_water_seasonal_10km_fraction,nighttime_lights,dist_to_capital,dist_to_shoreline,Target
0,ID_AAIethGy,Ethiopia,2016,R,0.0,0.0,0.0,5.5e-05,0.000536,0.999408,12.146134,25.489659,0.879484,0.0,0.0,0.0,278.788451,769.338378,0.132783
1,ID_AAYiaCeL,Ethiopia,2005,R,0.0,0.0,0.00011,0.0,1.8e-05,0.999872,113.806716,64.136053,0.601427,0.0,0.005427,0.0,200.986978,337.135243,0.004898
2,ID_AAdurmKj,Mozambique,2009,R,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.400096,0.1319,0.0,0.003078,0.0,642.594208,169.913773,0.09732
3,ID_AAgNHles,Malawi,2015,R,0.0,0.000141,0.000181,0.000254,0.000228,0.999195,5.21332,25.379371,2.017136,11.293841,0.131035,0.0,365.349451,613.59161,0.304107
4,ID_AAishfND,Guinea,2012,U,0.0,0.011649,0.01756,0.017383,0.099875,0.853533,31.734661,5.08162,22.815984,0.005047,0.130475,1.461894,222.867189,192.926363,0.605328


In [4]:
# Count the NaN entries in every training column
print(train.isna().sum())

# Replace any NaN with the sentinel -999, modifying both frames in place
for frame in (train, test):
    frame.fillna(-999, inplace=True)



ID                                         0
country                                    0
year                                       0
urban_or_rural                             0
ghsl_water_surface                         0
ghsl_built_pre_1975                        0
ghsl_built_1975_to_1990                    0
ghsl_built_1990_to_2000                    0
ghsl_built_2000_to_2014                    0
ghsl_not_built_up                          0
ghsl_pop_density                           0
landcover_crops_fraction                   0
landcover_urban_fraction                   0
landcover_water_permanent_10km_fraction    0
landcover_water_seasonal_10km_fraction     0
nighttime_lights                           0
dist_to_capital                            0
dist_to_shoreline                          0
Target                                     0
dtype: int64


In [5]:
# List every column label of the training frame
train_columns = train.columns
print(train_columns)

Index(['ID', 'country', 'year', 'urban_or_rural', 'ghsl_water_surface',
       'ghsl_built_pre_1975', 'ghsl_built_1975_to_1990',
       'ghsl_built_1990_to_2000', 'ghsl_built_2000_to_2014',
       'ghsl_not_built_up', 'ghsl_pop_density', 'landcover_crops_fraction',
       'landcover_urban_fraction', 'landcover_water_permanent_10km_fraction',
       'landcover_water_seasonal_10km_fraction', 'nighttime_lights',
       'dist_to_capital', 'dist_to_shoreline', 'Target'],
      dtype='object')


In [6]:
# String-valued columns that are replaced by their frequency before modelling.
# Kept identical to the list used by the frequency-encoding cell so that the
# two definitions cannot drift apart; extend it if your dataset has more
# categorical columns.
categorical_columns = ['country', 'urban_or_rural']

In [7]:
# Show the dtype pandas holds for each training column
train_dtypes = train.dtypes
print(train_dtypes)

ID                                          object
country                                    float64
year                                         int64
urban_or_rural                              object
ghsl_water_surface                         float64
ghsl_built_pre_1975                        float64
ghsl_built_1975_to_1990                    float64
ghsl_built_1990_to_2000                    float64
ghsl_built_2000_to_2014                    float64
ghsl_not_built_up                          float64
ghsl_pop_density                           float64
landcover_crops_fraction                   float64
landcover_urban_fraction                   float64
landcover_water_permanent_10km_fraction    float64
landcover_water_seasonal_10km_fraction     float64
nighttime_lights                           float64
dist_to_capital                            float64
dist_to_shoreline                          float64
Target                                     float64
dtype: object


In [9]:
# Cast 'country' to str in both frames so it can be frequency encoded.
# NOTE: run this on freshly loaded data only - once the encoding step below has
# replaced 'country' with floats, re-running this cast would stringify those
# floats instead of the original labels.
for frame in (train, test):
    frame['country'] = frame['country'].astype(str)

# Frequency encoding
def frequency_encoding(df, column, freq=None):
    """Replace the values of ``df[column]`` with their relative frequency.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. ``df[column]`` is overwritten in place and the same
        frame object is returned.
    column : str
        Name of the column to encode.
    freq : pandas.Series or dict, optional
        Pre-computed ``category -> frequency`` mapping, e.g. the shares
        learned on the training frame, to be applied to another frame so both
        use the same encoding. Values of ``df[column]`` that are missing from
        the mapping are encoded as ``0.0``. When omitted, the frequencies are
        computed from ``df`` itself (count of each value divided by
        ``len(df)``), which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with ``column`` replaced by floats.
    """
    if freq is None:
        # Self-fit: every non-null value is present in the mapping by construction.
        freq = df[column].value_counts() / len(df)
        df[column] = df[column].map(freq)
    else:
        # External mapping: unseen categories would become NaN, so give them
        # the frequency they actually had in the fitting data, i.e. zero.
        df[column] = df[column].map(freq).fillna(0.0)
    return df

# Columns whose string labels are replaced by their frequency.
categorical_columns = ['country', 'urban_or_rural']
for col in categorical_columns:
    # Learn each category's share from the TRAINING rows only, and do it before
    # the training column is overwritten by the encoding call below.
    train_freq = train[col].value_counts() / len(train)

    # Encode the test set with the training shares. Computing the shares on the
    # test set itself would give the same category a different number in train
    # and test, so the model would see shifted feature values at prediction
    # time. Categories absent from the training data get 0.0.
    test[col] = test[col].map(train_freq).fillna(0.0)

    train = frequency_encoding(train, col)

In [12]:
target = 'Target'  # name of the column to predict

# Use every column as a feature except the target itself and 'ID'. 'ID' is a
# per-row identifier string (dtype object), not a predictor, and the later
# cells of this notebook exclude it as well.
features = [col for col in train.columns if col not in ('ID', target)]

X = train[features]
y = train[target]

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
# Model inputs are all columns other than the row identifier and the target
excluded = {'ID', 'Target'}
features = [name for name in train.columns if name not in excluded]
X, y = train[features], train['Target']

In [23]:
# Run the cross-validated hyper-parameter search on the full training data.
# NOTE: `grid` is created by the GridSearchCV cell (In [22]), which is stored
# further down in this file - run that cell before this one.
grid.fit(X=X, y=y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3612
[LightGBM] [Info] Number of data points in the train set: 21454, number of used features: 17
[LightGBM] [Info] Start training from score 0.350736


0,1,2
,estimator,LGBMRegressor(random_state=42)
,param_grid,"{'learning_rate': [0.01, 0.05], 'max_depth': [-1, 10], 'n_estimators': [100, 300], 'num_leaves': [31, 50]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,3
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [26]:
# Feature list: every column except the identifier and the target
non_feature_columns = ('ID', 'Target')
features = [name for name in train.columns if name not in non_feature_columns]

# Feature matrix and target vector
X = train[features]
y = train['Target']

# 80/20 train/validation split with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
from sklearn.base import clone

# Fit an unfitted copy of the winning configuration on the training split.
# With refit=True, `grid.best_estimator_` holds the model GridSearchCV trained
# on all of X, y; calling .fit() on that object directly would overwrite it in
# place, so `grid` would no longer carry the full-data model.
model = clone(grid.best_estimator_)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001552 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3612
[LightGBM] [Info] Number of data points in the train set: 17163, number of used features: 17
[LightGBM] [Info] Start training from score 0.350549


0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [28]:
# Confirm the dtype of each column in the feature matrix
feature_dtypes = X.dtypes
print(feature_dtypes)

country                                    float64
year                                         int64
urban_or_rural                             float64
ghsl_water_surface                         float64
ghsl_built_pre_1975                        float64
ghsl_built_1975_to_1990                    float64
ghsl_built_1990_to_2000                    float64
ghsl_built_2000_to_2014                    float64
ghsl_not_built_up                          float64
ghsl_pop_density                           float64
landcover_crops_fraction                   float64
landcover_urban_fraction                   float64
landcover_water_permanent_10km_fraction    float64
landcover_water_seasonal_10km_fraction     float64
nighttime_lights                           float64
dist_to_capital                            float64
dist_to_shoreline                          float64
dtype: object


In [33]:
from sklearn.base import clone

# Take an unfitted copy of the best configuration found by the grid search.
# Cloning leaves `grid.best_estimator_` (trained on all of X, y when
# refit=True) untouched; fitting that object directly would overwrite it.
model = clone(grid.best_estimator_)

# Train on the training split only, so X_val / y_val stay unseen for validation
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3612
[LightGBM] [Info] Number of data points in the train set: 17163, number of used features: 17
[LightGBM] [Info] Start training from score 0.350549


0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [32]:
# Predict the held-out validation rows with the model fitted on the training
# split. `y_pred` was not defined by any cell in this notebook, so it is
# computed here to keep the cell runnable on a fresh kernel.
y_pred = model.predict(X_val)

# Root-mean-squared error between the validation targets and the predictions
# (np and mean_squared_error come from the import cell at the top).
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Validation RMSE: {rmse}")

Validation RMSE: 0.08769179759242757


In [22]:
from sklearn.model_selection import GridSearchCV
from lightgbm import LGBMRegressor

# Base LightGBM regressor; the fixed seed keeps results repeatable
estimator = LGBMRegressor(random_state=42)

# Candidate values per hyper-parameter (2 x 2 x 2 x 2 = 16 combinations)
param_grid = dict(
    num_leaves=[31, 50],
    learning_rate=[0.01, 0.05],
    n_estimators=[100, 300],
    max_depth=[-1, 10],
)

# Exhaustive search: 3-fold CV scored by negative RMSE, using all CPU cores
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

In [30]:
from lightgbm import LGBMRegressor

# Build a fresh regressor from the winning hyper-parameters (same seed as the
# search) and train it on the training split
model = LGBMRegressor(random_state=42, **grid.best_params_)
model.fit(X=X_train, y=y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002196 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3612
[LightGBM] [Info] Number of data points in the train set: 17163, number of used features: 17
[LightGBM] [Info] Start training from score 0.350549


0,1,2
,boosting_type,'gbdt'
,num_leaves,50
,max_depth,-1
,learning_rate,0.05
,n_estimators,300
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [34]:
# Select the test columns in the same order as the training feature list
test_features = test[features]

# Score the test rows with the current `model`
test_predictions = model.predict(test_features)

# Two-column submission frame: identifier plus predicted target.
# NOTE(review): the header is written as lowercase 'id' although the source
# column is 'ID' - confirm against the required submission format.
submission = (
    test[['ID']]
    .rename(columns={'ID': 'id'})
    .assign(Target=test_predictions)
)

# Write the CSV without the DataFrame index
submission.to_csv('submission.csv', index=False)
print("Submission file saved as 'submission.csv'")

Submission file saved as 'submission.csv'


In [35]:
import joblib

# Persist the fitted model to disk; the call returns the list of files written
joblib.dump(value=model, filename='model.pkl')

['model.pkl']