<a href="https://colab.research.google.com/github/DariusTheGeek/3rd_place_solution_for_the__UmojaHack-3-Hotspots__zindi_hackathon/blob/master/3rd_place_solution_for_the__UmojaHack_3_Hotspots__zindi_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing the required library

In [12]:
# Installing catboost
!pip install catboost



### Loading libraries

In [0]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
from catboost import CatBoostRegressor, CatBoostClassifier
import re
from fastai.tabular import *
import warnings
warnings.filterwarnings('ignore')

### Mounting Google Drive in Colab

In [14]:
# Mounting Google Drive at /content/drive; the CSV files loaded later in this
# notebook are read from paths underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Loading data

In [0]:
# Loading the competition files from the mounted Drive folder.
# The train and test files are read the same way, with 'date' parsed as a datetime.
train, test = (
    pd.read_csv(csv_path, parse_dates=['date'])
    for csv_path in (
        '/content/drive/My Drive/Hacck/train.csv',
        '/content/drive/My Drive/Hacck/test.csv',
    )
)

# The sample submission is read as-is (no date parsing)
ss = pd.read_csv('/content/drive/My Drive/Hacck/SampleSubmission.csv')

### Feature engineering functions

In [0]:
# Function to compute a bearing-style angle feature from latitude and longitude coordinates
def bearing_array(lat, lng):
    """Return a bearing-style angle, in degrees, for each (lat, lng) pair.

    The expression is shaped like an initial-bearing formula, but it is
    evaluated on a single coordinate pair (the "delta" is ``lat - lng``), so
    the result is an engineered feature rather than a geographic bearing
    between two points. The arithmetic is kept as-is so the feature values do
    not change.

    Parameters
    ----------
    lat, lng : scalar or array-like supporting ``-`` and numpy ufuncs
        Coordinates in degrees.

    Returns
    -------
    Same shape as the broadcast inputs: ``degrees(arctan2(y, x))``, i.e. an
    angle in the range [-180, 180].
    """
    # The difference must be taken before lat/lng are rebound to radians below
    lng_delta_rad = np.radians(lat - lng)
    lat, lng = map(np.radians, (lat, lng))
    y = np.sin(lng_delta_rad) * np.cos(lat)
    x = np.cos(lat) * np.sin(lat) - np.sin(lng) * np.cos(lng) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

# Function to compute the absolute latitude/longitude difference used as a distance feature
def manhattan_distance(lat, lon):
    """Return the element-wise absolute difference ``|lat - lon|``."""
    return np.absolute(lat - lon)

# Function to add date features
def add_datepart(df, fldname, drop=True):
    """Expand the date column ``fldname`` of ``df`` into calendar features, in place.

    For every name in the attribute list below a new column
    ``<prefix><Name>`` is written, where ``<prefix>`` is ``fldname`` with a
    trailing ``date``/``Date`` removed. A ``<prefix>Elapsed`` column holding
    whole seconds since the Unix epoch is added as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated and nothing is returned.
    fldname : str
        Name of the date column. If it is not already a datetime column it is
        converted with ``pd.to_datetime`` and the converted values are stored
        back into ``df``.
    drop : bool, default True
        Remove ``fldname`` from ``df`` once the features have been added.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also accepts tz-aware and extension dtypes,
    # which np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
              'Is_year_end', 'Is_year_start'):
        attr = n.lower()
        if attr == 'week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week is gone in recent pandas; isocalendar().week is the
            # replacement. It comes back as a nullable integer column, so turn it
            # into a plain numpy array (float only when missing dates force NaN).
            week = fld.dt.isocalendar().week
            if week.isna().any():
                values = week.to_numpy(dtype='float64', na_value=np.nan)
            else:
                values = week.to_numpy(dtype=np.int64)
        else:
            values = getattr(fld.dt, attr)
        df[targ_pre + n] = values
    # The integer view of a datetime is in units of its resolution; normalise to
    # nanoseconds (when the accessor supports it) before converting to seconds.
    fld_ns = fld.dt.as_unit('ns') if hasattr(fld.dt, 'as_unit') else fld
    df[targ_pre + 'Elapsed'] = fld_ns.astype(np.int64) // 10**9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

### Combining training and test data for efficiency

In [0]:
# Keeping a handle on the target column before the frames are reshaped
target = train['burn_area']

# Tagging each row with its origin (0 = train, 1 = test) so the two sets can
# be told apart again after they are stacked
train = train.assign(separator=0)
test = test.assign(separator=1)

# Restricting both frames to the columns they have in common
train, test = train.align(test, join='inner', axis=1)

# Stacking train on top of test so features are generated once for both
comb = pd.concat([train, test])

### Feature Engineering

In [0]:
# Adding date features; the 'date' column is kept (drop=False) because the
# next call still takes it as input
add_datepart(comb, 'date', drop=False)

# Adding cyclic date features
# NOTE(review): add_cyclic_datepart is not defined in this notebook; presumably
# it is provided by the `from fastai.tabular import *` star import - confirm.
add_cyclic_datepart(comb, 'date')

# Adding the manhattan distance column (arguments now follow the function's
# (lat, lon) signature; the result is unchanged since |a - b| == |b - a|)
comb['manhat_dist'] = manhattan_distance(comb.lat.values, comb.lon.values)

# Adding the bearing-style angle column
comb['bearing_dist'] = bearing_array(comb.lat.values, comb.lon.values)

# Combining year and week into a single sortable number, e.g. 2006 and 9 -> 200609
comb['woy'] = comb.Year * 100 + comb.Week

### Separating training and test data from the combined dataframe

In [0]:
# Separating the train and test set from the combined dataframe and dropping
# the separator column, which has served its purpose.
# drop() without inplace returns a new frame, so nothing is mutated on a
# filtered slice of `comb` (the pattern behind SettingWithCopyWarning).
train = comb[comb.separator == 0].drop(columns='separator')
test = comb[comb.separator == 1].drop(columns='separator')

### Engineering a 'burnt' feature

In [0]:
# Creating a binary flag of whether a location was burnt or not:
# 0 where burn_area <= 0, 1 otherwise
burnt = np.where(train.burn_area <= 0, 0, 1)

# Separating predictor variables and dependent variable from other variables
X = train.drop(['ID', 'area', 'burn_area'], axis = 1)
y = burnt

# Selecting exactly the training feature columns, in training order, from the
# test set. Dropping only ID/area would leave burn_area in the test features
# although it is excluded from X.
tes = test[X.columns]

# Training a model to classify whether a region was burnt or not
catt = CatBoostClassifier(verbose = False)
catt.fit(X, y)

# Making predictions
predds = catt.predict(tes)

# Creating burnt columns: the training rows carry the flag derived from the
# observed burn_area, the test rows carry the classifier's prediction.
# assign() builds new frames instead of writing into a possible slice.
train = train.assign(burnt = burnt)
test = test.assign(burnt = predds)

### Previewing a sample of the engineered dataframe

In [21]:
# Previewing 10 randomly chosen rows of the training set
train.sample(n=10)

Unnamed: 0,ID,area,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,population_density,precipitation,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,weekday_cos,weekday_sin,day_month_cos,day_month_sin,month_year_cos,month_year_sin,day_year_cos,day_year_sin,manhat_dist,bearing_dist,woy,burnt
270777,3307_2006-03-01,3307,25.952,-8.036,0.0,1007.712922,0.0,-703.80778,1007.712922,188.492304,79.059539,2076.735712,1580.736308,0.0,192.978642,301.01217,2448.471304,82.079346,169.977568,689.517636,0.0,0.0,0.001537,0.0,0.998463,0.0,0.0,0.0,0.0,34.608861,0.467557,2006,3,9,1,2,60,False,True,False,False,False,False,1141171200,-0.222521,0.974928,1.0,0.0,0.5,0.8660254,0.527078,0.849817,33.988,44.681791,200609,0
156348,3508_2003-09-01,3508,23.748,-9.431,0.001973,291.061089,982.065605,-415.460304,1273.068338,22.809603,1.0,347.287022,2022.475279,0.0,155.310078,321.487045,1754.010102,154.175897,168.855122,1016.329299,0.0,0.0,0.827932,0.0,0.171511,0.0,0.000557,0.0,0.0,7.294566,0.038995,2003,9,36,1,0,244,False,True,False,False,False,False,1062374400,1.0,0.0,1.0,0.0,-0.5,-0.8660254,-0.504961,-0.863142,33.179,44.829601,200336,1
326028,1243_2007-06-01,1243,24.96,-0.265,0.0,906.496016,0.0,-391.267866,906.496016,120.785007,30.042733,379.618662,1629.036214,0.0,202.331241,300.607195,2548.85635,78.082327,99.398117,465.871915,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,5.717173,0.113551,2007,6,22,1,4,152,False,True,False,False,False,False,1180656000,-0.900969,-0.433884,1.0,0.0,-0.8660254,0.5,-0.856551,0.516062,25.225,44.971349,200722,0
514468,2454_2011-07-01,2454,14.449,-5.018,0.001844,422.441992,330.094541,244.127654,752.426155,0.0,0.0,1736.523815,1359.534493,0.0,176.288902,261.581524,2040.084943,67.597121,89.684103,454.175145,0.003487,0.0,0.615089,0.0,0.005234,0.0,0.37239,0.003799,0.0,52.223232,0.0,2011,7,26,1,4,182,False,True,False,True,False,False,1309478400,-0.900969,-0.433884,1.0,0.0,-1.0,1.224647e-16,-0.999667,0.025818,19.467,44.906459,201126,1
253464,1278_2005-11-01,1278,20.733,-0.492,0.0,1043.234741,0.0,-387.054162,1043.234741,126.191074,21.884077,210.517732,1933.456574,0.0,202.068034,299.404584,2658.503498,65.485404,101.115199,364.154071,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,13.187709,0.333445,2005,11,44,1,1,305,False,True,False,False,False,False,1130803200,0.62349,0.781831,1.0,0.0,0.5,-0.8660254,0.497513,-0.867456,21.225,44.957254,200544,0
566162,654_2012-09-01,654,18.495,1.99,0.0,1006.820976,0.0,-213.596383,1006.820976,164.01085,63.345992,1053.530802,1841.218324,0.0,200.168053,300.246293,2650.655214,65.975527,96.800362,337.129044,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,26.23133,0.224276,2012,9,35,1,5,245,False,True,False,False,False,False,1346457600,-0.222521,-0.974928,1.0,0.0,-0.5,-0.8660254,-0.5,-0.866025,16.505,45.198672,201235,0
561680,3814_2012-07-01,3814,29.508,-12.97,0.037794,427.341625,725.622246,419.650083,1153.00154,0.0,0.0,998.426558,2171.617863,0.0,74.485904,235.834044,1005.577233,97.253021,287.184435,1279.668032,0.0,0.0,0.018585,0.0,0.978322,0.0,0.003093,0.0,0.0,14.813756,0.000248,2012,7,26,1,6,183,False,True,False,True,False,False,1341100800,0.62349,-0.781831,1.0,0.0,-1.0,1.224647e-16,-0.999853,0.017166,42.478,44.890298,201226,1
84420,358_2002-03-01,358,28.282,3.26,0.0,1327.833715,0.0,-103.933486,1327.833715,281.195084,26.843114,1588.869261,2010.652609,0.0,207.677552,334.698518,2327.479937,150.04615,153.743343,744.527405,0.0,0.0,0.946496,0.0,0.052688,0.0,0.000307,0.0,0.000509,7.976398,0.548803,2002,3,9,1,4,60,False,True,False,False,False,False,1014940800,-0.900969,-0.433884,1.0,0.0,0.5,0.8660254,0.527078,0.849817,25.022,45.517678,200209,0
343760,3691_2007-10-01,3691,26.709,-10.675,0.0,495.901052,1143.772905,-196.140442,1639.629152,48.690271,2.004256,232.469086,2277.252985,0.0,171.480435,327.610356,1817.043149,165.855893,281.038066,1160.971527,0.0,0.0,0.006762,0.0,0.982831,0.0,0.00672,0.0,0.003687,29.302918,0.055937,2007,10,40,1,0,274,False,True,False,True,False,False,1191196800,1.0,0.0,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,37.384,44.801505,200740,0
354813,3281_2008-01-01,3281,28.824,-7.729,0.0,931.548816,0.0,551.506102,931.548816,311.971165,218.765614,2096.572625,1486.968174,0.0,173.384542,267.828667,2040.46961,72.136037,164.786671,1210.637316,0.0,0.0,0.391205,0.0,0.608795,0.0,0.0,0.0,0.0,17.482972,0.300583,2008,1,1,1,1,1,False,True,False,True,False,True,1199145600,0.62349,0.781831,1.0,0.0,1.0,0.0,1.0,0.0,36.553,44.582326,200801,0


In [22]:
# Previewing 10 randomly chosen rows of the test set
test.sample(n=10)

Unnamed: 0,ID,area,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,population_density,precipitation,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,weekday_cos,weekday_sin,day_month_cos,day_month_sin,month_year_cos,month_year_sin,day_year_cos,day_year_sin,manhat_dist,bearing_dist,woy,burnt
108623,1635_2016-05-01,1635,24.3,-1.827,0.0,749.160198,370.159113,-1082.814489,1119.132594,59.422372,3.0,107.588597,1983.788693,0.0,196.635246,310.160439,2616.915019,79.114513,126.513259,511.265671,0.0,0.0,0.999385,0.0,0.0,0.0,0.000615,0.0,0.0,6.812218,0.151996,2016,5,17,1,6,122,False,True,False,False,False,False,1462060800,0.62349,-0.781831,1.0,0.0,-0.5,0.866025,-0.48506,0.874481,26.127,44.835061,201617,0
108008,1020_2016-05-01,1020,23.852,0.629,0.0,1027.284528,62.006737,-946.921319,1089.291627,97.393527,5.0,391.12512,1894.148821,0.0,210.881376,312.153032,2698.663499,84.209937,103.054139,433.74423,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,31.325463,0.336034,2016,5,17,1,6,122,False,True,False,False,False,False,1462060800,0.62349,-0.781831,1.0,0.0,-0.5,0.866025,-0.48506,0.874481,23.223,45.070209,201617,0
119438,987_2016-08-01,987,28.48,0.838,0.0,879.795921,224.465725,-1221.258388,1104.333575,82.444123,4.0,421.62853,1906.916968,0.0,184.53874,288.139512,2155.227251,90.126358,140.152788,848.039207,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.510859,0.233692,2016,8,31,1,0,214,False,True,False,False,False,False,1470009600,1.0,0.0,1.0,0.0,-0.8660254,-0.5,-0.870285,-0.492548,27.642,45.113335,201631,0
130201,287_2016-11-01,287,24.979,3.522,0.0,1151.773478,104.440747,-1323.674623,1256.214105,102.370102,5.0,1437.185895,2458.224473,0.0,190.143822,297.842074,2435.837251,77.343219,126.508379,582.15039,0.0,0.0,0.997541,0.0,0.0,0.0,0.002459,0.0,0.0,6.692425,0.213593,2016,11,44,1,1,306,False,True,False,False,False,False,1477958400,0.62349,0.781831,1.0,0.0,0.5,-0.866025,0.5,-0.866025,21.457,45.511328,201644,0
13158,1695_2014-04-01,1695,24.745,-2.048,0.0,1021.816942,0.0,-1057.534331,1021.816942,170.808375,66.822734,376.997949,1872.76795,0.0,189.464462,298.778328,2579.634126,63.669724,130.274768,555.039387,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.280217,0.151169,2014,4,14,1,1,91,False,True,False,True,False,False,1396310400,0.62349,0.781831,1.0,0.0,6.123234000000001e-17,1.0,0.021516,0.999769,26.793,44.815349,201414,0
30463,3716_2014-08-01,3716,26.718,-10.896,0.0,145.663296,1240.648424,410.4232,1386.350268,0.0,0.0,440.724331,2093.489827,0.0,104.645925,289.760143,1154.333849,148.91862,260.728733,1285.318526,0.0,0.0,0.00023,0.0,0.738518,0.015974,0.188866,0.0,0.056412,502.615908,0.002854,2014,8,31,1,4,213,False,True,False,False,False,False,1406851200,-0.900969,-0.433884,1.0,0.0,-0.8660254,-0.5,-0.873807,-0.486273,37.614,44.818791,201431,1
131450,1536_2016-11-01,1536,28.045,-1.363,0.0,1119.747595,0.0,-1361.803271,1119.747595,159.788721,8.0,572.51455,2073.540164,0.0,173.921236,290.111953,2268.056037,74.366402,156.22583,803.088301,0.0,0.0,0.991701,0.0,0.0,0.0,0.008299,0.0,0.0,17.63198,0.343596,2016,11,44,1,1,306,False,True,False,False,False,False,1477958400,0.62349,0.781831,1.0,0.0,0.5,-0.866025,0.5,-0.866025,29.408,44.848053,201644,0
11606,143_2014-04-01,143,28.295,4.141,0.0,1183.209395,0.0,-964.825565,1183.209395,141.442696,7.0,415.200865,2167.023306,0.0,183.236185,299.157857,2405.289885,77.416386,156.342263,727.207485,0.0,0.0,0.740543,0.0,0.259457,0.0,0.0,0.0,0.0,4.951406,0.141591,2014,4,14,1,1,91,False,True,False,True,False,False,1396310400,0.62349,0.781831,1.0,0.0,6.123234000000001e-17,1.0,0.021516,0.999769,24.154,45.695369,201414,0
4103,282_2014-02-01,282,23.869,3.534,0.0,193.040689,1172.438425,-1115.444444,1365.393885,4.42711,0.0,407.408571,2605.506801,0.0,204.880101,335.561213,2195.373179,162.583845,120.282172,477.316821,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.322342,0.131928,2014,2,5,1,5,32,False,True,False,False,False,False,1391212800,-0.222521,-0.974928,1.0,0.0,0.8660254,0.5,0.860961,0.508671,20.335,45.494423,201405,0
103084,3738_2016-03-01,3738,27.398,-11.09,0.0,974.708407,0.0,154.030476,974.708407,217.959682,120.596466,2557.519981,1675.212736,0.0,175.681252,284.045417,2321.680066,63.023479,125.550219,1218.541251,0.0,0.0,0.031957,0.0,0.947677,0.0,0.020365,0.0,0.0,36.348762,0.153308,2016,3,9,1,1,61,False,True,False,False,False,False,1456790400,0.62349,0.781831,1.0,0.0,0.5,0.866025,0.514793,0.857315,38.488,44.80579,201609,0


### Training and making predictions

In [0]:
# Predictor variables and regression target
X = train.drop(['ID', 'area', 'burn_area'], axis = 1)
y = train.burn_area

# Selecting exactly the training feature columns, in training order, from the
# test set. Dropping only ID/area would leave burn_area in the test features
# although it is excluded from X.
tes = test[X.columns]

predictions = []
for i in range(10):
  # Training the model with a different seed on each pass
  cat = CatBoostRegressor(verbose = False, depth = 9, iterations = 1500, random_seed = i)
  cat.fit(X, y)

  # Making predictions
  preds = cat.predict(tes)
  predictions.append(preds)

# Averaging the predictions of the seeded models, row by row
preds = np.mean(predictions, axis = 0)

# Creating submission file
sub_df = pd.DataFrame({'ID': test.ID, 'Prediction': preds})
sub_df.to_csv('submission.csv', index = False)