<a href="https://colab.research.google.com/github/DariusTheGeek/3rd_place_solution_for_the__UmojaHack-3-Hotspots__zindi_hackathon/blob/master/3rd_place_solution_for_the__UmojaHack_3_Hotspots__zindi_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing the required library

In [12]:
# Installing catboost
!pip install catboost



### Loading libraries

In [0]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
from catboost import CatBoostRegressor, CatBoostClassifier
import re
from fastai.tabular import *
import warnings
warnings.filterwarnings('ignore')

### Mounting Google Drive in Colab

In [14]:
# Mount Google Drive at /content/drive; the data files read below live under this mount point.
# NOTE(review): `google.colab` is presumably only available inside a Colab runtime — confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Loading data

In [0]:
# Loading files
# Every input file sits in the same Drive folder; the location is kept in one
# place so it can be repointed without editing each read.
DATA_DIR = '/content/drive/My Drive/Hacck'

# `date` is parsed while reading so the date-feature helpers receive datetime values.
train = pd.read_csv(f'{DATA_DIR}/train.csv', parse_dates=['date'])
test = pd.read_csv(f'{DATA_DIR}/test.csv', parse_dates=['date'])
ss = pd.read_csv(f'{DATA_DIR}/SampleSubmission.csv')

### Feature engineering functions

In [0]:
# Function to calculate a bearing-style angle feature from latitude and longitude coordinates
def bearing_array(lat, lng):
    """Return a bearing-like angle in degrees for each (lat, lng) pair.

    Parameters
    ----------
    lat, lng : scalar or array-like of numbers, in degrees
        Inputs must support subtraction (``lat - lng``) and NumPy ufuncs,
        e.g. floats or ``numpy`` arrays of matching shape.

    Returns
    -------
    float or numpy.ndarray
        ``degrees(arctan2(y, x))`` computed element-wise, so values lie in
        [-180, 180].

    Notes
    -----
    NOTE(review): this is not the usual initial-bearing formula between two
    points. It takes a single point, uses ``lat - lng`` as the angular delta
    and mixes ``lat``/``lng`` terms in ``x``. The arithmetic is kept exactly
    as it was because the feature values feed the downstream models; confirm
    the intent before "correcting" it.
    """
    # Angular difference between the two coordinates, converted to radians.
    lng_delta_rad = np.radians(lat - lng)
    lat, lng = map(np.radians, (lat, lng))
    y = np.sin(lng_delta_rad) * np.cos(lat)
    x = np.cos(lat) * np.sin(lat) - np.sin(lng) * np.cos(lng) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

# Function to calculate manhattan distance given latitude and longitude coordinates
def manhattan_distance(lat, lon):
    """Return the element-wise absolute difference ``|lat - lon|``.

    Works on anything that supports subtraction and ``numpy`` ufuncs
    (scalars, arrays). The result is symmetric in its two arguments.
    """
    return np.absolute(lat - lon)

# Function to add date features
def add_datepart(df, fldname, drop=True):
    """Expand the date column ``fldname`` of ``df`` into calendar features, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. New columns are written directly onto it.
    fldname : str
        Name of the date column. If it is not already a datetime64 column it is
        converted with ``pd.to_datetime`` and the converted values are stored
        back into ``df[fldname]``.
    drop : bool, default True
        When true, ``fldname`` is removed from ``df`` after the features are added.

    Returns
    -------
    None

    Notes
    -----
    New column names are prefixed with ``fldname`` minus a trailing
    ``date``/``Date`` (so ``'date'`` gives ``Year``, ``Month``, ...). Columns
    added: Year, Month, Week, Day, Dayofweek, Dayofyear, Is_month_end,
    Is_month_start, Is_quarter_end, Is_quarter_start, Is_year_end,
    Is_year_start and Elapsed (whole seconds since the Unix epoch).
    """
    fld = df[fldname]
    if not np.issubdtype(fld.dtype, np.datetime64):
        # `infer_datetime_format` is deprecated in recent pandas and has no effect there,
        # so the plain call is used to stay valid across versions.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.0; `isocalendar().week` is its
            # replacement. It yields a nullable UInt32 column, so cast back to the
            # plain dtypes the old accessor produced (int64, or float64 when dates are missing).
            week = fld.dt.isocalendar().week
            df[targ_pre+n] = week.astype('float64' if week.isna().any() else 'int64')
        else:
            # Older pandas without `isocalendar` still exposes `.dt.week`.
            df[targ_pre+n] = getattr(fld.dt, n.lower())
    # Normalise to nanosecond resolution first so the integer view is always in
    # nanoseconds, whatever resolution the column was stored with.
    df[targ_pre+'Elapsed'] = fld.astype('datetime64[ns]').astype(np.int64) // 10**9
    if drop: df.drop(fldname, axis=1, inplace=True)

### Combining training and test data for efficiency

In [0]:
# Keep a handle on the target column before the frames are reshaped
target = train['burn_area']

# Flag each row with its origin (0 = train, 1 = test) so the frames can be
# split apart again after the shared feature engineering
train['separator'], test['separator'] = 0, 1

# Restrict both frames to the columns they have in common (inner join on the column axis)
train, test = train.align(test, join='inner', axis=1)

# Stack train on top of test so features are generated once for both
comb = pd.concat([train, test])

### Feature Engineering

In [0]:
# Calendar features (Year, Month, Week, ...); the raw `date` column is kept
# because the cyclic encoder below still needs it
add_datepart(comb, 'date', drop=False)

# Cyclic (sin/cos) date features
# NOTE(review): `add_cyclic_datepart` is not defined in this notebook — presumably it
# comes from the `fastai.tabular` star import; confirm.
add_cyclic_datepart(comb, 'date')

# Absolute difference between the two coordinates
comb['manhat_dist'] = manhattan_distance(comb['lon'].values, comb['lat'].values)

# Bearing-style angle derived from the two coordinates
comb['bearing_dist'] = bearing_array(comb['lat'].values, comb['lon'].values)

# Year and week packed into a single number, e.g. 2000 and 44 -> 200044
comb['woy'] = comb['Year'] * 100 + comb['Week']

### Separating training and test data from the combined dataframe

In [0]:
# Separating the train and test set from the combined dataframe and dropping the
# separator column, which has served its purpose.
# The drop is done on the filtered result rather than with `inplace=True`, so no
# in-place mutation happens on a selection of `comb` and the cell can be re-run
# safely from the same `comb`.
train = comb[comb.separator == 0].drop('separator', axis = 1)
test = comb[comb.separator == 1].drop('separator', axis = 1)

### Engineering a 'burnt' feature

In [0]:
# Creating a list of 0/1 flags for the training rows:
# 0 where burn_area <= 0, 1 otherwise
burnt = [0 if burned_area <= 0 else 1 for burned_area in train.burn_area]

# Predictors for the classifier: everything except the identifiers and the target
X = train.drop(columns = ['ID', 'area', 'burn_area'])
y = burnt

# Same column selection for the test set
tes = test.drop(columns = ['ID', 'area', 'burn_area'])

# Classifier that predicts whether a region was burnt or not
catt = CatBoostClassifier(verbose = False)
catt.fit(X, y)

# Predicted burnt / not-burnt labels for the test rows
predds = catt.predict(tes)

# Adding the `burnt` column to both frames.
# NOTE(review): train receives the flag computed from the true burn_area while test
# receives model predictions, so the feature is exact in train and estimated in
# test — confirm this asymmetry is intended.
train['burnt'] = burnt
test['burnt'] = predds

### Previewing a sample of the engineered dataframe

In [44]:
# Sampling 10 observations of the training set
# (no random_state is passed, so a different set of rows is shown on every run)
train.sample(10)

Unnamed: 0,ID,area,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,population_density,precipitation,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,weekday_cos,weekday_sin,day_month_cos,day_month_sin,month_year_cos,month_year_sin,day_year_cos,day_year_sin,manhat_dist,bearing_dist,woy,burnt
28759,2012_2000-11-01,2012,17.825,-3.431,0.0,1000.532194,0.0,373.231075,1000.532194,286.776267,186.654471,1506.791912,1720.548682,0.0,214.944879,306.964015,2601.136358,90.455771,101.566374,337.801381,0.001537,0.0,0.974958,0.0,0.002152,0.0,0.021354,0.0,0.0,7.584947,0.323575,2000,11,44,1,2,306,False,True,False,False,False,False,973036800,-0.222521,0.974928,1.0,0.0,0.5,-0.866025,0.5,-0.866025,21.256,44.837866,200044,0
87245,3183_2002-03-01,3183,27.708,-7.323,0.0,938.647573,0.0,-204.079089,938.647573,197.617256,76.971001,2020.200719,1393.772558,0.0,193.696705,309.123907,2445.954823,92.927382,146.55722,659.603319,0.0,0.0,0.03627,0.0,0.96373,0.0,0.0,0.0,0.0,12.758791,0.27612,2002,3,9,1,4,60,False,True,False,False,False,False,1014940800,-0.900969,-0.433884,1.0,0.0,0.5,0.866025,0.527078,0.849817,35.031,44.609663,200209,0
68117,3160_2001-10-01,3160,22.581,-7.441,0.0,365.512091,910.111376,-938.289458,1275.602859,35.891959,2.0,169.235021,2122.197141,0.0,181.276831,301.168434,2255.476712,93.552591,198.565217,853.864501,0.0,0.0,0.717625,0.0,0.281146,0.0,0.0,0.0,0.00123,33.801115,0.284258,2001,10,40,1,0,274,False,True,False,True,False,False,1001894400,1.0,0.0,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,30.022,44.767693,200140,0
342446,2377_2007-10-01,2377,13.322,-4.79,0.0,710.580766,0.0,429.945474,710.580766,163.744343,8.0,2382.501204,927.464492,0.0,205.633004,286.233028,2319.494945,86.458835,129.679104,491.277387,0.0,0.0,0.995168,0.0,0.002459,0.0,0.002372,0.0,0.0,49.783539,0.366944,2007,10,40,1,0,274,False,True,False,True,False,False,1191196800,1.0,0.0,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,18.112,44.924152,200740,0
105174,2007_2002-08-01,2007,16.701,-3.434,0.02538,556.525153,484.062432,-190.973586,1040.53872,20.943571,1.0,879.99988,1723.680634,0.0,195.149238,306.957498,2285.321767,107.443751,119.237003,416.683707,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.115756,0.16974,2002,8,31,1,3,213,False,True,False,False,False,False,1028160000,-0.900969,0.433884,1.0,0.0,-0.8660254,-0.5,-0.873807,-0.486273,20.135,44.854464,200231,1
99173,3648_2002-06-01,3648,23.326,-10.561,0.027823,548.980169,602.706329,-408.085144,1151.683292,0.0,0.0,1069.828049,1975.717729,0.0,100.137751,299.095594,1212.673317,152.483434,193.445553,1046.815181,0.0,0.0,0.472876,0.0,0.527124,0.0,0.0,0.0,0.0,10.945428,4.3e-05,2002,6,22,1,5,152,False,True,False,False,False,False,1022889600,-0.222521,-0.974928,1.0,0.0,-0.8660254,0.5,-0.856551,0.516062,33.887,44.933746,200222,1
45740,3709_2001-03-01,3709,25.145,-10.953,0.0,853.468491,0.0,-255.16868,853.468491,263.485312,178.133381,2308.324212,1472.98081,0.0,168.122957,251.439588,2032.385691,53.228263,142.323502,1330.688083,0.0,0.0,0.425422,0.0,0.573042,0.0,0.000307,0.0,0.001229,8.130839,0.27288,2001,3,9,1,3,60,False,True,False,False,False,False,983404800,-0.900969,0.433884,1.0,0.0,0.5,0.866025,0.527078,0.849817,36.098,44.890821,200109,0
287166,591_2006-08-01,591,30.232,2.359,0.0,924.229044,0.0,379.52961,924.229044,338.272344,207.061995,1350.884091,1660.196357,0.0,165.534073,262.97636,2171.331926,49.037631,165.550115,1305.15154,0.0,0.0,0.994325,0.0,0.005367,0.0,0.000307,0.0,0.0,171.425368,0.204666,2006,8,31,1,1,213,False,True,False,False,False,False,1154390400,0.62349,0.781831,1.0,0.0,-0.8660254,-0.5,-0.873807,-0.486273,27.873,45.375505,200631,0
616475,1294_2013-10-01,1294,24.296,-0.488,0.0,1133.28457,0.0,-824.516572,1133.28457,183.583101,70.250698,303.917688,1945.832706,0.0,201.917688,304.353891,2515.410222,85.810368,123.618186,429.312009,0.0,0.0,0.999693,0.0,0.0,0.0,0.000307,0.0,0.0,8.754024,0.442307,2013,10,40,1,1,274,False,True,False,True,False,False,1380585600,0.62349,0.781831,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,24.784,44.949753,201340,0
349711,2000_2007-12-01,2000,28.28,-3.123,0.0,1080.048072,0.0,327.435542,1080.048072,345.964699,237.99747,1992.561807,1668.50747,0.0,186.63494,284.90241,2004.229759,103.445181,171.248072,1169.984508,0.0,0.0,0.983171,0.0,0.0,0.0,0.016829,0.0,0.0,36.898173,0.199186,2007,12,48,1,5,335,False,True,False,False,False,False,1196467200,-0.222521,-0.974928,1.0,0.0,0.8660254,-0.5,0.860961,-0.508671,31.403,44.701915,200748,0


In [45]:
# Sampling 10 observations of the test set
# (no random_state is passed, so a different set of rows is shown on every run)
test.sample(10)

Unnamed: 0,ID,area,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,population_density,precipitation,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,weekday_cos,weekday_sin,day_month_cos,day_month_sin,month_year_cos,month_year_sin,day_year_cos,day_year_sin,manhat_dist,bearing_dist,woy,burnt
38572,362_2014-11-01,362,29.156,3.248,0.0,1038.69965,281.743092,-864.319657,1320.316761,61.383613,3.0,808.848799,2332.032098,0.0,191.143116,312.021117,2029.85254,136.405213,149.641607,858.210873,0.0,0.0,0.927537,0.0,0.06368,0.0,0.008783,0.0,0.0,16.238297,0.061934,2014,11,44,1,5,305,False,True,False,False,False,False,1414800000,-0.222521,-0.974928,1.0,0.0,0.5,-0.866025,0.497513,-0.867456,25.908,45.529515,201444,1
92420,716_2016-01-01,716,19.168,1.762,0.0,233.252867,1172.825468,-664.632951,1406.154737,13.599034,1.0,234.130235,2552.994931,0.0,201.043693,316.05866,2221.202052,129.725287,112.664454,340.184166,0.006589,0.0,0.972644,0.0,0.0,0.0,0.020766,0.0,0.0,14.089761,0.00629,2016,1,53,1,4,1,False,True,False,True,False,True,1451606400,-0.900969,-0.433884,1.0,0.0,1.0,0.0,1.0,0.0,17.406,45.177606,201653,0
39454,1244_2014-11-01,1244,25.181,-0.264,0.0,1091.27528,0.0,-949.18115,1091.27528,158.384476,49.228275,410.498132,2060.172713,0.0,190.599012,294.725443,2499.414367,67.790647,120.061589,452.107433,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.462664,0.360877,2014,11,44,1,5,305,False,True,False,False,False,False,1414800000,-0.222521,-0.974928,1.0,0.0,0.5,-0.866025,0.497513,-0.867456,25.445,44.971189,201444,0
95228,3524_2016-01-01,3524,27.329,-9.324,0.0,911.084256,0.0,-337.548495,911.084256,160.458884,67.582411,1928.076996,1290.778412,0.0,195.228371,295.970011,2353.894442,86.86838,171.688326,950.956063,0.0,0.0,0.005421,0.0,0.983745,0.000615,0.004994,0.0,0.005225,16.126879,0.231482,2016,1,53,1,4,1,False,True,False,True,False,True,1451606400,-0.900969,-0.433884,1.0,0.0,1.0,0.0,1.0,0.0,36.653,44.689274,201653,0
129176,3083_2016-10-01,3083,28.137,-6.87,0.0,666.955944,842.336406,-337.514665,1509.407279,65.380462,3.00431,259.617024,2412.241111,0.0,198.876811,315.750389,2178.958578,131.914881,212.128337,688.965307,0.0,0.0,0.640213,0.0,0.354672,0.0,0.005114,0.0,0.0,26.678158,0.169762,2016,10,39,1,5,275,False,True,False,True,False,False,1475280000,-0.222521,-0.974928,1.0,0.0,-1.83697e-16,-1.0,-0.008583,-0.999963,35.007,44.592756,201639,0
80661,420_2015-10-01,420,29.806,3.02,0.0,1138.829307,0.0,-820.567528,1138.829307,181.858541,45.805267,1405.354796,1923.50447,0.0,191.404566,289.983692,2182.726746,94.096521,148.560522,955.121693,0.0,0.0,0.967142,0.0,0.031184,0.0,0.001674,0.0,0.0,27.850112,0.35636,2015,10,40,1,3,274,False,True,False,True,False,False,1443657600,-0.900969,0.433884,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,26.786,45.495025,201540,0
43111,1080_2014-12-01,1080,25.181,0.403,0.0,962.847435,146.09487,-889.80857,1108.977308,74.926011,3.943754,324.968859,2006.496681,0.0,198.290766,296.870247,2551.319131,70.434158,122.576705,433.959832,0.000307,0.0,0.907894,0.0,0.0,0.0,0.033989,0.000615,0.057195,254.64071,0.093797,2014,12,49,1,0,335,False,True,False,False,False,False,1417392000,1.0,0.0,1.0,0.0,0.8660254,-0.5,0.860961,-0.508671,24.778,45.046574,201449,0
36157,1768_2014-10-01,1768,27.832,-2.246,0.0,1201.229589,0.0,-920.402415,1201.229589,148.270169,7.297947,456.632005,2213.503744,0.0,168.42971,275.533575,2111.988889,70.433937,157.438768,824.433151,0.0,0.0,0.998156,0.0,0.0,0.0,0.001844,0.0,0.0,13.358513,0.408906,2014,10,40,1,2,274,False,True,False,True,False,False,1412121600,-0.222521,0.974928,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,30.078,44.771034,201440,0
27133,386_2014-08-01,386,22.305,3.099,0.0,1056.223737,0.0,-231.357375,1056.223737,183.432035,77.779882,1435.318176,1936.600169,0.0,195.717284,290.27753,2512.217223,64.715595,88.144494,485.957439,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,17.333804,0.455457,2014,8,31,1,4,213,False,True,False,False,False,False,1406851200,-0.900969,-0.433884,1.0,0.0,-0.8660254,-0.5,-0.873807,-0.486273,19.206,45.397155,201431,0
10304,2662_2014-03-01,2662,27.438,-5.342,0.0,1067.527677,28.843442,-322.544765,1096.19651,108.001564,5.110108,2155.961372,1812.626835,0.0,194.436703,302.323105,2460.190493,83.453911,146.83935,667.25605,0.0,0.0,0.719324,0.0,0.276373,0.0,0.001229,0.0,0.003073,48.898685,0.143238,2014,3,9,1,5,60,False,True,False,False,False,False,1393632000,-0.222521,-0.974928,1.0,0.0,0.5,0.866025,0.527078,0.849817,32.78,44.622249,201409,0


In [46]:
# Checking the shape of training and test sets
# Both frames were cut from `comb` and each received a `burnt` column, so their
# column counts are expected to match.
train.shape, test.shape

((626644, 56), (137556, 56))

### Training and making predictions

In [0]:
# Predictors and the continuous target for the regression stage
X = train.drop(columns = ['ID', 'area', 'burn_area'])
y = train['burn_area']

# Same column selection for the test set
tes = test.drop(columns = ['ID', 'area', 'burn_area'])

# Fit the same regressor under ten different random seeds and collect the
# test-set predictions of each run
predictions = []
for seed in range(10):
  cat = CatBoostRegressor(verbose = False, depth = 9, iterations = 1500, random_seed = seed)
  cat.fit(X, y)
  predictions.append(cat.predict(tes))

# Averaging the predictions across the seeds, row by row
preds = np.asarray(predictions).mean(axis = 0)

# Creating submission file
sub_df = pd.DataFrame({'ID': test['ID'], 'Prediction': preds})
sub_df.to_csv('submission.csv', index = False)