<a href="https://colab.research.google.com/github/DariusTheGeek/3rd_place_solution_for_the__UmojaHack-3-Hotspots__zindi_hackathon/blob/master/3rd_place_solution_for_the__UmojaHack_3_Hotspots__zindi_hackathon.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing necessary library

In [1]:
# Installing catboost
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/94/ec/12b9a42b2ea7dfe5b602f235692ab2b61ee1334ff34334a15902272869e8/catboost-0.22-cp36-none-manylinux1_x86_64.whl (64.4MB)
[K     |████████████████████████████████| 64.4MB 72kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.22


### Loading libraries

In [0]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import datetime as dt
from catboost import CatBoostRegressor, CatBoostClassifier
import re
from fastai.tabular import *
import warnings
warnings.filterwarnings('ignore')

### Mounting colab drive

In [3]:
# Colab-only helper used to attach Google Drive to this runtime
from google.colab import drive
# Interactive step: asks for an authorization code, then exposes Drive under /content/drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### Loading data

In [0]:
# Folder on the mounted Google Drive that holds the competition files.
# Change this one constant to point the notebook at a different location.
DATA_DIR = '/content/drive/My Drive/Hacck'

# Loading files; the `date` column is parsed to datetimes while reading
train = pd.read_csv(f'{DATA_DIR}/train.csv', parse_dates=['date'])
test = pd.read_csv(f'{DATA_DIR}/test.csv', parse_dates=['date'])
ss = pd.read_csv(f'{DATA_DIR}/SampleSubmission.csv')

### Feature engineering functions

In [0]:
# Function to calculate a bearing-style angle (in degrees) from latitude and longitude coordinates
def bearing_array(lat, lng):
    """Return a bearing-style angle, in degrees, derived from one (lat, lng) pair.

    The arithmetic has the shape of the usual initial-bearing formula
    (``arctan2(y, x)`` over trigonometric terms), but it is fed a single
    coordinate pair rather than two points: the "delta" term is ``lat - lng``.

    NOTE(review): this looks like a two-point bearing formula collapsed onto
    one point, so the result is an engineered feature rather than a true
    geographic bearing. The expressions are intentionally kept exactly as
    they were so the feature values do not change.

    Parameters
    ----------
    lat, lng : float or numpy.ndarray
        Coordinates in degrees. Arrays are processed element-wise.

    Returns
    -------
    float or numpy.ndarray
        Angle in degrees, in the range [-180, 180] (the range of ``arctan2``).
    """
    # The former AVG_EARTH_RADIUS local was never read and has been removed;
    # nothing in this computation needs a radius.
    delta_rad = np.radians(lat - lng)
    lat_rad, lng_rad = np.radians(lat), np.radians(lng)
    y = np.sin(delta_rad) * np.cos(lat_rad)
    x = np.cos(lat_rad) * np.sin(lat_rad) - np.sin(lng_rad) * np.cos(lng_rad) * np.cos(delta_rad)
    return np.degrees(np.arctan2(y, x))

# Function to calculate manhattan distance given latitude and longitude coordinates
def manhattan_distance(lat, lon):
    """Return the absolute difference ``|lat - lon|``.

    Works on scalars or NumPy arrays; arrays are handled element-wise.
    """
    return np.abs(lat - lon)

# Function to add date features
def add_datepart(df, fldname, drop=True):
    """Expand the date column ``fldname`` of ``df`` into calendar features, in place.

    New columns are named ``<prefix><Feature>``, where ``<prefix>`` is
    ``fldname`` with a trailing ``date``/``Date`` removed (so ``'date'`` gives
    an empty prefix and ``'saledate'`` gives ``'sale'``). The features are:
    Year, Month, Week, Day, Dayofweek, Dayofyear, the six
    ``Is_{month,quarter,year}_{end,start}`` flags, and ``Elapsed`` (whole
    seconds since the Unix epoch).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated; nothing is returned.
    fldname : str
        Name of the date column. If it is not already a datetime column it is
        converted with ``pd.to_datetime`` and the converted values are stored
        back into ``df``.
    drop : bool, default True
        If true, remove ``fldname`` from ``df`` after the features are added.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also copes with pandas extension dtypes, on
    # which np.issubdtype raises a TypeError.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # No infer_datetime_format: the argument is deprecated in pandas 2.x
        # and only ever affected parsing speed, not the parsed values.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    dt_accessor = fld.dt
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
              'Is_year_end', 'Is_year_start'):
        if n == 'Week' and hasattr(dt_accessor, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.0; isocalendar().week is
            # its replacement. Cast from the nullable UInt32 it returns to the
            # dtype of the other integer date parts so all columns are alike.
            values = dt_accessor.isocalendar().week.astype(dt_accessor.day.dtype)
        else:
            values = getattr(dt_accessor, n.lower())
        df[targ_pre + n] = values
    # Normalise to nanosecond resolution first so the integer view is always
    # nanoseconds since the epoch, whatever unit the column was stored in.
    df[targ_pre + 'Elapsed'] = fld.astype('datetime64[ns]').astype(np.int64) // 10**9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

### Combining training and test data for efficiency

In [0]:
# Keeping a reference to the target column before the frames are reshaped
target = train['burn_area']

# Tagging every row with its origin so the frames can be split apart later
# (0 = train, 1 = test)
train['separator'], test['separator'] = 0, 1

# Restricting both frames to the columns they share (inner join on the column axis).
# NOTE(review): later cells read `burn_area` from the train split, so this only
# works if test.csv also carries a `burn_area` column -- confirm against the data.
train, test = train.align(test, join='inner', axis=1)

# Stacking train on top of test so that features are generated once for both
comb = pd.concat([train, test])

### Feature Engineering

In [0]:
# Calendar features derived from `date`; the column is kept (drop=False)
# because the cyclic encoding below reads it as well
add_datepart(comb, 'date', drop=False)

# Cyclic encodings of the same date column
add_cyclic_datepart(comb, 'date')

# Absolute lat/lon difference; the helper takes |a - b|, so the argument
# order used here does not affect the result
comb['manhat_dist'] = manhattan_distance(comb['lon'].values, comb['lat'].values)

# Bearing-style angle computed from each row's own lat/lon pair
comb['bearing_dist'] = bearing_array(comb['lat'].values, comb['lon'].values)

# Year and week packed into one integer, e.g. year 2008 / week 40 -> 200840
comb['woy'] = comb['Year'] * 100 + comb['Week']

### Separating training and test data from the combined dataframe

In [0]:
# Separating the train and test set from the combined dataframe using the
# separator flag, and dropping that flag as it has served its purpose.
# `.drop(columns=...)` returns a new frame, so nothing is mutated in place on a
# filtered slice of `comb` (an in-place drop on such a slice is the pattern
# pandas reports with SettingWithCopyWarning).
train = comb[comb['separator'] == 0].drop(columns='separator')
test = comb[comb['separator'] == 1].drop(columns='separator')

### Engineering a 'burnt' feature

In [0]:
# Creating a list that records whether each training location was burnt:
# 0 where burn_area <= 0, otherwise 1. np.where replaces the row-by-row Python
# loop; as in that loop, any value that is not <= 0 (NaN included) becomes 1.
burnt = np.where(train['burn_area'].values <= 0, 0, 1).tolist()

# Separating predictor variables and dependent variable from other variables
X = train.drop(['ID', 'area', 'burn_area'], axis=1)
y = burnt

# Selecting the test features: exactly the training columns, in the same order.
# train and test are split from the same combined frame and so share their
# columns; dropping only 'ID' and 'area' from test would leave `burn_area` in
# the test features although the classifier is trained without it.
tes = test[X.columns]

# Training a model to classify whether a region was burnt or not
catt = CatBoostClassifier(verbose=False)
catt.fit(X, y)

# Making predictions
predds = catt.predict(tes)

# Creating burnt columns: known labels for train, predicted labels for test
train['burnt'] = burnt
test['burnt'] = predds

### Previewing a sample of the engineered dataframe

In [11]:
# Displaying 10 randomly drawn rows of the engineered training dataframe
train.sample(n=10)

Unnamed: 0,ID,area,lat,lon,burn_area,climate_aet,climate_def,climate_pdsi,climate_pet,climate_pr,climate_ro,climate_soil,climate_srad,climate_swe,climate_tmmn,climate_tmmx,climate_vap,climate_vpd,climate_vs,elevation,landcover_0,landcover_1,landcover_2,landcover_3,landcover_4,landcover_5,landcover_6,landcover_7,landcover_8,population_density,precipitation,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,Elapsed,weekday_cos,weekday_sin,day_month_cos,day_month_sin,month_year_cos,month_year_sin,day_year_cos,day_year_sin,manhat_dist,bearing_dist,woy,burnt
386002,81_2008-10-01,81,24.99,4.413,0.0,1242.523712,0.0,103.25012,1242.523712,178.802841,54.520462,1697.125181,2292.017333,0.0,195.756018,301.062951,2644.0591,64.5,124.829442,612.943596,0.0,0.0,0.97691,0.0,0.02309,0.0,0.0,0.0,0.0,4.695838,0.195834,2008,10,40,1,2,275,False,True,False,True,False,False,1222819200,-0.222521,0.974928,1.0,0.0,-1.83697e-16,-1.0,-0.008583,-0.999963,20.577,45.680146,200840,0
381844,3565_2008-08-01,3565,23.534,-9.884,0.006801,238.619727,1219.216993,-100.559121,1457.799881,6.145692,0.0,540.79596,2188.006298,0.0,135.364231,310.875936,1178.345336,186.933571,220.138919,980.063959,0.0,0.0,0.240196,0.0,0.759497,0.0,0.000307,0.0,0.0,11.078393,0.002698,2008,8,31,1,4,214,False,True,False,False,False,False,1217548800,-0.900969,-0.433884,1.0,0.0,-0.8660254,-0.5,-0.870285,-0.492548,33.418,44.870031,200831,1
53819,325_2001-07-01,325,20.966,3.332,0.0,932.833213,0.0,-291.756392,932.833213,133.557525,40.368066,1336.773999,1727.944645,0.0,193.547274,287.180777,2655.285335,45.308008,84.86698,389.229882,0.0,0.0,0.981409,0.0,0.0,0.0,0.013673,0.0,0.004918,20.488013,0.170631,2001,7,26,1,6,182,False,True,False,True,False,False,993945600,0.62349,-0.781831,1.0,0.0,-1.0,1.224647e-16,-0.999667,0.025818,17.634,45.41329,200126,0
126825,732_2003-02-01,732,22.744,1.751,0.0,408.284216,679.582901,-178.113228,1087.962257,34.720367,1.768359,198.570602,2159.61992,0.0,191.023152,313.040637,2604.48957,80.009164,100.971904,426.924307,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,4.87556,0.18201,2003,2,5,1,5,32,False,True,False,False,False,False,1044057600,-0.222521,-0.974928,1.0,0.0,0.8660254,0.5,0.860961,0.508671,20.993,45.205296,200305,0
285348,2594_2006-07-01,2594,28.971,-5.089,0.003995,676.65846,430.976822,7.480125,1107.586526,25.758977,1.249189,1182.704816,1947.716104,0.0,126.022817,256.003843,1347.832112,106.17113,185.757416,1578.912553,0.0,0.0,0.995697,0.0,0.001537,0.0,0.002766,0.0,0.0,22.029087,0.012271,2006,7,26,1,5,182,False,True,False,True,False,False,1151712000,-0.222521,-0.974928,1.0,0.0,-1.0,1.224647e-16,-0.999667,0.025818,34.06,44.5951,200626,1
526047,2570_2011-10-01,2570,23.665,-5.185,0.0,1150.606693,0.0,-121.453161,1150.606693,194.991244,35.678062,1235.714166,2020.563632,0.0,181.7721,304.711527,2571.547079,66.671225,154.868418,598.748185,0.0,0.0,0.998247,0.0,0.0,0.0,0.001753,0.0,0.0,11.732625,0.448417,2011,10,39,1,5,274,False,True,False,True,False,False,1317427200,-0.222521,-0.974928,1.0,0.0,-1.83697e-16,-1.0,-0.01291,-0.999917,28.85,44.708234,201139,0
329330,724_2007-07-01,724,20.959,1.757,0.0,781.785525,0.0,-429.175271,781.785525,235.332569,118.763329,865.997105,1431.199156,0.0,194.605308,284.908926,2824.786852,26.546441,84.159469,392.922629,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,21.094335,0.268094,2007,7,26,1,6,182,False,True,False,True,False,False,1183248000,0.62349,-0.781831,1.0,0.0,-1.0,1.224647e-16,-0.999667,0.025818,19.202,45.191525,200726,0
91202,3319_2002-04-01,3319,28.611,-7.956,0.0,1080.07302,165.554854,-230.3514,1245.561882,83.790232,4.117094,2019.752591,1924.459797,0.0,195.528767,312.765217,2275.823705,116.089934,251.405479,857.155597,0.0,0.0,0.047134,0.0,0.951329,0.0,0.000615,0.0,0.000922,16.097804,0.201363,2002,4,14,1,0,91,False,True,False,True,False,False,1017619200,1.0,0.0,1.0,0.0,6.123234000000001e-17,1.0,0.021516,0.999769,36.567,44.593724,200214,0
588288,3675_2013-02-01,3675,23.106,-10.791,0.0,1030.63462,0.0,289.52439,1030.63462,206.215368,103.11165,2661.575776,1814.0695,0.0,171.473715,294.844068,2159.672152,89.189202,191.057779,1100.099774,0.0,0.0,0.675716,0.0,0.317527,0.0,0.003682,0.0,0.003074,22.886482,0.183289,2013,2,5,1,4,32,False,True,False,False,False,False,1359676800,-0.900969,-0.433884,1.0,0.0,0.8660254,0.5,0.860961,0.508671,33.897,44.963854,201305,0
139494,1938_2003-05-01,1938,27.4,-2.913,0.0,1185.126178,0.0,-176.460479,1185.126178,215.781363,97.328136,2466.014624,2037.467851,0.0,193.656998,297.8876,2352.979695,88.148054,193.761784,732.677125,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,10.68167,0.303963,2003,5,18,1,3,121,False,True,False,False,False,False,1051747200,-0.900969,0.433884,1.0,0.0,-0.5,0.8660254,-0.474951,0.880012,30.313,44.727294,200318,0


### Training and making predictions

In [0]:
# Predictors and regression target for the final model
X = train.drop(['ID', 'area', 'burn_area'], axis=1)
y = train['burn_area']

# Test features: exactly the training columns, in the same order.
# train and test share their columns, so dropping only 'ID' and 'area' from
# test would leave `burn_area` in the test features although the regressor is
# trained without it.
tes = test[X.columns]

# Number of differently seeded models whose predictions are averaged
N_SEEDS = 10

predictions = []
for seed in range(N_SEEDS):
    # Training the model with a different seed on each pass
    cat = CatBoostRegressor(verbose=False, depth=9, iterations=1500, random_seed=seed)
    cat.fit(X, y)

    # Making predictions
    predictions.append(cat.predict(tes))

# Averaging the predictions of all seeded models, row by row
preds = np.mean(predictions, axis=0)

# Creating submission file
sub_df = pd.DataFrame({'ID': test['ID'], 'Prediction': preds})
sub_df.to_csv('submission.csv', index=False)