# Setup

Load the data for the competition.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
!pip install datasist

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datasist as ds
import warnings
from sklearn.metrics import mean_squared_error, r2_score

warnings.filterwarnings('ignore')
%matplotlib inline
# Show up to 100 columns when displaying a frame. The fully-qualified option
# key is used because abbreviated keys are resolved by pattern matching and
# fail as soon as more than one registered option matches.
pd.set_option('display.max_columns', 100)

In [None]:
train = pd.read_csv('/kaggle/input/xente-challenge/train.csv', parse_dates=['date'])
train.head()

In [None]:
train.shape

In [None]:
test = pd.read_csv('/kaggle/input/xente-challenge/test.csv', parse_dates=['date'])
test.head(1)

In [None]:
train = ds.feature_engineering.drop_redundant(train)
test = ds.feature_engineering.drop_redundant(test)

In [None]:
test.isna().sum()

In [None]:
# Replace every missing value in both frames with the sentinel -999, in place.
# NOTE(review): the sentinel is applied to all columns; if 'elevation' ever
# contains missing values, the later np.log1p transform would turn -999 into
# NaN — confirm that column has no gaps.
train.fillna(-999, inplace=True)
test.fillna(-999, inplace=True)

In [None]:
train.describe()

In [None]:
# Plot elevation against the target so that extreme elevations can be judged
# before the outlier filter below; plotting the column against itself only
# draws the diagonal.
sns.scatterplot(x='elevation', y='burn_area', data=train)

In [None]:
# Remove, in place, the training rows whose elevation exceeds 2500.
outlier_rows = train.index[train['elevation'] > 2500]
train.drop(index=outlier_rows, inplace=True)

In [None]:
sns.distplot(train['elevation'])

In [None]:
# Apply log(1 + x) to the elevation column of both frames.
for frame in (train, test):
    frame['elevation'] = np.log1p(frame['elevation'])

In [None]:
ds.visualizations.histogram(train, fig_size=(5,5), bins=5)

# EDA + Feature Engineering

In [None]:
# Look at correlation with target.
# Only numeric/bool columns are correlated: the frame still holds non-numeric
# columns at this point (e.g. the parsed 'date'), which DataFrame.corr() no
# longer drops silently in recent pandas releases.
numeric_train = train.select_dtypes(include=['number', 'bool'])
numeric_train.corr()['burn_area'].sort_values().plot(kind='bar', figsize=(18, 6))

In [None]:
# Look at some scatter plots (only plotting for a subset of data to keep things fast)
sample = train.sample(10000)
plt.scatter(sample['climate_vap'], sample['burn_area'], alpha=0.3)

In [None]:
# Higher temp -> more fires it looks like
plt.scatter(sample['climate_tmmx'], sample['burn_area'], alpha=0.3)

In [None]:
train.head()

In [None]:
# New feature: sum of the climate_aet and climate_pr columns, added to both frames.
for frame in (train, test):
    frame['climate_aet_pr'] = frame['climate_aet'] + frame['climate_pr']

In [None]:
train.head()

In [None]:
sns.scatterplot(x='climate_pr', y='precipitation', data=train)

In [None]:
# New feature: ratio of climate_vap to climate_tmmn, added to both frames.
# NOTE(review): rows where climate_tmmn is 0 would produce inf/NaN here —
# confirm the column never contains zeros or handle them explicitly.
train['climate_vap_tmmn'] = train['climate_vap'] / train['climate_tmmn']
test['climate_vap_tmmn'] = test['climate_vap'] / test['climate_tmmn']

In [None]:
sns.scatterplot(x='climate_def', y='climate_vpd', data=train)

In [None]:
# New feature: sum of the climate_def and climate_vpd columns, added to both frames.
for frame in (train, test):
    frame['climate_vpd_def'] = frame['climate_def'] + frame['climate_vpd']

In [None]:
train['climate_def'].value_counts()

In [None]:
train.describe()

In [None]:
# New feature: combined share of landcover classes 0, 2, 3, 4 and 6,
# added to both frames.
for frame in (train, test):
    frame['landcover02346'] = (
        frame['landcover_0']
        + frame['landcover_2']
        + frame['landcover_3']
        + frame['landcover_4']
        + frame['landcover_6']
    )

In [None]:
# Drop the landcover classes that were not folded into 'landcover02346'.
dropped_landcover_cols = ['landcover_1', 'landcover_5', 'landcover_7']
for frame in (train, test):
    frame.drop(columns=dropped_landcover_cols, inplace=True)

In [None]:
train.head()

In [None]:
# Date
# Run datasist's extract_dates helper on the 'date' column of both frames and
# keep the frames it returns.
# NOTE(review): presumably this expands 'date' into calendar-part columns and
# removes the original column — confirm against the datasist documentation.
train = ds.timeseries.extract_dates(data=train, date_cols=['date'])
test = ds.timeseries.extract_dates(data=test, date_cols=['date'])

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.head()

In [None]:
train = ds.feature_engineering.drop_redundant(train)
test = ds.feature_engineering.drop_redundant(test)

In [None]:
train_len = len(train)

In [None]:
new_df = pd.concat([train, test])

In [None]:
new_df.head()

In [None]:
#Manhattan distance
def manhattan_distance(lat, lon):
    """Return the element-wise absolute difference ``|lat - lon|``.

    Works on scalars and NumPy arrays alike. Note that, despite the name,
    this is a single absolute difference between the two arguments rather
    than a distance between two coordinate pairs.
    """
    return np.abs(lat - lon)
new_df['manhattan_dist'] = manhattan_distance(new_df['lat'].values, new_df['lon'].values,)
new_df.head()

In [None]:
#Bearing
def bearing_array(lat, lng):
    """Return an angle in degrees derived from latitude and longitude.

    Both arguments are given in degrees (scalars or NumPy arrays of equal
    shape). The result is ``degrees(arctan2(y, x))`` with

        y = sin(lat - lng) * cos(lat)
        x = cos(lat) * sin(lat) - sin(lng) * cos(lng) * cos(lat - lng)

    where all trigonometric arguments are converted to radians first.

    NOTE(review): the expression resembles the initial-bearing formula
    between two points, but is evaluated on the latitude and longitude of
    a single point — confirm this is the intended feature.
    """
    # The difference must be taken before lat/lng are rebound to radians.
    delta_rad = np.radians(lat - lng)
    lat, lng = map(np.radians, (lat, lng))
    y = np.sin(delta_rad) * np.cos(lat)
    x = np.cos(lat) * np.sin(lat) - np.sin(lng) * np.cos(lng) * np.cos(delta_rad)
    return np.degrees(np.arctan2(y, x))
new_df['bearing'] = bearing_array(new_df['lat'].values, new_df['lon'].values,)
new_df.head()

In [None]:
# Split the combined frame back by position: the first train_len rows are the
# training rows, the remainder are the test rows. Explicit copies are taken so
# that the in-place column drops applied later operate on independent frames
# rather than on slices of new_df.
train = new_df[:train_len].copy()
test = new_df[train_len:].copy()

In [None]:
test.drop(columns='burn_area', inplace=True)

In [None]:
train['landcover02346'].value_counts()

In [None]:
plt.figure(figsize=(25,20))
# Correlate numeric/bool columns only; non-numeric columns such as 'ID' are
# not silently dropped by DataFrame.corr() in recent pandas releases.
sns.heatmap(train.select_dtypes(include=['number', 'bool']).corr(), annot=True)

##  Adding date features

In [None]:
train.head()

# Data Split for Validation

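We don't want to just split randomly - this would give us artificially high scores. Instead, let's use the last 3 years of the dataset for validation to more closely match the test configuration. (Note: the training cell below currently uses a 3-fold `KFold` over the rows rather than a last-3-years hold-out.)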

In [None]:
X = train.drop(columns='burn_area')
y = train['burn_area']

In [None]:
# Keep the test IDs before the column is removed: the submission cell builds
# its "ID" column from sub_id, which is not defined anywhere else.
sub_id = test['ID'].values
# 'ID' is an identifier, not a feature, so remove it from both feature frames.
X.drop(columns='ID', inplace=True)
test.drop(columns='ID', inplace=True)

In [None]:
# Positional indices of the object-dtype columns of X; they are passed to
# CatBoost as cat_features. The builtin `object` is used because the
# `np.object` alias is no longer available in current NumPy releases.
categorical_features_indices = np.where(X.dtypes == object)[0]
categorical_features_indices

In [None]:
from lightgbm import LGBMRegressor

In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold,StratifiedKFold, TimeSeriesSplit
from sklearn.metrics import mean_squared_error

errcb2 = []         # validation RMSE of each fold
y_pred_totcb2 = []  # test-set predictions of each fold's model

# 3 contiguous, unshuffled folds. random_state is deliberately not passed:
# it has no effect without shuffle=True and current scikit-learn versions
# reject that combination with a ValueError.
fold = KFold(n_splits=3)
for train_index, test_index in fold.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    m2 = CatBoostRegressor(iterations=1000, logging_level='Silent', od_wait=50, od_type='Iter', learning_rate=0.1, depth=8, eval_metric='RMSE')
    m2.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_test, y_test)], early_stopping_rounds=100, verbose=100, cat_features=categorical_features_indices)

    # Score the held-out fold once and reuse the value for logging and storage.
    preds = m2.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print("err: ", rmse)
    errcb2.append(rmse)

    # Each fold's model also predicts the real test set; the predictions are
    # averaged across folds when the submission is built.
    p2 = m2.predict(test)
    y_pred_totcb2.append(p2)

In [None]:
# Average the per-fold test predictions and pair them with the test IDs,
# with the columns in submission order.
mean_prediction = np.mean(y_pred_totcb2, axis=0)
test_prediction1 = pd.DataFrame(
    {"ID": sub_id, 'Prediction': mean_prediction},
    columns=["ID", 'Prediction'],
)

In [None]:
test_prediction1.head()

In [None]:
test_prediction1.to_csv('sub100.csv', index=False)