### Project Storm prediction in North of Madagascar
The porpuse of this project is a machine learning focused on forcasting thunderstorms in northern Madagascar, particularly around Nosy Be. The project aims to provide accurate short-term predictions (0–6 hours) to mitigate risks, protect lives, and support emergency responses in this vulnerable region.

In [6]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [7]:
train_df = pd.read_csv('./Data/train.csv')
test_df = pd.read_csv('./Data/test.csv')

In [8]:
train_df.head(3)

Unnamed: 0,year,month,day,hour,minute,lat,lon,intensity,size,distance,Storm_NosyBe_1h,Storm_NosyBe_3h
0,2004,1,19,10,30,-13.6126,48.2281,468,1422,10.44,0,1
1,2004,1,19,10,45,-13.7039,48.2598,488,1881,13.34,0,1
2,2004,1,19,11,0,-13.7953,48.2918,424,1746,16.28,0,1


In [9]:
train_df.describe()

Unnamed: 0,year,month,day,hour,minute,lat,lon,intensity,size,distance,Storm_NosyBe_1h,Storm_NosyBe_3h
count,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0,51077.0
mean,2011.761908,4.760146,15.965444,13.619144,22.507489,-13.63066,48.77961,210.809934,3936.537483,24.690407,0.063571,0.056327
std,4.645697,4.311285,8.710566,5.576324,16.762922,0.618119,0.750511,86.580096,5694.273869,13.422355,0.243989,0.230554
min,2004.0,1.0,1.0,0.0,0.0,-14.9995,47.5003,87.0,45.0,0.0,0.0,0.0
25%,2008.0,2.0,8.0,11.0,15.0,-14.081,48.1783,144.0,621.0,14.32,0.0,0.0
50%,2012.0,3.0,16.0,14.0,30.0,-13.5401,48.7577,190.0,1773.0,21.93,0.0,0.0
75%,2016.0,11.0,24.0,17.0,30.0,-13.1697,49.3077,258.0,4770.0,33.84,0.0,0.0
max,2019.0,12.0,31.0,23.0,45.0,-12.5004,50.4969,928.0,85266.0,61.74,1.0,1.0


## Data Preparation


In [10]:
def create_time_features(df):
    # Convert time columns to datetime
    df['datetime'] = pd.to_datetime(df[['year', 'month', 'day', 'hour', 'minute']])
    # Extract time features
    df['day_of_week'] = df['datetime'].dt.dayofweek
    df['hour_sin'] = np.sin(2 * np.pi * df['hour']/24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour']/24)
    df['minute_sin'] = np.sin(2 * np.pi * df['minute']/24)
    df['minute_cos'] = np.cos(2 * np.pi * df['minute']/24)
    return df



In [11]:
def create_spatial_features(df):
    # Avoid division by zero and handle size=0
    df['intensity_density'] = df['intensity'] / (df['size'].replace(1, np.nan))
    df['intensity_density'] = df['intensity_density'].fillna(0)
    df['storm_proximity'] = 1 / (df['distance'] + 1)
    return df

In [12]:
def create_storm_features(df):
    # Nosy Be Specific Cyclone Season (November to April)
    df['is_peak_cyclone_season'] = df['month'].apply(lambda x: 1 if x in [1, 2, 3] else 0)
    df['is_cyclone_season'] = df['month'].apply(lambda x: 1 if x in [11, 12, 1, 2, 3, 4] else 0)

    # Assign weights to months based on historical cyclone data
    cyclone_weights = {1: 0.9, 2: 0.8, 3: 0.7, 4: 0.4, 11: 0.6, 12: 0.7}
    df['cyclone_season_weight'] = df['month'].map(cyclone_weights).fillna(0)

    # Define day as 6 AM to 6 PM
    df['is_daytime'] = df['hour'].apply(lambda x: 1 if 6 <= x < 18 else 0)

    df['cyclone_daytime_interaction'] = df['is_cyclone_season'] * df['is_daytime']
    df['peak_cyclone_daytime_interaction'] = df['is_peak_cyclone_season'] * df['is_daytime']
    
    return df

In [13]:
def add_lag_features(df, lag_features, intervals):
    df = df.sort_values('datetime').reset_index(drop=True)
    for feat in lag_features:
        for lag_min, lag_steps in intervals.items():
            lag_col = f"{feat}_{lag_min}"
            df[lag_col] = df[feat].shift(lag_steps)
            df[lag_col] = df[lag_col].fillna(0)
    return df

def add_size_features(df):
    df['size_change_30'] = df['size'] - df['size_30']

    return df

In [14]:
def latlon_to_xy(df, lat_ref = -13.3 , lon_ref = 48.3 ):
    R = 6371.0  # Earth radius in kilometers
    rad = np.pi/180.0
    
    delta_lat = (df['lat'] - lat_ref) * rad
    delta_lon = (df['lon'] - lon_ref) * rad
    
    df['distance_y'] = delta_lat * R
    df['distance_x'] = delta_lon * R * np.cos(lat_ref * rad)

    df['radial_distance'] = np.sqrt(df['distance_x']**2 + df['distance_y']**2)
    df['bearing'] = np.arctan2(df['distance_y'], df['distance_x'])
    df['intensity_distance_interaction'] = df['radial_distance'] * df['intensity']
    return df

In [15]:
# Apply feature engineering
train_df = create_time_features(train_df)
test_df = create_time_features(test_df)

train_df = create_spatial_features(train_df)
test_df = create_spatial_features(test_df)

train_df = create_storm_features(train_df)
test_df = create_storm_features(test_df)

train_df = latlon_to_xy(train_df)
test_df = latlon_to_xy(test_df)

# Define lag features and intervals
lag_features = ['intensity', 'size', 'distance', 'intensity_density', 'minute_sin', 'minute_cos', 'bearing']
lag_intervals = {30: 2, 60: 4}  # 30min -> 2 steps, 60min -> 4 steps

train_df = add_lag_features(train_df, lag_features, lag_intervals)
test_df = add_lag_features(test_df, lag_features, lag_intervals)

train_df = add_size_features(train_df)
test_df = add_size_features(test_df)

In [16]:
train_df.head(3)

Unnamed: 0,year,month,day,hour,minute,lat,lon,intensity,size,distance,...,distance_60,intensity_density_30,intensity_density_60,minute_sin_30,minute_sin_60,minute_cos_30,minute_cos_60,bearing_30,bearing_60,size_change_30
0,2004,1,19,10,30,-13.6126,48.2281,468,1422,10.44,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1422.0
1,2004,1,19,10,45,-13.7039,48.2598,488,1881,13.34,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1881.0
2,2004,1,19,11,0,-13.7953,48.2918,424,1746,16.28,...,0.0,0.329114,0.0,1.0,0.0,1.19434e-15,0.0,-1.791004,0.0,324.0


In [17]:
train_df.columns

Index(['year', 'month', 'day', 'hour', 'minute', 'lat', 'lon', 'intensity',
       'size', 'distance', 'Storm_NosyBe_1h', 'Storm_NosyBe_3h', 'datetime',
       'day_of_week', 'hour_sin', 'hour_cos', 'minute_sin', 'minute_cos',
       'intensity_density', 'storm_proximity', 'is_peak_cyclone_season',
       'is_cyclone_season', 'cyclone_season_weight', 'is_daytime',
       'cyclone_daytime_interaction', 'peak_cyclone_daytime_interaction',
       'distance_y', 'distance_x', 'radial_distance', 'bearing',
       'intensity_distance_interaction', 'intensity_30', 'intensity_60',
       'size_30', 'size_60', 'distance_30', 'distance_60',
       'intensity_density_30', 'intensity_density_60', 'minute_sin_30',
       'minute_sin_60', 'minute_cos_30', 'minute_cos_60', 'bearing_30',
       'bearing_60', 'size_change_30'],
      dtype='object')