In [1]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL
from scipy.stats import pointbiserialr, pearsonr

In [2]:
DATABASE_TYPE="postgresql"
USERNAME="mutakabbir"
PASSWORD="lightning"
HOST="localhost"
PORT=5432
DATABASE_NAME="postgres"
DATABASE_URL = URL.create(
    DATABASE_TYPE,
    username=USERNAME,
    password=PASSWORD,  # plain (unescaped) text
    host=HOST,
    port=PORT,
) 
con = create_engine(f"{DATABASE_TYPE}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/{DATABASE_NAME}")

In [25]:
M = 6
N = 3
DIVISION_IDS = ['5']

In [26]:
def get_dataset_query(m:int, n:int,division_ids:list)->str: 
    query = f"""select * 
        from (
            select 
                fdd.division_id as fire_division_id, 
                fdd.start_date, 
                fdd.area_burnt_ha 
            from 
                fire_division_dataset fdd 
            where 
                fdd.division_id in ({",".join(division_ids)}) 
                and fdd.cause = 'L'
        ) as f
        inner join (
            select * 
            from ics_division_dataset idd
            where idd.division_id in ({",".join(division_ids)})
        ) as w on w.division_id = f.fire_division_id 
        where 
            w.record_date <= (f.start_date + make_interval(months => -{m}))
            and w.record_date >= (f.start_date + make_interval(months => -{m}) + make_interval(days => -{n}) )
        """
    return query

In [27]:
fire_query = get_dataset_query(m=0, n=N, division_ids=DIVISION_IDS)
non_fire_query = get_dataset_query(m=M, n=N, division_ids=DIVISION_IDS)

In [28]:
fire_df = pd.read_sql(fire_query, con=con)
fire_df['is_fire'] = 1
non_fire_df = pd.read_sql(non_fire_query, con=con) 
non_fire_df['is_fire'] = 0
non_fire_df['area_burnt_ha'] = 0
dataset = pd.concat([fire_df, non_fire_df], axis=0)

In [29]:
dataset

Unnamed: 0,fire_division_id,start_date,area_burnt_ha,division_id,record_date,extraterrestrial_irradiance,global_horizontal_irradiance,direct_normal_irradience,diffuse_horizontal_irradiance,global_horizontal_illumination_klux,...,weather_visibility_2,pressure_kpa,dry_bulb_temp_c,dew_point_temp_c,wind_direction_deg,wind_speed_mps,sky_cover,sky_cover_opaque,snow,is_fire
0,5,1998-05-19,10.766293,5,1998-05-16,1568.937500,745.125000,485.843750,426.822917,23.696875,...,0.000000,93.358854,11.595833,7.581250,163.333333,1.998958,7.421875,5.590909,0.0,1
1,5,1998-05-19,10.766293,5,1998-05-17,1577.250000,987.312500,901.614583,396.729167,31.127083,...,0.000000,93.240313,13.315625,6.879167,120.000000,1.988542,5.593750,3.454545,0.0,1
2,5,1998-05-19,10.766293,5,1998-05-18,1585.364583,705.781250,383.937500,456.541667,22.537500,...,0.000000,93.510208,12.600000,8.127083,143.229167,1.805208,8.171875,5.602273,0.0,1
3,5,1998-05-19,10.766293,5,1998-05-19,1593.354167,1033.864583,1016.802083,391.062500,32.501042,...,0.000000,93.604062,14.480208,6.136458,150.833333,1.787500,4.609375,2.425287,0.0,1
4,5,1998-06-05,18.000581,5,1998-06-02,1682.677083,951.843750,790.552083,437.322917,30.085417,...,0.000000,93.960833,12.281250,4.483333,170.312500,2.206250,7.140625,3.647727,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2403,5,2018-06-23,0.000000,5,2017-12-23,171.400000,85.975000,288.308333,44.550000,2.410000,...,0.000000,95.699000,-13.903333,-16.701667,196.083333,2.646667,3.875000,,1.0,0
2404,5,2018-06-23,0.000000,5,2017-12-20,171.758333,85.366667,299.433333,42.308333,2.375000,...,0.000000,94.449250,-12.607500,-14.492500,174.833333,2.392500,6.375000,,1.0,0
2405,5,2018-06-23,0.000000,5,2017-12-21,171.491667,67.341667,146.425000,46.208333,1.998333,...,0.166667,95.105083,-8.851667,-10.795000,214.416667,2.784167,7.062500,,1.0,0
2406,5,2018-06-23,0.000000,5,2017-12-22,171.325000,90.316667,300.283333,45.250000,2.539167,...,0.000000,95.611667,-13.852500,-15.944167,183.916667,2.384167,7.500000,,1.0,0


In [30]:
# fill na with mean 
dataset = dataset.fillna(dataset.mean())

  dataset = dataset.fillna(dataset.mean())


In [31]:
for column in dataset.columns[-32:-1]:
    corr = pointbiserialr(dataset['is_fire'], dataset[column])
    print(column)
    print(f"\tcorrelation: {corr[0]}")
    print(f"\tp-Value: {corr[1]}")


extraterrestrial_irradiance
	correlation: 0.9097509597344732
	p-Value: 0.0
global_horizontal_irradiance
	correlation: 0.9510665167965243
	p-Value: 0.0
direct_normal_irradience
	correlation: 0.7866309946781047
	p-Value: 0.0
diffuse_horizontal_irradiance
	correlation: 0.9514092428724962
	p-Value: 0.0
global_horizontal_illumination_klux
	correlation: 0.9519909399987407
	p-Value: 0.0
direct_normal_illumination_klux
	correlation: 0.8121274184179186
	p-Value: 0.0
diffuse_horizontal_illumination_klux
	correlation: 0.9487804202986834
	p-Value: 0.0
zenith_illumination
	correlation: nan
	p-Value: nan
sunlight_min
	correlation: 0.10161975263068007
	p-Value: 3.888349533606418e-12
ceiling_height_meters
	correlation: 0.2821739289684798
	p-Value: 9.604004681895146e-86
sky_layer_1
	correlation: -0.4836945763443841
	p-Value: 5.379153622651416e-271
sky_layer_2
	correlation: -0.08396966737016644
	p-Value: 9.992561421929514e-09
sky_layer_3
	correlation: 0.08947468957860255
	p-Value: 1.0086522756835776e-09



In [32]:
for column in dataset.columns[-32:-1]:
    corr = pearsonr(dataset['area_burnt_ha'], dataset[column])
    print(column)
    print(f"\tcorrelation: {corr[0]}")
    print(f"\tp-Value: {corr[1]}")



extraterrestrial_irradiance
	correlation: 0.1400265867670411
	p-Value: 9.061663166143735e-22
global_horizontal_irradiance
	correlation: 0.15694133781296815
	p-Value: 5.3797093790063254e-27
direct_normal_irradience
	correlation: 0.12479424016015962
	p-Value: 1.3962792217460388e-17
diffuse_horizontal_irradiance
	correlation: 0.16262760924367348
	p-Value: 6.852638358163085e-29
global_horizontal_illumination_klux
	correlation: 0.15968219241311085
	p-Value: 6.700623648596527e-28
direct_normal_illumination_klux
	correlation: 0.13141699549848385
	p-Value: 2.421688045730187e-19
diffuse_horizontal_illumination_klux
	correlation: 0.165079501150945
	p-Value: 9.935385877302877e-30
zenith_illumination
	correlation: nan
	p-Value: nan
sunlight_min
	correlation: -0.0076555059531493875
	p-Value: 0.6019724008575108
ceiling_height_meters
	correlation: 0.03866866718563036
	p-Value: 0.008403173691543083
sky_layer_1
	correlation: -0.06413919097747015
	p-Value: 1.2187044686069805e-05
sky_layer_2
	correlation