# Exploratory Data Analysis 

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.model_selection import train_test_split
import plotly.express as px

# alt.renderers.enable("mimetype")

In [2]:
data_portugal = pd.read_csv("C://Projects//Forest-Fires//Portugal//data//forestfires.csv")
data_portugal

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [3]:
data_portugal.shape

(517, 13)

In [4]:
df_portugal = data_portugal.drop(['day','month'],axis = 1)

In [5]:
data_portugal.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [6]:
data_portugal.isnull().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

In [7]:
data_portugal.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [8]:
train_set, test_set = train_test_split(data_portugal, test_size = 0.2, random_state = 123)

In [9]:
train_set.shape

(413, 13)

In [10]:
train_set["X"].value_counts()

X
6    74
4    71
2    56
7    49
8    46
3    46
1    40
5    24
9     7
Name: count, dtype: int64

In [11]:
train_set["Y"].value_counts() 

Y
4    164
5    104
6     53
3     52
2     37
9      2
8      1
Name: count, dtype: int64

In [12]:
train_set["month"].value_counts() 

month
aug    149
sep    138
mar     45
jul     22
feb     17
jun     13
oct     12
apr      8
dec      6
jan      1
nov      1
may      1
Name: count, dtype: int64

In [13]:
train_set["day"].value_counts()

day
sun    73
fri    70
sat    69
mon    57
thu    51
tue    48
wed    45
Name: count, dtype: int64

In [14]:
train_set.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,4.62954,4.237288,90.771429,109.854237,546.031235,8.971671,18.819613,44.353511,4.085714,0.026634,13.868329
std,2.278178,1.164551,4.655424,63.576254,251.835608,4.581362,5.789594,16.476107,1.813679,0.330882,69.84273
min,1.0,2.0,50.4,2.4,7.9,0.4,4.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,61.1,433.3,6.4,15.4,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.7,108.0,664.5,8.4,19.3,42.0,4.0,0.0,0.52
75%,6.0,5.0,92.9,141.3,713.9,10.7,22.9,53.0,5.4,0.0,6.58
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,99.0,9.4,6.4,1090.84


In [15]:
df_portugal.corr(method = 'spearman') 

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
X,1.0,0.492705,-0.059573,-0.07986,-0.072543,-0.010799,-0.051083,0.065841,0.026846,0.109536,0.060499
Y,0.492705,1.0,-0.009635,0.004525,-0.105404,-0.013202,-0.040541,0.050028,-0.008629,0.079107,0.046018
FFMC,-0.059573,-0.009635,1.0,0.511437,0.263049,0.783566,0.594765,-0.319549,-0.035399,0.097398,0.0253
DMC,-0.07986,0.004525,0.511437,1.0,0.558791,0.425159,0.502963,0.034604,-0.110436,0.120555,0.07192
DC,-0.072543,-0.105404,0.263049,0.558791,1.0,0.103832,0.308746,0.025869,-0.205825,0.008048,0.061633
ISI,-0.010799,-0.013202,0.783566,0.425159,0.103832,1.0,0.416404,-0.177498,0.135586,0.117454,0.012496
temp,-0.051083,-0.040541,0.594765,0.502963,0.308746,0.416404,1.0,-0.517636,-0.179908,0.026024,0.078696
RH,0.065841,0.050028,-0.319549,0.034604,0.025869,-0.177498,-0.517636,1.0,0.037157,0.181059,-0.024221
wind,0.026846,-0.008629,-0.035399,-0.110436,-0.205825,0.135586,-0.179908,0.037157,1.0,0.120509,0.053196
rain,0.109536,0.079107,0.097398,0.120555,0.008048,0.117454,0.026024,0.181059,0.120509,1.0,-0.064073


Our target variable area is highly skewed towards zero

In [16]:
fig = px.histogram(
    train_set,
    x="area",
    nbins=13, 
    title="Distribution of Area",
    labels={"area": "Area"}
)
fig.update_layout(
    width=600, 
    height=400   
)
fig.show()

Applying log transformation to deal with skewness

In [17]:
train_set["log_area"] = train_set["area"].replace(0, np.nan).apply(np.log)
fig = px.histogram(
    train_set,
    x="log_area",
    nbins=20,
    title="Area Burnt (After Log Transformation)",
    labels={"log_area": "Area Burnt (Log Scale)"},
)
fig.update_layout(
    width=1000, 
    height=500  
)
fig.show()

In [18]:
train_set["sqrt_area"] = train_set["area"].apply(np.sqrt)
fig = px.box(
    train_set,
    x="sqrt_area",
    y="day",
    color="day",
    title="Area Burnt (Square Root Transformation) by Day of Week",
    labels={
        "sqrt_area": "Area Burnt (Square Root Transformation)",
        "day": "Day of Week"
    }
)
fig.update_layout(
    width=750,
    height=450
)
fig.show()

In [19]:
train_set.columns

Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',
       'wind', 'rain', 'area', 'log_area', 'sqrt_area'],
      dtype='object')

Some months like may, november, and jan have zero observations

In [20]:
train_set["sqrt_area"] = train_set["area"].apply(np.sqrt)
fig = px.box(
    train_set,
    x="sqrt_area",
    y="month",
    color="month",
    title="Area Burnt (Square Root Transformation) by months",
    labels={
        "sqrt_area": "Area Burnt (Square Root Transformation)",
        "month": "month"
    }
)
fig.update_layout(
    width=750,
    height=450
)
fig.show()

We map the months to a new variable season

In [21]:
map_seasons = {
    "dec" : "winter",
    "jan" : "winter",
    "feb" : "winter",
    "mar" : "spring",
    "apr" : "spring",
    "may" : "spring",
    "jun" : "summer",
    "jul" : "summer", 
    "aug" : "summer",
    "sep" : "fall",
    "oct" : "fall",
    "nov" : "fall"
}

train_set["season"] = train_set["month"].map(map_seasons)
train_set["sqrt_area"] = train_set["area"].apply(np.sqrt)

fig = px.box(
    train_set,
    x="sqrt_area",
    y="season",
    color="season",
    title="Area Burnt (Square Root Transformation) by Season",
    labels={
        "sqrt_area": "Area Burnt (Square Root Transformation)",
        "season": "Season"
    }
)

# Adjust plot dimensions
fig.update_layout(
    width=750,
    height=500
)

# Display the plot
fig.show()

In [22]:
%pip install geopandas --user

Note: you may need to restart the kernel to use updated packages.


In [26]:
def classify_area(area):
    if area == 0:
        return "No Burn"
    elif area <= 10:
        return "Small Burn"
    elif area <= 50:
        return "Moderate Burn"
    else:
        return "Large Burn"

train_set["Burn Category"] = train_set["area"].apply(classify_area)

fig = px.scatter(
    train_set,
    x="X",
    y="Y",
    size="area",
    size_max=40,  
    color="Burn Category",  
    color_discrete_map={
        "No Burn": "gray",
        "Small Burn": "blue",
        "Moderate Burn": "orange",
        "Large Burn": "red"
    },
    opacity=0.7,
    labels={
        "X": "X-axis Spacial Coordinate",
        "Y": "Y-axis Spacial Coordinate",
        "area": "Burnt Area",
        "Burn Category": "Burn Category"
    },
    title="Burnt Area by Spacial Coordinates"
)

# Update layout for better appearance
fig.update_layout(
    xaxis_title="X-axis Spacial Coordinate",
    yaxis_title="Y-axis Spacial Coordinate",
    template="plotly_white",
    legend_title="Burn Category"  # Add a title to the legend
)

# Show the plot
fig.show()



In [26]:
alt.Chart(train_set).mark_circle().encode(
    x = alt.X(alt.repeat("row"), type = "quantitative"),
    y = alt.Y(alt.repeat("column"), type = "quantitative"),
    color = "season"
).properties(
    width = 110,
    height = 110
).repeat(
    column = ["FFMC", "DMC", "DC", "ISI", "temp", "RH", "wind", "rain"],
    row = ["FFMC", "DMC", "DC", "ISI", "temp", "RH", "wind", "rain"]
).configure_mark(
    opacity = 0.4
).interactive()


  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [31]:
train_df_numeric = train_set.drop(["X", "Y", "month", "day", "season", "Burn Category", "sqrt_area", "log_area"], axis=1)

corr_df = train_df_numeric.corr("spearman").stack().reset_index(name="corr")
corr_df.loc[corr_df["corr"] == 1, "corr"] = 0  
corr_df["abs"] = corr_df["corr"].abs()

fig = px.scatter(
    corr_df,
    x="level_0",
    y="level_1",
    size="abs",  
    color="corr",  
    color_continuous_scale="RdBu", 
    range_color=(-1, 1),  
    labels={
        "level_0": "Variables",
        "level_1": "Variables",
        "corr": "Correlation",
        "abs": "Absolute Correlation"
    },
    title="Correlation Matrix",
)

fig.update_layout(
    width=700,
    height=500,
    legend_title="Absolute Correlation",
    xaxis_title="Variables",
    yaxis_title="Variables",
    template="plotly_white"
)

fig.show()
