# Exercise 1 - PREPROCESSING

In [3]:
# 1. Import dataset and libraries
import pandas as pd

file1 = pd.read_csv("Data.csv")

fl1 = pd.DataFrame(file1)

fl1.head(10)

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


1. X = df.iloc[:, :-1].values
- iloc[:, :-1] → Selects all rows (:) and all columns except the last one (:-1).
- .values → Converts the selected data into a NumPy array.
- X now contains all feature columns (independent variables).
2. y = df.iloc[:, -1].values
- iloc[:, -1] → Selects the last column (-1) from the dataset.
- .values → Converts it into a NumPy array.
- y now contains only the target column (dependent variable).

It is crucial in machine learning and data preprocessing because it:

- Separates the independent variables (X) from the dependent variable (y).
- Prepares the dataset for training a model by ensuring that input features and output labels are distinct.
- Avoids target leakage, where the target column could influence feature selection or scaling.
- : → Selects all rows.
- :-1 → Selects all columns except the last one.
- .values → Converts it into a NumPy array (instead of a Pandas DataFrame).
- We are excluding last column since this is target variable that we need to predict.

In [4]:

X = file1.iloc[:, :-1].values
Y = file1.iloc[:, -1].values
print("X (Features):\n", X)
print("\nY (Target Variable):\n", Y)


X (Features):
 [['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]

Y (Target Variable):
 ['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


In [5]:
# 2. Identify and handle missing data

file1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Country    10 non-null     object 
 1   Age        9 non-null      float64
 2   Salary     9 non-null      float64
 3   Purchased  10 non-null     object 
dtypes: float64(2), object(2)
memory usage: 452.0+ bytes


In [6]:
file1.duplicated().any()

False

In [7]:
file1.isnull().sum()

Unnamed: 0,0
Country,0
Age,1
Salary,1
Purchased,0


In [8]:
# Handle Missing Values
# If you decide to use this method only, then you would not need to create a copy
# Rather immediately work on that file
file2 = file1.copy()

print("Before:", file2.shape)

file2.dropna(inplace = True)

print("After:", file2.shape)

Before: (10, 4)
After: (8, 4)


In [9]:
import warnings
warnings.filterwarnings('ignore')

#Fillna

file3 = file1.copy()

#This just chooses numeric columns
file3.fillna(file3.select_dtypes(include=["number"]).mean(), inplace=True)

print(file3.isnull().sum())

file3

Country      0
Age          0
Salary       0
Purchased    0
dtype: int64


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,63777.777778,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [10]:
# Third way
# SimpleImputer replaces missing values
# (NaN) using a specified strategy such as mean, median, or mode
import numpy as np
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# ✔ Creates an instance of SimpleImputer
# ✔ Specifies that missing values (np.nan) should be replaced with the column mean.
# ✔ This object does not modify data yet—it is just prepared to perform imputation.

imputer.fit(X[:, 1:3])
# ✔ Computes the mean of the selected columns (1 and 2) from X.
# ✔ It does NOT modify X yet—it only learns the mean of those columns.

X[:, 1:3] = imputer.transform(X[:, 1:3])
# ✔ Replaces all missing values in X[:, 1:3] with the computed means from the fit() step.
# ✔ Updates X with the new values.

#this just applies it to the X variable, if you wanted to apply it to the file1
# then you should use file1 var

# imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# file1[['Salary']] = imputer.fit_transform(file1[['Salary']])



In [11]:
X

array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

In [12]:
file1

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [13]:
# Encode categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# ✔ ColumnTransformer allows applying transformations (like One-Hot Encoding) to specific columns.
# ✔ OneHotEncoder is used to convert categorical values (e.g., Country) into numeric format (0s and 1s).
# ✔ numpy is used to store the transformed data in an array
# [0] use the first column
#  remainder='passthrough' keeps all other columns unchanged.
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = np.array(ct.fit_transform(X))

#you should have three columns for country now since it had three ctageories
# however, if you execute this function multiple times, you would then have more columns

In [14]:
X

array([[1.0, 0.0, 0.0, 44.0, 72000.0],
       [0.0, 0.0, 1.0, 27.0, 48000.0],
       [0.0, 1.0, 0.0, 30.0, 54000.0],
       [0.0, 0.0, 1.0, 38.0, 61000.0],
       [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
       [1.0, 0.0, 0.0, 35.0, 58000.0],
       [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
       [1.0, 0.0, 0.0, 48.0, 79000.0],
       [0.0, 1.0, 0.0, 50.0, 83000.0],
       [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object)

In [15]:
#Solution 2

file2

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [16]:
pd.get_dummies(file2)
# ✔ Machine learning models cannot work with text (France, Spain, Germany), so we convert them into numbers.
# ✔ It helps create a numeric representation of categorical data.
# ✔ It makes the dataset easier to process and analyze.


Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain,Purchased_No,Purchased_Yes
0,44.0,72000.0,True,False,False,True,False
1,27.0,48000.0,False,False,True,False,True
2,30.0,54000.0,False,True,False,True,False
3,38.0,61000.0,False,False,True,True,False
5,35.0,58000.0,True,False,False,False,True
7,48.0,79000.0,True,False,False,False,True
8,50.0,83000.0,False,True,False,True,False
9,37.0,67000.0,True,False,False,False,True


In [17]:
# Solution 3 - Label Encoder
# LabelEncoder converts categorical text values into
# numbers so that machine learning models can process them.
# ✔ No → 0
# ✔ Yes → 1

# Use LabelEncoder when a column has only two categories (binary).
# Use OneHotEncoder for categorical data with multiple categories

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(Y)

y

array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1])

In [18]:
#Splitting the data
# Splitting data into training and test groups is a crucial step in machine learning.
# It ensures that we train the model on one part of the data and test its performance on unseen data.

# train_test_split(X, y, test_size=0.2, random_state=1)	Splits X (features) and y (target variable) into training and test sets.
# test_size=0.2	- 20% of the data goes into test set, 80% goes into training set.
# random_state=1 - Ensures that the split is consistent every time you run the code.
# X_train, X_test	- Features split into training (X_train) and test (X_test) sets.
# y_train, y_test	- Target variable split into training (y_train) and test (y_test) sets.

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [None]:
print(X_test)

[[0.0 1.0 0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 1.0 0.0 0.0 37.0 67000.0]]


In [19]:
#Feature scaling - MinMax scaling
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()

X_train[:, 3:] = mm.fit_transform(X_train[:, 3:])
X_test[:, 3:] = mm.transform(X_test[:, 3:])

print(X_train[:, 3:])

# We are applying MinMax Scaling only from column index 3 onwards because the
# first three columns contain categorical data (One-Hot Encoded values).
# Columns 0, 1, 2 → One-Hot Encoded categorical features (do not need scaling).
# Columns 3 and beyond → Numerical values (e.g., Age, Salary) need scaling.

# When we are using feature scaling
# ✔ When you need all values between 0 and 1 (e.g., neural networks).
# ✔ When you want to preserve original data distribution.
# ✔ When your dataset has no extreme outliers.

[[0.5120772946859904 0.11428571428571432]
 [0.5652173913043479 0.45079365079365075]
 [0.7391304347826089 0.6857142857142855]
 [0.4782608695652175 0.37142857142857144]
 [0.0 0.0]
 [0.9130434782608696 0.8857142857142857]
 [1.0 1.0]
 [0.34782608695652173 0.2857142857142856]]


In [21]:
#Standard scaling
from sklearn.preprocessing import StandardScaler
sta = StandardScaler()
X_train[:, 3:] = sta.fit_transform(X_train[:, 3:])
X_test[:, 3:] = sta.transform(X_test[:, 3:])

print(X_train[:, 3:])

# ✔ When your data follows a normal (Gaussian) distribution.
# ✔ When you need a mean of 0 and standard deviation of 1 (e.g., SVM, logistic regression, k-means).
# ✔ When your dataset contains outliers (MinMax scaling is sensitive to outliers).

[[-0.19159184384578568 -1.0781259408412427]
 [-0.01411729375705786 -0.07013167641635429]
 [0.5667085065333243 0.6335624327104543]
 [-0.30453019390224856 -0.3078661727429787]
 [-1.9018011447007988 -1.4204636155515824]
 [1.1475343068237058 1.2326533634535488]
 [1.4379472069688963 1.5749910381638885]
 [-0.7401495441200353 -0.5646194287757338]]


# Exercise 2 - EDA


In [74]:
# 1. Load data

file7 = pd.read_csv("disaster_data.csv")

In [75]:
file7.shape

(3678186, 22)

In [76]:
# 2. Remove duplicates

file7.duplicated().any()
file7 = file7.drop_duplicates()

In [77]:
file7.isnull().sum()

Unnamed: 0,0
begin_year_month,0
begin_day,0
begin_time,0
episode_id,0
event_id,0
state,1
year,0
month_name,0
event_type,0
zone_type,0


In [78]:
percentage=(file7.isnull().sum()/file7.isnull().count()*100).sort_values(ascending=False)
percentage

Unnamed: 0,0
tornado_f_scale,95.910148
flood_cause,94.185665
magnitude_type,74.434721
state,5.6e-05
begin_year_month,0.0
deaths_direct,0.0
female_fatalities,0.0
male_fatalities,0.0
magnitude,0.0
property_damage,0.0


In [82]:
file7 = file7.drop(["tornado_f_scale","flood_cause", "magnitude_type" ], axis=1)
file7.columns

Index(['begin_year_month', 'begin_day', 'begin_time', 'episode_id', 'event_id',
       'state', 'year', 'month_name', 'event_type', 'zone_type',
       'injuries_direct', 'injuries_indirect', 'deaths_direct',
       'deaths_indirect', 'property_damage', 'magnitude', 'male_fatalities',
       'female_fatalities', 'avg_age  '],
      dtype='object')

In [72]:
file7.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1781287 entries, 0 to 3678184
Data columns (total 19 columns):
 #   Column             Dtype  
---  ------             -----  
 0   begin_year_month   int64  
 1   begin_day          int64  
 2   begin_time         int64  
 3   episode_id         int64  
 4   event_id           int64  
 5   state              object 
 6   year               int64  
 7   month_name         object 
 8   event_type         object 
 9   zone_type          object 
 10  injuries_direct    int64  
 11  injuries_indirect  int64  
 12  deaths_direct      int64  
 13  deaths_indirect    int64  
 14  property_damage    float64
 15  magnitude          float64
 16  male_fatalities    int64  
 17  female_fatalities  int64  
 18  avg_age            float64
dtypes: float64(3), int64(12), object(4)
memory usage: 271.8+ MB


In [83]:
# Find number of total events (rows)

numberOfRows = file7.shape

numberOfRows



(1781287, 19)

In [92]:
# Find the number/frequency of each disaster type
num_disaster_types=len(file7.event_type.unique())
num_disaster_types

69

In [88]:
# Find the number/frequency of each disaster type
file7["event_type"].value_counts()


Unnamed: 0_level_0,count
event_type,Unnamed: 1_level_1
Thunderstorm Wind,500343
Hail,388685
Flash Flood,96340
High Wind,82143
Winter Storm,81607
...,...
TORNADO/WATERSPOUT,1
THUNDERSTORM WINDS/HEAVY RAIN,1
Marine Lightning,1
HAIL FLOODING,1


In [90]:
# second way
events=file7.groupby(["event_type"]).size().sort_values(ascending=False)
events

Unnamed: 0_level_0,0
event_type,Unnamed: 1_level_1
Thunderstorm Wind,500343
Hail,388685
Flash Flood,96340
High Wind,82143
Winter Storm,81607
Tornado,74816
Winter Weather,70906
Heavy Snow,67770
Drought,66888
Flood,62096


In [93]:
# Find the most devastating disaster type by total damage
# Group by event type, and for each sum up property damage
events_damage=data.groupby(["event_type"])["property_damage"].sum().sort_values(ascending=False)[:10]
events_damage.head()

Unnamed: 0_level_0,0
property_damage,Unnamed: 1_level_1
0.0,1379216
5000.0,51981
1000.0,41662
10000.0,40340
2000.0,31538
...,...
936000.0,1
44200.0,1
934300.0,1
933000.0,1


In [95]:
# Most devastating disaster types by fatality

file7["total_deaths"]=file7["deaths_direct"] + file7["deaths_indirect"]
file7["total_injuries"]=file7["injuries_direct"]+file7["injuries_indirect"]

events_fatality=file7.groupby(["event_type"])["total_deaths"].sum().sort_values(ascending=False)[:10]
print(events_fatality.head())

events_injuries=file7.groupby(["event_type"])["total_injuries"].sum().sort_values(ascending=False)[:10]
print(events_injuries.head())

event_type
Tornado           5910
Heat              2980
Flash Flood       1748
Excessive Heat    1429
Rip Current       1198
Name: total_deaths, dtype: int64
event_type
Tornado              95086
Thunderstorm Wind    11708
Heat                 10573
Flash Flood           6537
Winter Weather        6098
Name: total_injuries, dtype: int64


In [98]:
# Add these columns to the existing columns so we can include number of dead
file7.columns=['begin_year_month', 'begin_day', 'begin_time', 'episode_id', 'event_id',
       'state', 'year', 'month_name', 'event_type', 'zone_type',
       'injuries_direct', 'injuries_indirect', 'deaths_direct',
       'deaths_indirect', 'property_damage', 'magnitude',  'male_fatalities',
       'female_fatalities', 'avg_age', 'total_deaths', 'total_injuries']

print(file7.columns)

Index(['begin_year_month', 'begin_day', 'begin_time', 'episode_id', 'event_id',
       'state', 'year', 'month_name', 'event_type', 'zone_type',
       'injuries_direct', 'injuries_indirect', 'deaths_direct',
       'deaths_indirect', 'property_damage', 'magnitude', 'male_fatalities',
       'female_fatalities', 'avg_age', 'total_deaths', 'total_injuries'],
      dtype='object')


In [100]:
#Male to female fatality ratio
deceased_data_all = file7[(file7.total_deaths != 0)]

mf_fatality_ratio = deceased_data_all.male_fatalities.sum() / deceased_data_all.female_fatalities.sum()
print('Male/female fatality ratio: {}'.format(mf_fatality_ratio))

# Top male and female fatalities
male_fatalities = deceased_data_all.groupby(['event_type'])['male_fatalities'].sum().sort_values(ascending=False).head(5)
print()
print('Most common causes of male deaths:')
print(male_fatalities)

female_fatalities = deceased_data_all.groupby(['event_type'])['female_fatalities'].sum().sort_values(ascending=False).head(5)
print()
print('Most common causes of female deaths:')
print(female_fatalities)

Male/female fatality ratio: 2.076704006075565

Most common causes of male deaths:
event_type
Heat           3406
Flash Flood    2044
Rip Current    2042
Tornado        1768
Lightning      1480
Name: male_fatalities, dtype: int64

Most common causes of female deaths:
event_type
Heat              1836
Tornado           1728
Flash Flood       1310
Winter Weather     734
Excessive Heat     600
Name: female_fatalities, dtype: int64
