# **Data Splitting Notebook**

This notebook handles:


 1. Splitting the data to training and testing for the features and the target column
 2. Handling the unbalanced number of samples for the target column

# Load Data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTENC

In [4]:
df = pd.read_csv("../data/Cleaned_Data.csv")
df.head(15)

Unnamed: 0,Age,Gender,Region,Preexisting_Condition,COVID_Strain,Symptoms,Severity,Hospitalized,ICU_Admission,Ventilator_Support,Recovered,Reinfection,Vaccination_Status,Doses_Received,Occupation,Smoking_Status,BMI,Recovery_Classification
0,69,Male,Hovedstaden,Obesity,Delta,Mild,Moderate,Yes,No,No,Yes,No,Yes,1,Healthcare,Never,27.7,Delayed Recovery
1,38,Male,Sjælland,Asthma,XBB.1.5,Mild,Moderate,No,No,No,No,No,No,0,Healthcare,Never,21.9,Typical Recovery
2,41,Female,Syddanmark,Hypertension,Beta,Mild,High,Yes,Yes,Yes,No,No,Yes,3,Unemployed,Never,22.7,Delayed Recovery
3,81,Female,Hovedstaden,Asthma,Delta,Severe,High,No,No,No,Yes,Yes,Yes,1,Office Worker,Never,27.7,Delayed Recovery
4,50,Female,Syddanmark,Cardiovascular,Delta,Mild,High,No,No,No,No,No,Yes,2,Student,Never,11.9,Delayed Recovery
5,66,Male,Sjælland,Cardiovascular,Omicron,Moderate,Moderate,No,No,No,Yes,No,Yes,3,Healthcare,Never,29.8,Delayed Recovery
6,76,Female,Sjælland,Obesity,Omicron,Moderate,Critical,Yes,Yes,No,No,No,No,0,Unemployed,Former,22.3,Delayed Recovery
7,77,Female,Sjælland,Diabetes,XBB.1.5,Moderate,Low,No,No,No,Yes,No,Yes,3,Driver,Former,24.4,Delayed Recovery
8,79,Female,Nordjylland,Hypertension,XBB.1.5,Mild,Low,No,No,No,No,No,No,0,Healthcare,Former,26.1,Typical Recovery
9,72,Female,Sjælland,Cardiovascular,Alpha,Severe,High,No,No,No,Yes,No,Yes,1,Unemployed,Current,21.2,Delayed Recovery


# Split Data

In [6]:
X = df.drop(columns=['Reinfection'])
y = df['Reinfection']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [8]:
cat_cols = df.columns.tolist()
cat_cols.remove('Age')
cat_cols.remove('BMI')
cat_cols.remove('Doses_Received')
cat_cols.remove('Reinfection')

sm = SMOTENC(random_state=42, categorical_features=cat_cols)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

In [9]:
print("After SMOTE:", y_resampled.value_counts())

After SMOTE: Reinfection
No     2172
Yes    2172
Name: count, dtype: int64


# Save the splitted data

In [11]:
X_resampled.to_csv("../data/splitted_data/X_train.csv", index=False)
y_resampled.to_csv("../data/splitted_data/y_train.csv", index=False)
X_test.to_csv("../data/splitted_data/X_test.csv", index=False)
y_test.to_csv("../data/splitted_data/y_test.csv", index=False)