In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [47]:
df = pd.read_csv('train_data.csv')

df.head()


Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,z217,z218,z219,z220,z221,z222,subject,phase,state,output
0,0,0,0,1,-323.106605,2.694366,-1.98752,261.092784,0.013704,0.0001,...,-0.00493,-0.005554,5.246375,-7.534092,3.530736,-0.539045,K,3,C,1
1,0,0,0,1,-376.084691,0.969696,-6.933765,355.311648,0.030292,-0.000153,...,0.022757,0.052506,-3.727741,-2.854443,-0.699268,-0.054074,A,4,C,1
2,0,0,0,0,91.955425,2.621643,-2.581162,51.357206,0.036668,-0.000104,...,-0.086813,-0.101497,-7.510594,19.564182,-17.00813,4.945392,D,3,C,1
3,0,0,0,1,-391.814586,1.866914,-2.510799,382.900317,0.007947,-2.8e-05,...,0.030856,-0.161398,-6.435819,2.174453,-0.153956,-0.003958,G,2,C,0
4,0,0,0,0,-363.823732,2.951346,-3.726368,330.527539,0.010074,-4e-06,...,-0.017226,-0.016454,-2.581403,3.011932,-1.281361,0.192647,C,2,C,1


In [48]:
# check for missing data
for col in df.columns:
    if df[col].isna().any():
        print(col)

print('No Missing Data')

No Missing Data


In [49]:
# find the value counts for the target variable, phase, and state
print(df['output'].value_counts(normalize=True) * 100)
print(df['phase'].value_counts(normalize=True) * 100)
print(df['state'].value_counts(normalize=True) * 100)   
print(f"Shape of the dataframe: {df.shape}")



output
1    84.489529
0    15.510471
Name: proportion, dtype: float64
phase
4    28.163176
1    26.505236
2    23.189354
3    22.142234
Name: proportion, dtype: float64
state
C    45.942408
B    32.766143
D    12.565445
A     8.682373
E     0.043630
Name: proportion, dtype: float64
Shape of the dataframe: (4584, 670)


#### Quick Observations
1. The dataset has 4584 rows and 13 columns.
2. The target variable is 'output' which has two classes: 0 and 1.
3. The dataset is imbalanced with respect to the target variable.
4. The dataset has no missing values.
5. The state variable seems a somewhat imbalanced ()
6. Disproportionate state E. Perhaps drop the value


In [50]:
# Remove rows where 'state' is 'E'
df = df[df['state'] != 'E'].reset_index(drop=True)

df_test = pd.read_csv('test_data.csv')
df_test = df_test[df_test['state'] != 'E'].reset_index(drop=True)

df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,z217,z218,z219,z220,z221,z222,subject,phase,state,output
0,0,0,0,1,-323.106605,2.694366,-1.98752,261.092784,0.013704,0.0001,...,-0.00493,-0.005554,5.246375,-7.534092,3.530736,-0.539045,K,3,C,1
1,0,0,0,1,-376.084691,0.969696,-6.933765,355.311648,0.030292,-0.000153,...,0.022757,0.052506,-3.727741,-2.854443,-0.699268,-0.054074,A,4,C,1
2,0,0,0,0,91.955425,2.621643,-2.581162,51.357206,0.036668,-0.000104,...,-0.086813,-0.101497,-7.510594,19.564182,-17.00813,4.945392,D,3,C,1
3,0,0,0,1,-391.814586,1.866914,-2.510799,382.900317,0.007947,-2.8e-05,...,0.030856,-0.161398,-6.435819,2.174453,-0.153956,-0.003958,G,2,C,0
4,0,0,0,0,-363.823732,2.951346,-3.726368,330.527539,0.010074,-4e-06,...,-0.017226,-0.016454,-2.581403,3.011932,-1.281361,0.192647,C,2,C,1


In [51]:
categorical_features = ['state', 'subject']
ordinal_features = ['phase']
target_feature = ['output']

# numerical features
numerical_features = [col for col in df.columns if col not in categorical_features + ordinal_features + target_feature]

assert len(categorical_features) + len(ordinal_features) + len(numerical_features) + len(target_feature) == len(df.columns)

In [52]:
# Handling Categorical Data
df = pd.get_dummies(df, columns=['state', 'subject'], drop_first=True).astype(int)
df_test = pd.get_dummies(df_test, columns=['state', 'subject'], drop_first=True).astype(int)

# handle ordinal features
df['phase'] = df['phase'].astype(int)
df_test['phase'] = df_test['phase'].astype(int)

df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,subject_B,subject_C,subject_D,subject_F,subject_G,subject_H,subject_I,subject_K,subject_L,subject_M
0,0,0,0,1,-323,2,-1,261,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,1,-376,0,-6,355,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,91,2,-2,51,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,-391,1,-2,382,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,-363,2,-3,330,0,0,...,0,1,0,0,0,0,0,0,0,0


### Feature Scaling

In [53]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])
df_test[numerical_features] = scaler.transform(df_test[numerical_features])

df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,...,subject_B,subject_C,subject_D,subject_F,subject_G,subject_H,subject_I,subject_K,subject_L,subject_M
0,0.0,0.0,0.0,1.0,0.169283,0.5,0.72,0.443878,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,0.0,0.0,1.0,0.109865,0.0,0.52,0.603741,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.633408,0.5,0.68,0.086735,0.0,0.0,...,0,0,1,0,0,0,0,0,0,0
3,0.0,0.0,0.0,1.0,0.093049,0.25,0.68,0.64966,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.124439,0.5,0.64,0.561224,0.0,0.0,...,0,1,0,0,0,0,0,0,0,0


In [55]:
df.to_csv('train_data_processed.csv', index=False)
df_test.to_csv('test_data_processed.csv', index=False)