In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
def save_df(path, df):
    """Write ``df`` to ``path`` as a CSV file, leaving the index out of the file."""
    df.to_csv(path_or_buf=path, index=False)

# $1^{st}$ Dataset. [Heart Disease Dataset](https://www.kaggle.com/datasets/yasserh/heart-disease-dataset)


In [3]:
# Read the raw heart-disease table; `data` is a second name bound to the same frame.
data = pd.read_csv("heart.csv")
heart_data = data

# Print the column names, dtypes and non-null counts.
heart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [4]:
# Number of rows per class of the label column.
heart_data["target"].value_counts()

1    165
0    138
Name: target, dtype: int64

In [5]:
# Separate the label column from the feature columns.
y = heart_data["target"]
X = heart_data.drop(columns="target")

In [6]:
heart_categorical = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
scaler = StandardScaler() #MinMaxScaler()

# Every column not listed as categorical is standardized in place.
# The column list is built once and in the frame's own column order, so the
# selection on both sides of the assignment is guaranteed to match and the
# scaler sees the features in the same order on every run.
to_scale = [col for col in X.columns if col not in heart_categorical]
X[to_scale] = scaler.fit_transform(X[to_scale])

In [7]:
# One-hot encode the categorical features (every level keeps its own column),
# then preview the first rows of the encoded frame.
X = pd.get_dummies(data=X, columns=heart_categorical)
X.head(5)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,0.952197,0.763956,-0.256334,0.015443,1.087338,0,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
1,-1.915313,-0.092738,0.072199,1.633471,2.122573,0,1,0,0,1,...,0,1,0,0,0,0,0,0,1,0
2,-1.474158,-0.092738,-0.816773,0.977514,0.310912,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0
3,0.180175,-0.663867,-0.198357,1.239897,-0.206705,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
4,0.290464,-0.663867,2.08205,0.583939,-0.379244,1,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0


In [8]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [9]:
# (rows, columns) of the training and test feature matrices.
(X_train.shape, X_test.shape)

((212, 30), (91, 30))

In [10]:
# Preview the first five training rows.
X_train.head(5)

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,sex_0,sex_1,cp_0,cp_1,cp_2,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
124,-1.694735,-2.148802,-0.9134,1.283627,-0.896862,1,0,0,0,1,...,1,1,0,0,0,0,0,0,1,0
72,-2.797624,-0.092738,-0.816773,2.289429,-0.896862,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
15,-0.481558,-0.663867,-0.52689,0.365287,0.483451,1,0,0,0,1,...,0,1,0,0,0,0,0,0,1,0
10,-0.040403,0.478391,-0.140381,0.452748,0.138373,0,1,1,0,0,...,1,1,0,0,0,0,0,0,1,0
163,-1.805024,0.364165,-1.377212,1.021244,-0.896862,0,1,0,0,1,...,1,0,0,0,0,1,0,0,1,0


In [12]:
# Export of the preprocessed heart features is disabled: the call below is
# commented out, so this cell writes nothing.
# save_df("heart_prc.csv", X)

# $2^{nd}$ Dataset. [Body Signal Smoking](https://www.kaggle.com/datasets/kukuroo3/body-signal-of-smoking)


In [13]:
# Read the raw smoking table; `data` is rebound to this same frame.
data = pd.read_csv("smoking.csv")
smoking_data = data

# Print the column names, dtypes and non-null counts.
smoking_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55692 entries, 0 to 55691
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   55692 non-null  int64  
 1   gender               55692 non-null  object 
 2   age                  55692 non-null  int64  
 3   height(cm)           55692 non-null  int64  
 4   weight(kg)           55692 non-null  int64  
 5   waist(cm)            55692 non-null  float64
 6   eyesight(left)       55692 non-null  float64
 7   eyesight(right)      55692 non-null  float64
 8   hearing(left)        55692 non-null  float64
 9   hearing(right)       55692 non-null  float64
 10  systolic             55692 non-null  float64
 11  relaxation           55692 non-null  float64
 12  fasting blood sugar  55692 non-null  float64
 13  Cholesterol          55692 non-null  float64
 14  triglyceride         55692 non-null  float64
 15  HDL                  55692 non-null 

In [14]:
# Number of rows per class of the label column.
smoking_data["smoking"].value_counts()

0    35237
1    20455
Name: smoking, dtype: int64

In [15]:
# Preview the first five raw rows.
smoking_data.head(5)

Unnamed: 0,ID,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),...,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,oral,dental caries,tartar,smoking
0,0,F,40,155,60,81.3,1.2,1.0,1.0,1.0,...,12.9,1.0,0.7,18.0,19.0,27.0,Y,0,Y,0
1,1,F,40,160,60,81.0,0.8,0.6,1.0,1.0,...,12.7,1.0,0.6,22.0,19.0,18.0,Y,0,Y,0
2,2,M,55,170,60,80.0,0.8,0.8,1.0,1.0,...,15.8,1.0,1.0,21.0,16.0,22.0,Y,0,N,1
3,3,M,40,165,70,88.0,1.5,1.5,1.0,1.0,...,14.7,1.0,1.0,19.0,26.0,18.0,Y,0,Y,0
4,4,F,40,155,60,86.0,1.0,1.0,1.0,1.0,...,12.5,1.0,0.6,16.0,14.0,22.0,Y,0,N,0


In [16]:
# Separate the label column from the feature columns.
y = smoking_data["smoking"]
X = smoking_data.drop(columns="smoking")

In [17]:
smoking_categorical = ['dental caries', 'gender', 'oral', 'tartar'] #'ID'
mm_scaler = MinMaxScaler() #StandardScaler()

# Min-max scale every column except the categorical ones and `ID`, which is
# passed through unscaled. The list follows the frame's column order instead
# of a set difference, so the order the scaler is fitted on does not depend
# on string hashing and stays the same across kernel sessions.
to_scale = [
    col for col in X.columns
    if col not in smoking_categorical and col != 'ID'
]
X[to_scale] = mm_scaler.fit_transform(X[to_scale])

In [18]:
# One-hot encode the categorical features; drop_first removes the first level
# of each, so a feature with k levels contributes k - 1 indicator columns.
X = pd.get_dummies(
    data=X,
    columns=smoking_categorical,
    drop_first=True,
)

In [19]:
# Preview the first five rows of the scaled and encoded features.
X.head(5)

Unnamed: 0,ID,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries_1,gender_M,tartar_Y
0,0,0.307692,0.416667,0.285714,0.388462,0.112245,0.091837,0.0,0.0,0.254438,...,0.06724,0.493827,0.0,0.052174,0.009195,0.006179,0.026052,0,0,1
1,1,0.307692,0.5,0.285714,0.384615,0.071429,0.05102,0.0,0.0,0.284024,...,0.067778,0.481481,0.0,0.043478,0.012261,0.006179,0.017034,0,0,1
2,2,0.538462,0.666667,0.285714,0.371795,0.071429,0.071429,0.0,0.0,0.39645,...,0.080689,0.67284,0.0,0.078261,0.011494,0.005149,0.021042,0,1,0
3,3,0.307692,0.583333,0.380952,0.474359,0.142857,0.142857,0.0,0.0,0.171598,...,0.121033,0.604938,0.0,0.078261,0.009962,0.008582,0.017034,0,1,1
4,4,0.307692,0.416667,0.285714,0.448718,0.091837,0.091837,0.0,0.0,0.289941,...,0.05702,0.469136,0.0,0.043478,0.007663,0.004463,0.021042,0,0,0


In [20]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

In [21]:
# Preview the first five training rows.
X_train.head(5)

Unnamed: 0,ID,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries_1,gender_M,tartar_Y
36967,46200,0.461538,0.416667,0.285714,0.24359,0.071429,0.071429,0.0,0.0,0.171598,...,0.065627,0.611111,0.0,0.06087,0.015326,0.005836,0.019038,0,0,0
45466,4475,0.307692,0.416667,0.190476,0.282051,0.112245,0.091837,0.0,0.0,0.301775,...,0.051641,0.45679,0.0,0.052174,0.009195,0.004119,0.011022,0,0,1
47799,16459,0.307692,0.583333,0.238095,0.320513,0.142857,0.142857,0.0,0.0,0.230769,...,0.066165,0.604938,0.0,0.086957,0.006897,0.00309,0.012024,0,1,1
33247,41516,0.384615,0.5,0.190476,0.294872,0.091837,0.112245,0.0,0.0,0.402367,...,0.107047,0.506173,0.0,0.043478,0.007663,0.002403,0.01002,1,0,0
19718,24620,0.461538,0.583333,0.285714,0.269231,0.142857,0.142857,0.0,1.0,0.248521,...,0.071006,0.512346,0.0,0.052174,0.015326,0.005836,0.019038,1,0,0


In [22]:
# Write the full preprocessed feature frame (not just the training split) to disk.
save_df(path="smoking_prc.csv", df=X)

# $3^{rd}$ Dataset. [Employee Dataset](https://www.kaggle.com/datasets/tawfikelmetwally/employee-dataset)


In [23]:
# Read the raw employee table from disk.
employee_data = pd.read_csv(filepath_or_buffer="employee.csv")

# Print the column names, dtypes and non-null counts.
employee_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4653 entries, 0 to 4652
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4653 non-null   object
 1   JoiningYear                4653 non-null   int64 
 2   City                       4653 non-null   object
 3   PaymentTier                4653 non-null   int64 
 4   Age                        4653 non-null   int64 
 5   Gender                     4653 non-null   object
 6   EverBenched                4653 non-null   object
 7   ExperienceInCurrentDomain  4653 non-null   int64 
 8   LeaveOrNot                 4653 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 327.3+ KB


In [24]:
# Number of rows per class of the label column.
employee_data["LeaveOrNot"].value_counts()

0    3053
1    1600
Name: LeaveOrNot, dtype: int64

In [25]:
# Separate the label column from the feature columns.
y = employee_data["LeaveOrNot"]
X = employee_data.drop(columns="LeaveOrNot")

In [26]:
empl_categorical = ['Education', 'City', 'Gender', 'EverBenched']

# This cell used to create a StandardScaler that was never used and then
# scaled with `mm_scaler`, a MinMaxScaler left over from the smoking section.
# The cell now owns a fresh MinMaxScaler, so it no longer depends on an
# earlier cell having run and no longer refits the smoking scaler. Min-max
# scaling is kept because it is what the cell actually applied, so the
# exported values are unchanged.
# NOTE(review): if standardization was intended, swap in StandardScaler() here.
scaler = MinMaxScaler()

# Scale every non-categorical column, selected in the frame's column order so
# the fitted feature order is the same on every run.
to_scale = [col for col in X.columns if col not in empl_categorical]
X[to_scale] = scaler.fit_transform(X[to_scale])

In [27]:
# One-hot encode the categorical features; drop_first removes the first level
# of each, so a feature with k levels contributes k - 1 indicator columns.
X = pd.get_dummies(
    data=X,
    columns=empl_categorical,
    drop_first=True,
)

In [28]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [29]:
# Show the training features as the cell's output (the notebook abbreviates long frames).
X_train

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,Education_Masters,Education_PHD,City_New Delhi,City_Pune,Gender_Male,EverBenched_Yes
2850,0.166667,1.0,0.421053,0.000000,1,0,1,0,1,0
589,0.000000,1.0,0.157895,0.428571,0,0,0,0,1,0
2086,0.833333,0.5,0.368421,0.285714,1,0,0,1,1,0
445,0.000000,1.0,0.105263,0.285714,1,0,0,1,1,0
3654,0.833333,0.5,0.684211,0.285714,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...
4426,0.333333,1.0,0.526316,0.142857,0,0,0,0,1,0
466,0.166667,1.0,0.210526,0.571429,0,0,0,0,1,1
3092,0.666667,1.0,0.894737,0.142857,0,0,0,0,0,0
3772,0.500000,1.0,1.000000,0.142857,0,0,0,0,1,0


In [30]:
# Write the full preprocessed feature frame (not just the training split) to disk.
save_df(path="employee_prc.csv", df=X)