In [41]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

from smoker_status.config import RAW_DATA_DIR
from smoker_status.features import create_encoded_X

import numpy as np
import pandas as pd

In [43]:
# Resolve both raw CSV locations first, then read them into DataFrames.
train_csv = RAW_DATA_DIR / "train.csv"
test_csv = RAW_DATA_DIR / "test.csv"

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

# Preview the first rows of the training frame as the cell's output.
df_train.head()

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1


In [44]:
# Label vector: the `smoking` column of the training frame.
y = df_train.loc[:, "smoking"]

# Feature matrix: every training column except the label and the `id` column.
X = df_train.drop(["smoking", "id"], axis="columns")

## Finding the best random state for Decision Tree Classifier

In [59]:
# Search for the DecisionTreeClassifier seed that scores best on one fixed
# hold-out split. Only the classifier's random_state varies; the train/test
# split is pinned (random_state=10) so every candidate seed is scored on the
# same rows.
# NOTE(review): choosing a seed by hold-out accuracy tunes the model to that
# hold-out set, so the reported accuracy is likely optimistic.

# Define the range of random states to test
random_states = range(1, 51)
best_accuracy = 0
best_random_state = None

# The split does not depend on the loop variable, so build it once instead of
# recomputing the identical partition on every iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=10)

# Loop through each candidate seed and evaluate the model's accuracy
for state in random_states:
    # Train the decision tree model with the current seed
    dt = DecisionTreeClassifier(random_state=state)
    dt.fit(X_train, y_train)

    # Make predictions and calculate accuracy on the shared hold-out set
    y_pred = dt.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Strict ">" keeps the earliest seed when several tie for the best score
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_random_state = state

print(f"Best Random State: {best_random_state}, Accuracy: {best_accuracy}")

Best Random State: 10, Accuracy: 0.6968274649545547


In [60]:
# Build the encoded feature frame with the project helper imported from
# smoker_status.features.
# NOTE(review): judging by the displayed column names (e.g. "Urine protein - trace"),
# this appears to one-hot encode the categorical fields -- confirm against the helper.
X_trans = create_encoded_X(X)
# Preview the first rows of the encoded frame as the cell's output.
X_trans.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),systolic,relaxation,fasting blood sugar,Cholesterol,...,hearing(right) - normal,hearing(right) - abnormal,Urine protein - negative,Urine protein - trace,Urine protein - 1+,Urine protein - 2+,Urine protein - 3+,Urine protein - 4+,dental caries - nonpresent,dental caries - present
0,55,165,60,81.0,0.5,0.6,135,87,94,172,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,70,165,65,89.0,0.6,0.7,146,83,147,194,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,20,170,75,81.0,0.4,0.5,118,75,79,178,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,35,180,95,105.0,1.5,1.2,131,88,91,180,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,30,165,60,80.5,1.5,1.0,121,76,91,155,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Testing accuracy after one-hot encoding

In [64]:
# Re-evaluate the decision tree on the encoded features.
# The split keeps test_size=0.4 and random_state=10, the same values used by the
# seed-search cell, so both accuracies are measured on the same partition of rows.
X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.4, random_state=10)

# Seed the classifier with best_random_state, the value the message below reports,
# so the printed label always matches the model that was actually trained.
edt = DecisionTreeClassifier(random_state=best_random_state)
edt.fit(X_train, y_train)

# Score the encoded-feature model on the hold-out rows.
y_pred = edt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Encoded Accuracy: Random State: {best_random_state}, Accuracy: {accuracy}")

Encoded Accuracy: Random State: 10, Accuracy: 0.6940018523460434
