In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Read the training CSV into a DataFrame; the path is relative to the notebook's working directory.
TRAIN_CSV = "Dataset_CDS/train.csv"
df = pd.read_csv(TRAIN_CSV)

In [3]:
# Inspect column dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Preview five random rows. The fixed seed keeps the displayed sample stable
# across "Restart Kernel -> Run All", so the rendered output is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
444,445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S
488,489,0,3,"Somerton, Mr. Francis William",male,30.0,0,0,A.5. 18509,8.05,,S
798,799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30.0,0,0,2685,7.2292,,C
181,182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C
536,537,0,1,"Butt, Major. Archibald Willingham",male,45.0,0,0,113050,26.55,B38,S


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Number of missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [8]:
# Remove the columns that are not used as model features.
# Rebinding `df` (instead of inplace=True) combined with errors='ignore' makes
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(
    columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked'],
    errors='ignore',
)

In [9]:
# Re-check the schema after the column drop to confirm which features remain.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.9+ KB


In [10]:
# Separate the target column from the predictor columns.
y = df['Survived']
x = df.drop(columns=['Survived'])

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [13]:
# Column-wise preprocessing:
#   tf1 - one-hot encode 'Sex', dropping the first category to avoid a redundant column
#   tf2 - impute missing values in the numeric columns using SimpleImputer's defaults
# Any column not listed is passed through unchanged.
sex_encoding_step = (
    'tf1',
    OneHotEncoder(sparse_output=False, drop='first'),
    ['Sex'],
)
numeric_imputing_step = (
    'tf2',
    SimpleImputer(),
    ['Age', 'Fare', 'Pclass', 'SibSp', 'Parch'],
)
transformer = ColumnTransformer(
    transformers=[sex_encoding_step, numeric_imputing_step],
    remainder='passthrough',
)

In [14]:
# Fit the preprocessing on the training split only, then reuse the fitted
# transformer on the test split so no test-set statistics influence it.
x_train_new = transformer.fit_transform(x_train)
x_test_new = transformer.transform(x_test)

In [22]:
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree (depth capped at 3) with a fixed seed for repeatable fits.
dt_model = DecisionTreeClassifier(
    random_state=42,
    max_depth=3,
)

In [23]:
# Train the tree on the preprocessed training features.
dt_model.fit(X=x_train_new, y=y_train)

In [24]:
# Predict on both splits: y_predict1 -> training rows, y_predict2 -> held-out test rows.
y_predict1, y_predict2 = map(dt_model.predict, (x_train_new, x_test_new))

In [25]:
from sklearn.metrics import accuracy_score, classification_report

# Compare train vs. test accuracy to gauge over-fitting, then show the
# per-class precision/recall breakdown for the test split.
train_accuracy, test_accuracy = (
    accuracy_score(y_train, y_predict1),
    accuracy_score(y_test, y_predict2),
)
print(train_accuracy, test_accuracy, sep="\n")
print(classification_report(y_test, y_predict2))

0.8342696629213483
0.7988826815642458
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179



In [19]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Pipeline with a single step: the decision tree itself. No scaler is included
# because tree splits do not depend on feature scale.
# NOTE(review): the features passed in below were already encoded/imputed
# outside this pipeline, so the imputation statistics were computed over all
# training rows, including each CV validation fold. Consider moving the
# preprocessing into the pipeline to keep the folds fully independent.
pipe_dt = Pipeline(steps=[('dt', DecisionTreeClassifier(random_state=42))])

# Candidate tree depths: 2 through 15 inclusive.
param_grid_dt = {'dt__max_depth': list(range(2, 16))}

# 5-fold cross-validated search scored on accuracy, using all available cores.
grid_search_dt = GridSearchCV(
    pipe_dt,
    param_grid_dt,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

# Run the search on the (preprocessed) training split only.
grid_search_dt.fit(x_train_new, y_train)

In [20]:
# Keep a handle on the pipeline refit with the winning hyperparameters.
best_dt_model = grid_search_dt.best_estimator_

# Report the selected max_depth ...
print("Best Hyperparameters Found:", grid_search_dt.best_params_, sep="\n")

# ... and the mean cross-validated accuracy it achieved.
print(f"\nBest Cross-Validation Score: {grid_search_dt.best_score_:.4f}")

Best Hyperparameters Found:
{'dt__max_depth': 3}

Best Cross-Validation Score: 0.8202


In [21]:
from sklearn.metrics import accuracy_score, classification_report

# Score the tuned model on the held-out test split.
y_test_pred_tuned = best_dt_model.predict(x_test_new)
final_test_accuracy = accuracy_score(y_test, y_test_pred_tuned)

# Headline accuracy first, then the per-class breakdown.
print(f"\nFINAL PRUNED DECISION TREE TEST ACCURACY: {final_test_accuracy:.4f}")
print("\nCLASSIFICATION REPORT:")
print(classification_report(y_test, y_test_pred_tuned))


FINAL PRUNED DECISION TREE TEST ACCURACY: 0.7989

CLASSIFICATION REPORT:
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       105
           1       0.80      0.69      0.74        74

    accuracy                           0.80       179
   macro avg       0.80      0.78      0.79       179
weighted avg       0.80      0.80      0.80       179

