In [1]:
import pandas as pd

df = pd.read_csv('titanic.csv')

print("Null values before cleaning:")
print(df.isnull().sum())

df_cleaned = df.dropna()

print("\nNull values after cleaning:")
print(df_cleaned.isnull().sum())


Null values before cleaning:
PassengerId      1
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

Null values after cleaning:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [2]:
target = df_cleaned['Survived']

features = df_cleaned.drop(columns=['Survived'])

print("Target variable (y):")
print(target.head())

print("\nIndependent variables (X):")
print(features.head())


Target variable (y):
12    1
14    1
24    1
26    1
28    0
Name: Survived, dtype: int64

Independent variables (X):
    PassengerId  Pclass                                               Name  \
12        904.0       1      Snyder, Mrs. John Pillsbury (Nelle Stevenson)   
14        906.0       1  Chaffee, Mrs. Herbert Fuller (Carrie Constance...   
24        916.0       1    Ryerson, Mrs. Arthur Larned (Emily Maria Borie)   
26        918.0       1                       Ostby, Miss. Helene Ragnhild   
28        920.0       1                            Brady, Mr. John Bertram   

       Sex   Age  SibSp  Parch       Ticket      Fare            Cabin  \
12  female  23.0      1      0        21228   82.2667              B45   
14  female  47.0      1      0  W.E.P. 5734   61.1750              E31   
24  female  48.0      1      3     PC 17608  262.3750  B57 B59 B63 B66   
26  female  22.0      0      1       113509   61.9792              B36   
28    male  41.0      0      0       113054

In [3]:
numeric_features = features.select_dtypes(include=['number'])

print("Numeric columns in the dataset:")
print(numeric_features.head())


Numeric columns in the dataset:
    PassengerId  Pclass   Age  SibSp  Parch      Fare
12        904.0       1  23.0      1      0   82.2667
14        906.0       1  47.0      1      0   61.1750
24        916.0       1  48.0      1      3  262.3750
26        918.0       1  22.0      0      1   61.9792
28        920.0       1  41.0      0      0   30.5000


In [4]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)

for train_index, test_index in kf.split(numeric_features):
    print("TRAIN:", train_index, "TEST:", test_index)


TRAIN: [ 1  2  3  5  6  7  8  9 11 13 14 15 16 17 19 20 21 23 24 25 27 28 29 30
 31 32 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 54 55 56
 57 58 59 60 61 62 63 66 69 70 71 72 73 74 75 77 78 81 82 85 86] TEST: [ 0  4 10 12 18 22 26 33 53 64 65 67 68 76 79 80 83 84]
TRAIN: [ 0  1  2  3  4  6  8 10 12 13 14 15 17 18 19 20 21 22 23 24 25 26 27 29
 32 33 36 37 38 41 43 46 47 48 49 50 51 52 53 54 56 57 58 59 60 61 62 63
 64 65 67 68 69 70 71 72 74 75 76 77 78 79 80 81 82 83 84 85 86] TEST: [ 5  7  9 11 16 28 30 31 34 35 39 40 42 44 45 55 66 73]
TRAIN: [ 0  1  2  4  5  7  9 10 11 12 14 15 16 18 20 21 22 23 26 27 28 29 30 31
 32 33 34 35 37 39 40 41 42 43 44 45 46 48 50 51 52 53 54 55 57 58 59 60
 62 63 64 65 66 67 68 70 71 73 74 75 76 77 79 80 81 82 83 84 85 86] TEST: [ 3  6  8 13 17 19 24 25 36 38 47 49 56 61 69 72 78]
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 16 17 18 19 20 21 22 23 24
 25 26 28 29 30 31 33 34 35 36 37 38 39 40 42 44 45 47 49 51 52 53 55 56
 59 60 

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

model = DecisionTreeClassifier(random_state=42)
fold_accuracies = []

for train_index, test_index in kf.split(numeric_features):
    X_train, X_test = numeric_features.iloc[train_index], numeric_features.iloc[test_index]
    y_train, y_test = target.iloc[train_index], target.iloc[test_index]
    
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    fold_accuracies.append(accuracy)

print("Accuracies for each fold:", fold_accuracies)
print("Average accuracy:", sum(fold_accuracies) / len(fold_accuracies))


Accuracies for each fold: [0.5555555555555556, 0.3888888888888889, 0.5882352941176471, 0.47058823529411764, 0.5294117647058824]
Average accuracy: 0.5065359477124183


In [6]:
from sklearn.model_selection import cross_val_score

cross_val_accuracies = cross_val_score(model, numeric_features, target, cv=5)

print("Cross-validation accuracies:", cross_val_accuracies)
print("Average cross-validation accuracy:", cross_val_accuracies.mean())


Cross-validation accuracies: [0.38888889 0.44444444 0.29411765 0.52941176 0.47058824]
Average cross-validation accuracy: 0.42549019607843136
