---

# Ioannou_Georgios


## Copyright © 2023 by Georgios Ioannou


---

<h1 align="center"> Train Test Split and Cross Validation </h1>


---

# LIBRARIES


In [1]:
# Import libraries.

import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer, mean_squared_error, accuracy_score
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    StratifiedKFold,
    KFold,
)
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

---

# LOAD THE DATASET


In [3]:
# Read titanic.csv from the working directory into a pandas DataFrame.

df = pd.read_csv(filepath_or_buffer="titanic.csv")

# Preview the first five rows; as the last expression of the cell,
# the resulting frame is what the cell displays.

df.head(n=5)

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Sanity check: display the (rows, columns) shape of the data frame.

df.shape

(891, 12)

In [5]:
# Regression task: age will be predicted based on pclass and fare.
# The target column must not contain missing values, so keep only
# the rows whose age is present (i.e. not NaN).

df = df[df["age"].notna()]

In [6]:
# Sanity check: display the shape again now that rows with a missing age are gone.

df.shape

(714, 12)

---

# REGRESSION TASK

- ### For regression task (predicting age):
- ### Features (X) will be 'Pclass' and 'Fare'.
- ### Target variable (y) will be 'Age'.


---

train_test_split


In [7]:
# Regression task (predicting age):
# the features (X) are the 'pclass' and 'fare' columns and the target (y) is the 'age' column.

selected_features = ["pclass", "fare"]
X_regression, y_regression = df[selected_features], df["age"]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_regression,
    y_regression,
    test_size=0.2,
    random_state=42,
)

# Build a Linear Regression model and fit it on the training portion
# (fit() hands back the fitted estimator, so both steps fit on one line).

regression_model = LinearRegression().fit(X_train_reg, y_train_reg)

# Predict the held-out rows, then score the predictions with the Mean Squared Error (MSE).

regression_predictions = regression_model.predict(X_test_reg)
mse = mean_squared_error(y_true=y_test_reg, y_pred=regression_predictions)

print(f"Mean Squared Error (MSE): {mse}")

Mean Squared Error (MSE): 158.75096330976928


---

KFold


In [8]:
# 10-fold cross-validation with shuffled folds; the fixed seed keeps the folds reproducible.

kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One MSE value is collected per fold.

mse_scores = []

# kf.split() yields (train, test) index arrays that are used positionally,
# hence the .iloc lookups. enumerate() numbers the folds starting at 1.

for i, (train_index, test_index) in enumerate(kf.split(X_regression), start=1):
    X_train_reg = X_regression.iloc[train_index]
    X_test_reg = X_regression.iloc[test_index]
    y_train_reg = y_regression.iloc[train_index]
    y_test_reg = y_regression.iloc[test_index]

    # Start from a brand-new model in every fold so that nothing carries over
    # from the previous fold, then fit it on this fold's training rows.

    regression_model = LinearRegression()
    regression_model.fit(X_train_reg, y_train_reg)

    # Predict this fold's held-out rows and record the fold's MSE.

    regression_predictions = regression_model.predict(X_test_reg)
    mse = mean_squared_error(y_test_reg, regression_predictions)
    print("Fold", i, "mse =", mse)
    mse_scores.append(mse)

    # NOTE: This is the place to compare the current MSE with the lowest one seen
    # so far and re-save the model when it improves.
    # In the recorded run, Fold 2, Fold 3, and Fold 6 reached a lower MSE than the
    # single train/test split evaluated earlier in the notebook.

# Average the per-fold MSE values.

mean_mse = sum(mse_scores) / len(mse_scores)
print(f"\nMean Squared Error (MSE) across all folds: {mean_mse}")

Fold 1 mse = 188.18373856109955
Fold 2 mse = 130.70921289621666
Fold 3 mse = 151.34768335213533
Fold 4 mse = 193.11205944097435
Fold 5 mse = 185.7043776434882
Fold 6 mse = 140.2404363282263
Fold 7 mse = 176.07890428656205
Fold 8 mse = 212.7884777586146
Fold 9 mse = 229.32235052074904
Fold 10 mse = 195.85176687126983

Mean Squared Error (MSE) across all folds: 180.3339007659336


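### You can use `cross_val_score` from scikit-learn to perform cross-validation and compute the per-fold scores directly, without manually splitting the data into folds. Note that passing `cv=10` uses unshuffled folds, so the mean MSE differs slightly from the shuffled `KFold` loop above.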


In [9]:
# Unfitted Linear Regression estimator that cross_val_score will evaluate.

regression_model = LinearRegression()

# cross_val_score treats larger scores as better, so MSE is wrapped as a
# "lower is better" metric: the resulting scorer returns the MSE negated.

scoring = make_scorer(mean_squared_error, greater_is_better=False)

# 10-fold cross-validation; every entry of mse_scores is a negative MSE.

mse_scores = cross_val_score(
    estimator=regression_model,
    X=X_regression,
    y=y_regression,
    scoring=scoring,
    cv=10,
)

# Average the folds and flip the sign back to report a regular (positive) MSE.

mean_negated_mse = mse_scores.mean()
mean_mse = -mean_negated_mse
print(f"Mean Squared Error (MSE) across all folds: {mean_mse}")

Mean Squared Error (MSE) across all folds: 180.74101137875493


---

# CLASSIFICATION TASK

- ### For classification task (predicting survival):
- ### Features (X) will be 'Pclass' and 'Fare'.
- ### Target variable (y) will be 'Survived'.


---

train_test_split with stratify


In [10]:
# Classification task (predicting survival):
# the features (X) are the 'pclass' and 'fare' columns and the target (y) is the 'survived' column.

# Reload titanic.csv so that df holds the complete file again
# (df had its rows with a missing age removed for the regression task).

df = pd.read_csv("titanic.csv")

selected_features = ["pclass", "fare"]
X_classification, y_classification = df[selected_features], df["survived"]

# Hold out 20% of the rows as a test set. stratify=y_classification makes the
# split stratified on the target; the fixed seed keeps it reproducible.

X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(
    X_classification,
    y_classification,
    stratify=y_classification,
    test_size=0.2,
    random_state=42,
)

# Build a Decision Tree Classifier and fit it on the training portion
# (fit() hands back the fitted estimator, so both steps fit on one line).

classification_model = DecisionTreeClassifier(random_state=42).fit(
    X_train_class, y_train_class
)

# Predict the held-out rows, then measure the share of correct predictions.

classification_predictions = classification_model.predict(X_test_class)
accuracy = accuracy_score(y_true=y_test_class, y_pred=classification_predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6312849162011173


---

StratifiedKFold


In [11]:
# Cross-validate the classifier with Stratified K-Fold, which builds the folds
# from both the features and the target so that the class balance is preserved.

# 15 shuffled folds; the fixed seed keeps the folds reproducible.

stratified_kfold = StratifiedKFold(n_splits=15, random_state=42, shuffle=True)

# One accuracy value is collected per fold.

cv_scores = []

# split() yields (train, test) index arrays that are used positionally,
# hence the .iloc lookups. enumerate() numbers the folds starting at 1.

for i, (train_index, test_index) in enumerate(
    stratified_kfold.split(X_classification, y_classification), start=1
):
    X_train_cv, X_test_cv = (
        X_classification.iloc[train_index],
        X_classification.iloc[test_index],
    )
    y_train_cv, y_test_cv = (
        y_classification.iloc[train_index],
        y_classification.iloc[test_index],
    )

    # Start from a brand-new model in every fold so that nothing carries over
    # from the previous fold, then fit it on this fold's training rows.

    model = DecisionTreeClassifier(random_state=42)
    model.fit(X_train_cv, y_train_cv)

    # Predict this fold's held-out rows and record the fold's accuracy.

    predictions_cv = model.predict(X_test_cv)
    accuracy_cv = accuracy_score(y_test_cv, predictions_cv)
    print("Fold", i, "accuracy =", accuracy_cv)
    cv_scores.append(accuracy_cv)

    # NOTE: This is the place to compare the current accuracy with the highest one
    # seen so far and re-save the model when it is GREATER (higher accuracy is better).
    # In the recorded run, only Fold 5 and Fold 14 scored lower than the single
    # stratified train/test split evaluated earlier; every other fold scored higher.

# Average the per-fold accuracies.

mean_cv_accuracy = sum(cv_scores) / len(cv_scores)
print(f"\nMean Cross-Validation Accuracy: {mean_cv_accuracy}")

Fold 1 accuracy = 0.6833333333333333
Fold 2 accuracy = 0.75
Fold 3 accuracy = 0.6833333333333333
Fold 4 accuracy = 0.75
Fold 5 accuracy = 0.6
Fold 6 accuracy = 0.7333333333333333
Fold 7 accuracy = 0.6610169491525424
Fold 8 accuracy = 0.7457627118644068
Fold 9 accuracy = 0.711864406779661
Fold 10 accuracy = 0.6779661016949152
Fold 11 accuracy = 0.6610169491525424
Fold 12 accuracy = 0.6949152542372882
Fold 13 accuracy = 0.6779661016949152
Fold 14 accuracy = 0.576271186440678
Fold 15 accuracy = 0.711864406779661

Mean Cross-Validation Accuracy: 0.687909604519774


---

<h1 align="center"> To Use The stratify Parameter Or Not To Use It? </h1>


---

# Not Using the Stratify Parameter


In [12]:
# Reload the complete file for this comparison.

df = pd.read_csv("titanic.csv")

# Classification task (predicting survival):
# the features (X) are 'pclass', 'fare', 'age', 'sibsp', 'parch' and the target (y) is 'survived'.
# NOTE(review): 'age' has missing values in this file (rows were dropped on it for the
# regression task) and nothing here removes or fills them, so the tree is trained on NaNs.
# That presumably requires a scikit-learn release whose decision trees accept missing
# values (1.3 or newer) - confirm the environment.

selected_features = ["pclass", "fare", "age", "sibsp", "parch"]
X, y = df[selected_features], df["survived"]

# Single 80/20 train-test split WITHOUT the stratify argument.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a Decision Tree Classifier and fit it on the training portion
# (fit() hands back the fitted estimator, so both steps fit on one line).

dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy of this single split.

predictions = dt_classifier.predict(X_test)
accuracy_test_split = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy (Train-Test Split): {accuracy_test_split}")

# 5-fold cross-validation of the same kind of model on the full data.
# For a classifier, cross_val_score turns an integer cv into unshuffled
# StratifiedKFold folds; that splitter is spelled out here. The cross-validation
# figure therefore does not depend on whether the split above was stratified.

cv_scores = cross_val_score(
    dt_classifier,
    X,
    y,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=5),
)

# Average the per-fold accuracies.

mean_accuracy_cv = cv_scores.mean()
print(f"Mean Accuracy (Cross-Validation): {mean_accuracy_cv}")

Accuracy (Train-Test Split): 0.7094972067039106
Mean Accuracy (Cross-Validation): 0.6600276191074006


---

# Using the Stratify Parameter


In [13]:
# Classification task (predicting survival), this time with a stratified split.
# The features (X) are 'pclass', 'fare', 'age', 'sibsp', 'parch' and the target (y) is 'survived'.
# df is not re-read in this cell; it is the data frame already present in the notebook.

selected_features = ["pclass", "fare", "age", "sibsp", "parch"]
X, y = df[selected_features], df["survived"]

# Single 80/20 train-test split, stratified on the target via stratify=y.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Build a Decision Tree Classifier and fit it on the training portion
# (fit() hands back the fitted estimator, so both steps fit on one line).

dt_classifier = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and report the accuracy of this single split.

predictions = dt_classifier.predict(X_test)
accuracy_test_split = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy (Train-Test Split): {accuracy_test_split}")

# 5-fold cross-validation of the same kind of model on the full data.
# For a classifier, cross_val_score turns an integer cv into unshuffled
# StratifiedKFold folds; that splitter is spelled out here. The cross-validation
# figure therefore does not depend on the stratify argument used above.

cv_scores = cross_val_score(
    dt_classifier,
    X,
    y,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=5),
)

# Average the per-fold accuracies.

mean_accuracy_cv = cv_scores.mean()
print(f"Mean Accuracy (Cross-Validation): {mean_accuracy_cv}")

Accuracy (Train-Test Split): 0.6536312849162011
Mean Accuracy (Cross-Validation): 0.6600276191074006
