In [None]:
# Colab-only helper that exposes Google Drive inside the runtime
from google.colab import drive
# pandas supplies the DataFrame used by every later cell
import pandas as pd

# Make the Drive contents reachable under '/content/drive'
drive.mount('/content/drive')

# Location of the Titanic CSV inside the mounted Drive
path = "/content/drive/MyDrive/titanic_dataset/Titanic-Dataset.csv"
# Read the CSV into a DataFrame
df = pd.read_csv(path)
# Preview the first five rows as the cell output
df.head(n=5)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Columns used as model inputs
features = ["Pclass", "Sex", "Age", "Fare", "Embarked"]
# Column the models learn to predict
target = "Survived"

# Keep only the modelling columns. The explicit .copy() makes `df` an
# independent frame, so the column assignments performed in later cells
# modify this frame directly instead of a selection derived from the
# loaded data.
df = df[features + [target]].copy()

In [None]:
# Impute missing values: median for 'Age', most frequent value (mode) for
# 'Embarked'.
# A single DataFrame-level fillna with a per-column mapping replaces the
# chained `df[col].fillna(..., inplace=True)` form, which operates on an
# intermediate object, emits a FutureWarning, and stops updating `df`
# in pandas 3.0.
df = df.fillna(
    {
        "Age": df["Age"].median(),
        "Embarked": df["Embarked"].mode()[0],
    }
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)


In [None]:
# LabelEncoder maps each distinct category to an integer code
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column; both are kept as named variables
# because they are persisted later in the notebook
le_sex = LabelEncoder()
le_embarked = LabelEncoder()

# Fit each encoder on its column and overwrite the column with the codes.
# Note: because the columns are overwritten, running this cell a second time
# would fit the encoders on the integer codes rather than the original labels.
for column, encoder in (("Sex", le_sex), ("Embarked", le_embarked)):
    df[column] = encoder.fit_transform(df[column])

In [None]:
# StandardScaler standardizes each feature column
from sklearn.preprocessing import StandardScaler

# Inputs (X) and labels (y) taken from the prepared DataFrame
X, y = df[features], df[target]

# Learn the scaling parameters from X and apply them in one step.
# NOTE(review): the scaler is fit on every row, before the train/test split
# performed afterwards, so test rows influence the scaling statistics.
# Fitting on the training rows only would avoid that leakage.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# train_test_split partitions arrays into train and test subsets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Classifiers compared in this notebook
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Display name -> unfitted estimator.
# The random forest is randomized (bootstrap samples and feature subsets),
# so it is seeded with the same value as the train/test split to make its
# scores repeatable across runs.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier(),
}

# Fit every model on the training split; the fitted estimators stay in `models`
for name, model in models.items():
    model.fit(X_train, y_train)

In [None]:
# classification_report summarizes precision, recall, f1-score and support
from sklearn.metrics import classification_report

# Report each fitted model's performance on the held-out test split
for name, model in models.items():
    # Header line identifying the model
    print(f"\n{name}")

    # Predict the test labels, then print the per-class metrics
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))


Logistic Regression
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       105
           1       0.76      0.74      0.75        74

    accuracy                           0.80       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179


Random Forest
              precision    recall  f1-score   support

           0       0.85      0.84      0.84       105
           1       0.77      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.81      0.81      0.81       179
weighted avg       0.82      0.82      0.82       179


SVM
              precision    recall  f1-score   support

           0       0.79      0.90      0.84       105
           1       0.83      0.66      0.74        74

    accuracy                           0.80       179
   macro avg       0.81      0.78      0.79       179
weighted avg       0.81      0.80

In [None]:
# joblib serializes fitted estimators to disk
import joblib

# Choose the model to persist explicitly by its key in `models` instead of
# relying on the `model` variable left over from an earlier loop, whose
# value depends on which cell happened to run last.
# "KNN" is the final entry of `models`, i.e. the model the leftover loop
# variable referred to; change the key here to persist a different model.
MODEL_TO_SAVE = "KNN"
final_model = models[MODEL_TO_SAVE]

# Persist the chosen model
joblib.dump(final_model, "titanic_survival_model.pkl")
# Persist the fitted scaler so new inputs can be standardized the same way
joblib.dump(scaler, "scaler.pkl")
# Persist the fitted label encoders so raw categories can be encoded the same way
joblib.dump(le_sex, "le_sex.pkl")
joblib.dump(le_embarked, "le_embarked.pkl")

['le_embarked.pkl']

In [None]:
# Reload every persisted artifact: the model, the scaler and both label encoders
loaded_model = joblib.load("titanic_survival_model.pkl")
loaded_scaler = joblib.load("scaler.pkl")
loaded_le_sex = joblib.load("le_sex.pkl")
loaded_le_embarked = joblib.load("le_embarked.pkl")

# Describe the sample passenger with raw category labels and let the saved
# encoders produce the integer codes, rather than hand-typing codes that
# must match whatever mapping the encoders learned.
# The frame uses the training column names and order (`features`) so the
# scaler receives input in the same layout it was fitted on.
sample = pd.DataFrame(
    {
        "Pclass": [3],
        "Sex": loaded_le_sex.transform(["male"]),
        "Age": [25],
        "Fare": [7.25],
        "Embarked": loaded_le_embarked.transform(["S"]),
    }
)[features]

# Standardize the sample with the loaded scaler
sample_scaled = loaded_scaler.transform(sample)
# Predict survival for the sample (shown as the cell output)
loaded_model.predict(sample_scaled)



array([0])