# Modeling

Goal:
* Train several tree-based classifiers on the processed Titanic data and compare their accuracy on a held-out test split

In [9]:
import pandas as pd
import numpy as np

import joblib
import sys
import os

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score

In [10]:
from google.colab import drive

# Mount Google Drive so the project folder is reachable from the Colab VM.
drive.mount('/content/drive')

# Single source of truth for the project root. The folder name must be
# spelled "ML - Projects" so that it matches the location of the data and
# pipeline files; with a misspelled folder the `src` entry added to sys.path
# would point at a different directory.
project_path = "/content/drive/MyDrive/Pytorch pet projects/ML - Projects/ML - Titanic - Machine Learning from Disaster"

# Make the project's `src` directory importable. The membership check keeps
# the cell idempotent: re-running it does not add duplicate entries.
src_path = os.path.join(project_path, "src")
if src_path not in sys.path:
  sys.path.append(src_path)

# Load the processed dataset from <project_path>/data/processed/.
clean_df = pd.read_csv(os.path.join(project_path, "data", "processed", "processed_data.csv"))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Split data into train and test sets

In [11]:
def encode_sex_column(df):
  """Return a copy of ``df`` whose "Sex" column is encoded as numbers.

  "male" is mapped to 0 and "female" to 1. Any other value is not present
  in the mapping and therefore becomes NaN in the result.

  The input frame is NOT modified: a new DataFrame is returned. This keeps
  the function idempotent from the caller's point of view; mutating the
  input would turn every value into NaN if the same frame were passed
  through a second time, because the already-encoded 0/1 values are not
  keys of the mapping.

  NOTE(review): presumably this function has to exist in the notebook
  namespace so that the pickled preprocessing pipeline loaded below can be
  deserialized; confirm before renaming or moving it.

  Args:
    df: DataFrame containing a "Sex" column.

  Returns:
    A new DataFrame with the same columns, with "Sex" replaced by its
    numeric encoding.
  """
  return df.assign(Sex=df["Sex"].map({"male": 0, "female": 1}))

# Separate the target column from the feature columns.
y = clean_df["Survived"]
X = clean_df.drop("Survived", axis=1)

# Load the serialized preprocessing pipeline from Drive and apply it to the
# features. Only `transform` is called here, so the pipeline is presumably
# already fitted; verify against the notebook that saved it.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code, so only load pipeline files from a trusted source.
pipeline_path = "/content/drive/MyDrive/Pytorch pet projects/ML - Projects/ML - Titanic - Machine Learning from Disaster/src/preprocessing/preprocessing_pipeline.pkl"
pipeline_loaded = joblib.load(pipeline_path)
X_transformed = pipeline_loaded.transform(X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed,
    y,
    test_size=0.2,
    random_state=42,
)

## Decision tree

In [12]:
# Step 4: train the decision tree model (fit returns the fitted estimator).
model = DecisionTreeClassifier(random_state=42, max_depth=30).fit(X_train, y_train)

# Step 5: save the fitted model. The path is relative, so the file is
# written to the current working directory.
joblib.dump(model, "decision_tree.pkl")

# Step 6: evaluate the model on the training split and the held-out test split.
train_acc, test_acc = (
    model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)
print(f"Train Accuracy: {train_acc:.2f}, Test Accuracy: {test_acc:.2f}")

Train Accuracy: 0.98, Test Accuracy: 0.74


## Random Forest

In [13]:
# Random forest with 800 trees. The seed is fixed so that this cell, and the
# later cells that reuse `rf` (cross-validation, bagging), produce the same
# numbers on every run, consistent with the random_state=42 used for the
# train/test split and the decision tree.
rf = RandomForestClassifier(n_estimators=800, random_state=42)
rf.fit(X_train, y_train)

# Accuracy on the held-out test split.
pred = rf.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f"Accuracy: {acc:.4f}")


Accuracy: 0.7654


## Cross-validation score - RF

In [14]:
# Score `rf` with 10-fold cross-validation, using the training split only so
# the test split stays untouched.
cv_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=10)
mean_accuracy = cv_scores.mean()

# Report the per-fold scores first, then their average.
print("Cross-validation scores:", cv_scores)
print("Mean accuracy:", mean_accuracy)

Cross-validation scores: [0.80555556 0.68055556 0.70422535 0.84507042 0.83098592 0.73239437
 0.74647887 0.77464789 0.76056338 0.84507042]
Mean accuracy: 0.7725547730829421


## Bagging - RF

In [15]:
# Bagging ensemble built on top of the random forest `rf`.
# The previous unused `model = DecisionTreeClassifier(...)` assignment was
# removed: it was never passed to the ensemble and it overwrote the fitted
# decision tree stored in `model`.
# NOTE(review): `rf` is itself a forest of 800 trees, so this fits 100
# forests. Confirm that bagging the forest (rather than a single decision
# tree) is what is intended.
# random_state makes the bootstrap samples, and therefore the reported
# accuracy, reproducible across runs.
bagging = BaggingClassifier(estimator=rf, n_estimators=100, n_jobs=-1, random_state=42)

bagging.fit(X_train, y_train)

# Accuracy on the held-out test split.
pr = bagging.predict(X_test)
acc = accuracy_score(y_test, pr)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.7542


## Gradient Boosting

In [16]:
# Gradient boosting classifier.
# subsample=0.8 and max_features=0.2 both introduce randomness (row and
# feature subsampling), so a fixed random_state is required to get the same
# accuracy on every run.
# NOTE(review): max_depth=40 with learning_rate=0.5 is an unusually
# aggressive combination for boosting; consider tuning these with
# cross-validation.
gbt = GradientBoostingClassifier(n_estimators=300,
                                 max_depth=40,
                                 learning_rate=0.5,
                                 subsample=0.8,
                                 max_features=0.2,
                                 random_state=42)
gbt.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = gbt.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"Accuracy: {acc:.4f}")

Accuracy: 0.7318
