# Titanic Survivors

### Imports

In [129]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

### Input

In [130]:
# Load the passenger table and preview its first five rows.
df = pd.read_csv("tested.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [131]:
# Count missing values per column in the freshly loaded table.
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [132]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [133]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


### Working with features

In [134]:
# Encode Sex as an integer flag: male -> 0, female -> 1.
sex = {'male': 0, "female": 1}
# Map through a function rather than the dict itself: Series.map(dict) turns
# every value that is not a key into NaN, so re-running this cell on the
# already-encoded column would wipe it out. Passing unknown values through
# unchanged keeps the cell safe to re-run.
df["Sex"] = df["Sex"].map(lambda value: sex.get(value, value))

In [135]:
# How many passengers boarded at each port.
df.Embarked.value_counts()

Embarked
S    270
C    102
Q     46
Name: count, dtype: int64

In [136]:
# Encode the port of embarkation as an integer code: S -> 0, C -> 1, Q -> 2.
# NOTE(review): these codes impose an arbitrary order on a nominal category;
# one-hot encoding may suit the linear and SVC models better — confirm.
embarked = {'S': 0, "C": 1, "Q": 2}
# Map through a function rather than the dict itself: Series.map(dict) turns
# every value that is not a key into NaN, so re-running this cell on the
# already-encoded column would wipe it out. Passing unknown values through
# unchanged keeps the cell safe to re-run.
df["Embarked"] = df["Embarked"].map(lambda value: embarked.get(value, value))

In [137]:
# Fill missing Age and Fare values with the column mean.
# NOTE(review): the means are computed on the full table, before the
# train/test split further down, so test rows influence the imputed values.
# Consider imputing after the split using training-set statistics only.
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
# Cabin is missing for 327 of the 418 rows and is not used as a feature, so
# drop it. errors="ignore" keeps this cell re-runnable once the column is
# already gone (a plain drop would raise KeyError on the second run).
df = df.drop(columns="Cabin", errors="ignore")

In [138]:
# Re-check missing values after imputation and dropping Cabin.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

### Features we will use

In [139]:
# Model inputs are the passenger attributes below; the target is Survived.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[feature_columns]
y = df["Survived"]

### Model

In [140]:
# NOTE(review): this import would normally live in the imports cell at the top.
from sklearn.preprocessing import StandardScaler
# Scaler instance; it is fitted on the training split in the next cell.
scaler = StandardScaler()

In [141]:
# Split the dataset: hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Fit the scaler on the training split only, then apply that same fitted
# transform to the test split.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define models for binary classification
models = {
    'logistic_regression': LogisticRegression(
        penalty='l2',
        C=1.0,
        random_state=42
    ),
    'random_forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        random_state=42
    ),
    'svc': SVC(
        kernel='rbf',
        C=1.0,
        probability=True,  # kept so predict_proba stays available on the fitted SVC
        random_state=42
    )
}

# Train and evaluate: every model is fitted and scored on the scaled splits.
# The previously computed y_proba was never used, so that extra
# predict_proba pass per model has been removed.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    accuracy = accuracy_score(y_test, y_pred)

    print(f"{name} accuracy: {accuracy:.4f}")
    print(f"{name} confusion matrix:\n{confusion_matrix(y_test, y_pred)}\n")

    # Keep the fitted estimator together with its held-out accuracy.
    results[name] = {
        'model': model,
        'accuracy': accuracy
    }

Training logistic_regression...
logistic_regression accuracy: 1.0000
logistic_regression confusion matrix:
[[51  0]
 [ 0 33]]

Training random_forest...
random_forest accuracy: 1.0000
random_forest confusion matrix:
[[51  0]
 [ 0 33]]

Training svc...
svc accuracy: 0.9405
svc confusion matrix:
[[51  0]
 [ 5 28]]



### We will use the random forest

In [144]:
# Re-evaluate the chosen model on the held-out split.
# The forest was fitted on standardized features (X_train_scaled), so it must
# be scored on X_test_scaled. Passing the raw X_test would feed the trees
# values on a different scale than the split thresholds they learned.
y_pred = models['random_forest'].predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Confusion Matrix:
 [[51  0]
 [ 0 33]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        51
           1       1.00      1.00      1.00        33

    accuracy                           1.00        84
   macro avg       1.00      1.00      1.00        84
weighted avg       1.00      1.00      1.00        84





In [146]:
# Rank the input features by the fitted random forest's importance scores.
forest = models['random_forest']
importance_table = pd.DataFrame(
    {'Feature': X.columns, 'Importance': forest.feature_importances_}
)
feature_importance = importance_table.sort_values(by='Importance', ascending=False)

print(feature_importance)

    Feature  Importance
1       Sex    0.837876
5      Fare    0.059039
2       Age    0.041750
4     Parch    0.020316
3     SibSp    0.016567
6  Embarked    0.014634
0    Pclass    0.009817


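Random forest and logistic regression both reach 100% accuracy on the held-out split, ahead of the SVC (94%). Sex is by far the most important feature for the random forest (importance ≈ 0.84). A perfect score driven almost entirely by a single feature is suspicious: it suggests the `Survived` labels in this file may be derived from `Sex`, so the labels should be verified before these results are trusted.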