In [153]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [154]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
print(train.isnull().sum(),test.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64 PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [155]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [156]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


#### EDA

In [157]:
def feature_engineering(df):
    df_copy = df.copy()

    df_copy['Age'].fillna(df_copy['Age'].median(), inplace=True)

    df_copy['Fare'].fillna(df_copy.groupby('Pclass')['Fare'].transform('median'), inplace=True)

    df_copy['Embarked'].fillna(df_copy['Embarked'].mode()[0], inplace=True)

    df_copy['Title'] = df_copy['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)

    title_mapping = {
        'Mr': 'Mr', 'Miss': 'Miss', 'Mrs': 'Mrs', 'Master': 'Master',
        'Dr': 'Rare', 'Rev': 'Rare', 'Col': 'Rare', 'Major': 'Rare',
        'Mlle': 'Miss', 'Ms': 'Miss', 'Lady': 'Rare', 'Countess': 'Rare',
        'Don': 'Rare', 'Dona': 'Rare', 'Mme': 'Mrs', 'Capt': 'Rare',
        'Sir': 'Rare', 'Jonkheer': 'Rare'
    }
    df_copy['Title'] = df_copy['Title'].map(title_mapping)

    df_copy['AgeGroup'] = pd.cut(df_copy['Age'], bins=[0, 12, 18, 35, 60, 100],
                                labels=['Child', 'Teenager', 'Adult', 'Middle', 'Senior'])

    df_copy['FareGroup'] = pd.qcut(df_copy['Fare'], 4, labels=['Low', 'Medium', 'High', 'VeryHigh'])

    return df_copy

In [158]:
train_processed = feature_engineering(train)
test_processed = feature_engineering(test)

In [159]:
def preprocess_data(df):
    df_copy = df.copy()

    columns_to_drop = ['Name', 'Ticket', 'Cabin']
    df_copy = df_copy.drop([col for col in columns_to_drop if col in df_copy.columns], axis=1)

    categorical_cols = ['Sex', 'Embarked', 'Title', 'AgeGroup', 'FareGroup']
    label_encoders = {}

    for col in categorical_cols:
        if col in df_copy.columns:
            le = LabelEncoder()
            df_copy[col] = le.fit_transform(df_copy[col].astype(str))

    return df_copy

In [160]:
train_final = preprocess_data(train_processed)
test_final = preprocess_data(test_processed)

In [161]:
train_final.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeGroup,FareGroup
0,1,0,3,1,22.0,1,0,7.25,2,2,0,1
1,2,1,1,0,38.0,1,0,71.2833,0,3,2,3
2,3,1,3,0,26.0,0,0,7.925,2,1,0,2
3,4,1,1,0,35.0,1,0,53.1,2,3,0,3
4,5,0,3,1,35.0,0,0,8.05,2,2,0,2


In [162]:
test_final.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,AgeGroup,FareGroup
0,892,3,1,34.5,0,0,7.8292,1,2,0,1
1,893,3,0,47.0,1,0,7.0,2,3,2,1
2,894,2,1,62.0,0,0,9.6875,1,2,3,2
3,895,3,1,27.0,0,0,8.6625,2,2,0,2
4,896,3,0,22.0,1,1,12.2875,2,3,0,2


In [163]:
train_final = train_final.drop('Survived', axis=1)
target = train['Survived']

In [164]:
std = StandardScaler()
train_df = std.fit_transform(train_final)
test_df = std.transform(test_final)

#### Models

In [165]:
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(random_state=42),
    'SVM': SVC(random_state=42)
}

In [166]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [167]:
model_scores = {}
for name, model in models.items():
    scores = cross_val_score(model, train_df, target, cv=cv, scoring='accuracy')
    model_scores[name] = scores.mean()
    print(f"{name}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

RandomForest: 0.8384 (+/- 0.0376)
LogisticRegression: 0.7957 (+/- 0.0241)
SVM: 0.8339 (+/- 0.0167)


In [168]:
best_model = max(model_scores, key=model_scores.get)
print(f"best model name: {best_model}")

best model name: RandomForest


#### Random Forest Classification

In [169]:
rf = RandomForestClassifier(n_estimators=100,random_state=42)

In [170]:
rf.fit(train_df, target)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [171]:
test_predictions = rf.predict(test_df)

#### Submission

In [172]:
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived': test_predictions
})

submission.to_csv('submission_optimized.csv', index=False)
print(submission.head())

   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         0
