In [8]:
import pandas as pd
import numpy as np
import random as rnd


import seaborn as sns
import matplotlib.pyplot as plt



from sklearn.linear_model import LogisticRegression

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score



train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')


full_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)




full_df['Title'] = full_df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


title_mapping = {
    "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs",
    "Lady": "Royalty", "Countess": "Royalty", "Dona": "Royalty",
    "Sir": "Royalty", "Don": "Royalty", "Jonkheer": "Royalty",
    "Capt": "Officer", "Col": "Officer", "Major": "Officer", "Rev": "Officer", "Dr": "Officer"
}
full_df['Title'] = full_df['Title'].replace(title_mapping)
rare_titles = full_df['Title'].value_counts()[full_df['Title'].value_counts() < 10].index
full_df['Title'] = full_df['Title'].replace(rare_titles, 'Rare')


full_df['FamilySize'] = full_df['SibSp'] + full_df['Parch'] + 1


full_df['IsAlone'] = 1  
full_df.loc[full_df['FamilySize'] > 1, 'IsAlone'] = 0  


full_df['Deck'] = full_df['Cabin'].str[0]
full_df['Deck'] = full_df['Deck'].fillna('U')  


full_df['FarePerPerson'] = full_df['Fare'] / full_df['FamilySize']


full_df['Age*Class'] = full_df['Age'] * full_df['Pclass']


full_df['Embarked'] = full_df['Embarked'].fillna(full_df['Embarked'].mode()[0])


full_df['Fare'] = full_df['Fare'].fillna(full_df['Fare'].median())


age_df = full_df[['Age', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Title']]

age_df = pd.get_dummies(age_df, columns=['Sex', 'Title'], drop_first=True)

known_age = age_df[age_df['Age'].notnull()]
unknown_age = age_df[age_df['Age'].isnull()]

X_train_age = known_age.drop('Age', axis=1)
y_train_age = known_age['Age']
X_test_age = unknown_age.drop('Age', axis=1)

rfr = RandomForestRegressor(n_estimators=1000, random_state=42)
rfr.fit(X_train_age, y_train_age)
predicted_ages = rfr.predict(X_test_age)

full_df.loc[full_df['Age'].isnull(), 'Age'] = predicted_ages

scaler = StandardScaler()
num_features = ['Age', 'Fare', 'FarePerPerson', 'FamilySize', 'Age*Class']
full_df[num_features] = scaler.fit_transform(full_df[num_features])


full_df = pd.get_dummies(full_df, columns=['Sex', 'Embarked', 'Title', 'Deck'], drop_first=True)


drop_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch']
full_df.drop(columns=drop_columns, inplace=True)


train_df_processed = full_df.iloc[:len(train_df)].copy()
test_df_processed = full_df.iloc[len(train_df):].copy()

train_df_processed.reset_index(drop=True, inplace=True)
test_df_processed.reset_index(drop=True, inplace=True)


y_train = train_df['Survived'].reset_index(drop=True)


X_train = train_df_processed.drop('Survived', axis=1)


if 'Survived' in test_df_processed.columns:
    test_df_processed.drop('Survived', axis=1, inplace=True)

X_test = test_df_processed.copy()


print("NaN values in X_train:")
print(X_train.isnull().sum().sum())

print("Infinite values in X_train:")
print(np.isinf(X_train).sum().sum())


if X_train.isnull().sum().sum() > 0 or np.isinf(X_train).sum().sum() > 0:
    
    X_train.fillna(0, inplace=True)
    X_train.replace([np.inf, -np.inf], 0, inplace=True)
   

print("\nNaN values in X_test:")
print(X_test.isnull().sum().sum())

print("Infinite values in X_test:")
print(np.isinf(X_test).sum().sum())


if X_test.isnull().sum().sum() > 0 or np.isinf(X_test).sum().sum() > 0:
    X_test.fillna(0, inplace=True)
    X_test.replace([np.inf, -np.inf], 0, inplace=True)


models = []
models.append(('Logistic Regression', LogisticRegression(max_iter=1000)))
models.append(('Support Vector Machine', SVC(probability=True)))
models.append(('K-Nearest Neighbors', KNeighborsClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=100)))
models.append(('Naive Bayes', GaussianNB()))

print("Model Evaluation:")
results = []
names = []
for name, model in models:
    kfold = StratifiedKFold(n_splits=5)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print(f"{name}: Mean Accuracy = {cv_results.mean():.4f}, Std = {cv_results.std():.4f}")


best_model = RandomForestClassifier(n_estimators=100)
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred.astype(int)
})


submission.to_csv('submission.csv', index=False)

import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold


models = []
models.append(('Logistic Regression', LogisticRegression(max_iter=1000)))
models.append(('Support Vector Machine', SVC(probability=True)))
models.append(('K-Nearest Neighbors', KNeighborsClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Random Forest', RandomForestClassifier(n_estimators=100)))
models.append(('Naive Bayes', GaussianNB()))
models.append(('Linear SVC', SVC(kernel='linear', probability=True)))
models.append(('Perceptron', Perceptron()))
models.append(('Stochastic Gradient Descent', SGDClassifier()))


results = []
names = []
mean_scores = []

print("Model Evaluation:")
for name, model in models:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    mean_score = cv_results.mean() * 100  # Convert to percentage
    std_score = cv_results.std() * 100
    mean_scores.append(mean_score)
    names.append(name)
    print(f"{name}: Mean Accuracy = {mean_score:.2f}%, Std = {std_score:.2f}%")


model_performance = pd.DataFrame({
    'Model': names,
    'Score': mean_scores
})

model_performance.sort_values(by='Score', ascending=False, inplace=True)
model_performance.reset_index(drop=True, inplace=True)


print("\nModel Performance:")
print(model_performance)


NaN values in X_train:
177
Infinite values in X_train:
0

NaN values in X_test:
87
Infinite values in X_test:
0
Model Evaluation:
Logistic Regression: Mean Accuracy = 0.8272, Std = 0.0119
Support Vector Machine: Mean Accuracy = 0.8316, Std = 0.0239
K-Nearest Neighbors: Mean Accuracy = 0.8070, Std = 0.0313
Decision Tree: Mean Accuracy = 0.7767, Std = 0.0318


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Random Forest: Mean Accuracy = 0.8193, Std = 0.0416
Naive Bayes: Mean Accuracy = 0.7654, Std = 0.0339
Model Evaluation:
Logistic Regression: Mean Accuracy = 82.60%, Std = 1.01%
Support Vector Machine: Mean Accuracy = 82.83%, Std = 1.62%
K-Nearest Neighbors: Mean Accuracy = 82.04%, Std = 2.18%
Decision Tree: Mean Accuracy = 78.56%, Std = 1.33%


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Random Forest: Mean Accuracy = 82.38%, Std = 1.00%
Naive Bayes: Mean Accuracy = 77.89%, Std = 1.18%
Linear SVC: Mean Accuracy = 82.94%, Std = 1.52%
Perceptron: Mean Accuracy = 77.44%, Std = 3.78%
Stochastic Gradient Descent: Mean Accuracy = 81.26%, Std = 1.83%

Model Performance:
                         Model      Score
0                   Linear SVC  82.941435
1       Support Vector Machine  82.827192
2          Logistic Regression  82.603729
3                Random Forest  82.378382
4          K-Nearest Neighbors  82.039420
5  Stochastic Gradient Descent  81.257297
6                Decision Tree  78.564434
7                  Naive Bayes  77.888394
8                   Perceptron  77.444605
