In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the datasets (expected in the notebook's working directory)
df1 = pd.read_csv('train.csv')
df0 = pd.read_csv('test.csv')

# Preprocessing for training data.
# Only Pclass, Sex and Fare are used as features, in the same order as the
# columns selected for X_test, so the frame passed to fit() and the frame
# passed to predict() always agree. Age is not a feature, so it is no longer
# imputed before being discarded.
df = df1[['Survived', 'Pclass', 'Sex', 'Fare']].copy()

# Encode Sex numerically. Any value other than 'male'/'female' becomes NaN
# instead of silently staying a string.
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

X_train = df.drop(columns=['Survived'])
y_train = df['Survived']

In [3]:
# Preprocessing for test data
# Take an explicit copy: assigning into a bare column selection of df0 raises
# SettingWithCopyWarning and, under pandas Copy-on-Write, is not guaranteed to
# modify X_test at all.
X_test = df0[['Pclass', 'Sex', 'Fare']].copy()

# Same numeric encoding as the training data (male -> 0, female -> 1).
X_test['Sex'] = X_test['Sex'].map({'male': 0, 'female': 1})

# Fill missing fares with the mean fare of the test set. The result is
# assigned back to the column; `X_test['Fare'].fillna(..., inplace=True)`
# operates on a temporary Series and can leave the NaN in place.
mean_fare = X_test['Fare'].mean()
X_test['Fare'] = X_test['Fare'].fillna(mean_fare)

In [4]:
# Models
# Fixed seed for every estimator that uses randomness during fitting, so that
# the accuracy table and the saved prediction files are the same on every
# Restart & Run All.
SEED = 42

# Maps a display name to an unfitted estimator. The display name is also used
# (with spaces removed) to build the name of the prediction CSV.
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM Linear': SVC(kernel='linear'),
    'SVM Polynomial': SVC(kernel='poly', degree=3),
    'SVM RBF': SVC(kernel='rbf'),
    'SVM Sigmoid': SVC(kernel='sigmoid'),
    'Naive Bayes': GaussianNB(),
    'KNN': KNeighborsClassifier(n_neighbors=29),
    'Decision Tree': DecisionTreeClassifier(max_depth=3, random_state=SEED),
    'Random Forest': RandomForestClassifier(max_depth=3, n_estimators=100, random_state=SEED),
    'Bagging': BaggingClassifier(random_state=SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=SEED)
}


In [5]:
# Accuracy table: one [model name, accuracy in %] row per fitted model
accuracy_table = []

# Passenger ids are read from the test file rather than hard-coded as
# range(892, 1310), so the submission stays aligned with X_test if the test
# file changes. Assumes test.csv has a PassengerId column like train.csv.
passenger_ids = df0['PassengerId'].to_numpy()

# Train and predict for each model
for model_name, model in models.items():
    model.fit(X_train, y_train)

    # NOTE: this is accuracy on the training data the model was just fitted
    # on, not a held-out estimate, so it is optimistic for flexible models.
    predictions = model.predict(X_train)
    accuracy = round(accuracy_score(y_train, predictions) * 100, 2)

    # Record the score; without this the accuracy table stays empty.
    accuracy_table.append([model_name, accuracy])

    # Prediction for test data
    test_predictions = model.predict(X_test)
    test_results = pd.DataFrame({
        'PassengerId': passenger_ids,
        'Survived': test_predictions,
    })

    # Save predictions to CSV, e.g. 'predicted_RandomForest.csv'
    test_results.to_csv(f'predicted_{model_name.replace(" ", "")}.csv', index=False)

In [7]:
# Build the summary frame from the collected rows and show it as the cell's
# output (the bare last expression is rendered by the notebook).
table_columns = ['Model', 'Accuracy']
accuracy_df = pd.DataFrame(data=accuracy_table, columns=table_columns)
accuracy_df

Unnamed: 0,Model,Accuracy
0,Logistic Regression,78.68
1,SVM Linear,78.68
2,SVM Polynomial,64.31
3,SVM RBF,67.45
4,SVM Sigmoid,56.45
5,Naive Bayes,77.78
6,KNN,76.77
7,Decision Tree,81.48
8,Random Forest,81.82
9,Bagging,89.67
