# 0. Import

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier

# 1. Loading Data

In [2]:
# Read the train and test CSVs in one pass; the test-set passenger IDs are kept
# aside because the ID column is dropped during cleaning
data, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
test_ids = test.PassengerId

# 2. Data Preprocessing


## 2.1 Data Cleaning


In [3]:
# Function to clean the data by dropping unnecessary columns and filling missing values
def clean(data):
  """Return a cleaned copy of a passenger DataFrame.

  Steps:
    * drop the 'PassengerId', 'Name', 'Ticket' and 'Cabin' columns
      (columns that are already absent are skipped, so calling this on an
      already-cleaned frame is a no-op instead of a KeyError);
    * fill missing 'Embarked' values with the placeholder category 'U';
    * fill missing 'SibSp', 'Parch', 'Fare' and 'Age' values with the median
      of that column in the frame being cleaned.

  Parameters:
    data: pandas DataFrame containing at least the 'Embarked', 'SibSp',
      'Parch', 'Fare' and 'Age' columns.

  Returns:
    A new DataFrame; the frame passed in is not modified.
  """
  # drop() returns a new frame, so the assignments below never touch the caller's object
  data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'], errors='ignore')

  # Assign the result back instead of `data[col].fillna(..., inplace=True)`:
  # the chained in-place form works on a temporary under pandas copy-on-write
  # and would leave the NaNs in place.
  data['Embarked'] = data['Embarked'].fillna('U')

  columns1 = ['SibSp', 'Parch', 'Fare', 'Age']
  for i in columns1:
    data[i] = data[i].fillna(data[i].median())

  return data

# Clean the training frame first, then the test frame, rebinding both names
data, test = map(clean, (data, test))

## 2.2 Data Encoding

In [4]:
# Turn the string-valued columns into integer codes.
# The encoder is (re)fitted on the training column only and the same mapping is
# then applied to the matching test column.
le = LabelEncoder()

columns2 = ['Sex', 'Embarked']
for col in columns2:
  le.fit(data[col])
  data[col] = le.transform(data[col])
  test[col] = le.transform(test[col])

# 3. Model Building

## 3.1 Training models with default parameters

In [5]:
# Separate the target column ('Survived') from the feature columns
y = data['Survived']
X = data.drop(columns='Survived')

In [6]:
# Candidate classifiers to compare, keyed by a short label.
# Apart from the fixed seed (where the estimator accepts one) and the raised
# iteration cap for LogisticRegression, every model uses its default settings.
models = dict(
    SVC=SVC(random_state=37),
    KNC=KNeighborsClassifier(),
    MNB=MultinomialNB(),
    LogR=LogisticRegression(random_state=37, max_iter=1000),
    DTC=DecisionTreeClassifier(random_state=37),
    RFC=RandomForestClassifier(random_state=37),
    GBC=GradientBoostingClassifier(random_state=37),
    BC=BaggingClassifier(random_state=37),
    AB=AdaBoostClassifier(random_state=37),
)

In [7]:
# Rank every candidate model by its mean 5-fold cross-validation score
def train_model(clf, X, y):
  """Return the per-fold scores of `clf` from 5-fold cross-validation on (X, y)."""
  return cross_val_score(clf, X, y, cv=5)

# label -> mean of the five fold scores
results = {
    label: train_model(estimator, X, y).mean()
    for label, estimator in models.items()
}

# One row per model, best score first (the original row index is kept)
RS = pd.DataFrame({
    'Model': list(results.keys()),
    'Score': list(results.values()),
}).sort_values(by='Score', ascending=False)
RS

Unnamed: 0,Model,Score
6,GBC,0.821568
8,AB,0.8115
7,BC,0.811462
5,RFC,0.810345
3,LogR,0.789003
4,DTC,0.775582
1,KNC,0.693641
2,MNB,0.688099
0,SVC,0.674616


## 3.2 Training 3 best models

In [8]:
# Keep only the three best-scoring models and define a hyper-parameter grid
# for each of them (keys of `params_grid` match the keys of `models`)
models = dict(
    GBC=GradientBoostingClassifier(random_state=37),
    AB=AdaBoostClassifier(random_state=37),
    BC=BaggingClassifier(random_state=37),
)

params_grid = {}

params_grid['GBC'] = dict(
    n_estimators=list(range(75, 201, 25)),   # 75, 100, ..., 200
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=list(range(1, 12, 2)),         # 1, 3, ..., 11
)

params_grid['AB'] = dict(
    n_estimators=list(range(50, 151, 50)),   # 50, 100, 150
    learning_rate=[0.01, 0.1, 1],
)

params_grid['BC'] = dict(
    n_estimators=[10, 50, 100, 150, 200],
    max_samples=[0.25, 0.5, 0.75, 1.0],
    max_features=[0.25, 0.5, 0.75, 1.0],
)

In [9]:
# Tune each shortlisted model: hold out 20% of the labelled data, grid-search
# with 5-fold CV on the remaining 80%, then score the refit best estimator on
# the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=37)

# All four dicts are keyed by the model label used in `models` / `params_grid`
best_params = {}      # winning hyper-parameter combination
best_estimators = {}  # estimator refit on X_train with the winning combination
best_scores = {}      # mean CV score of the winning combination
accuracys = {}        # accuracy of the refit estimator on the held-out split

for model_name, model in models.items():
  clf_grid = GridSearchCV(estimator=model, param_grid=params_grid[model_name], cv=5, n_jobs=-1)
  clf_grid.fit(X_train, y_train)
  best_estimator = clf_grid.best_estimator_
  y_pred = best_estimator.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  best_params[model_name] = clf_grid.best_params_
  # Previously these two dicts were created but never filled, so the tuned
  # estimators and their cross-validation scores were thrown away.
  best_estimators[model_name] = best_estimator
  best_scores[model_name] = clf_grid.best_score_
  accuracys[model_name] = accuracy

In [10]:
# Show the winning hyper-parameters per model as the cell output
('Params: ', best_params)

('Params: ',
 {'GBC': {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 175},
  'AB': {'learning_rate': 0.1, 'n_estimators': 100},
  'BC': {'max_features': 0.75, 'max_samples': 0.25, 'n_estimators': 50}})

In [11]:
# Show the held-out accuracy per model as the cell output
('Accuracy: ', accuracys)

('Accuracy: ',
 {'GBC': 0.8156424581005587,
  'AB': 0.8156424581005587,
  'BC': 0.8156424581005587})

# 4. Submission Prediction

In [12]:
# All three tuned models reached the same held-out accuracy, so any of them can
# be used; gradient boosting is refit here on the full labelled data.
clf_params = best_params['GBC']
# Reuse the seed the model was tuned with (random_state=37) so the refit is
# reproducible; the dict merge lets an explicit 'random_state' in the tuned
# parameters take precedence instead of raising a duplicate-keyword error.
clf = GradientBoostingClassifier(**{'random_state': 37, **clf_params}).fit(X, y)
submisson_pred = clf.predict(test)

# Pair every prediction with the passenger ID saved before cleaning
df = pd.DataFrame({'PassengerId': test_ids.values,
                   'Survived': submisson_pred})

In [13]:
# Write the predictions to the submission CSV, leaving out the row index
df.to_csv('gender_submission.csv', index=False)