In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
# Read the raw training data and keep a trimmed working copy in `df`.
training_file = pd.read_csv('filepath/train.csv')

# Columns that are not used as model inputs in this notebook.
unused_columns = ['Name', 'Embarked', 'Fare', 'Ticket', 'Cabin', 'Age']
df = pd.DataFrame(training_file).drop(columns=unused_columns)

In [None]:
# Turn the textual 'Sex' column into integer codes; `encoder` keeps the fitted mapping.
encoder = LabelEncoder()
sex_codes = encoder.fit_transform(df['Sex'])
df = df.assign(sex_encoded=sex_codes).drop(columns=['Sex'])

# Label vector and feature matrix (the id and the label are not features).
target = df['Survived']
data = df.drop(columns=['PassengerId', 'Survived'])

In [None]:
# Hold out 30% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Seed shared by every estimator that has a stochastic component, so that the
# screening results are the same on every run of the notebook.
SEED = 42

# Candidate models and the hyper-parameter grid searched for each of them.
# Each entry maps a display name to:
#   'model'  - an unfitted estimator instance
#   'params' - the parameter grid handed to GridSearchCV
model_params = {
    'Logistic Regression': {
        'model': LogisticRegression(random_state=SEED),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'Decision Tree': {
        'model': DecisionTreeClassifier(random_state=SEED),
        'params': {
            'max_depth': [10, 20, None],
            'min_samples_split': [2, 10, 20]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(random_state=SEED),
        'params': {
            'n_estimators': [10, 30, 50, 100],
            'max_depth': [10, 20, None],
            # 'log_loss' is documented by scikit-learn as the same criterion as
            # 'entropy', so listing both only repeats identical fits.
            'criterion' : ['entropy', 'gini'],
            'min_samples_split' : [2, 3, 5]
        }
    },
    'Gradient Boosting': {
        'model': GradientBoostingClassifier(random_state=SEED),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    },
    'AdaBoost': {
        'model': AdaBoostClassifier(random_state=SEED),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=SEED),
        'params': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2]
        }
    },
    'K-Nearest Neighbors': {
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance']
        }
    },
    'SVM': {
        # probability=True is required for predict_proba; the probability
        # calibration shuffles data internally, hence the seed.
        'model': SVC(probability=True, random_state=SEED),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    },
    'Naive Bayes': {
        'model': GaussianNB(),
        'params': {}
    },
}

In [None]:
def model_screening(x_set, y_set, x_eval=None, y_eval=None):
    """Grid-search every entry of ``model_params`` and print one summary line per model.

    For each candidate a 3-fold ``GridSearchCV`` is fitted on ``x_set`` / ``y_set``.
    The positive-class probabilities of the fitted search are then used to
    compute a ROC AUC.

    Parameters
    ----------
    x_set, y_set
        Features and labels the grid search is fitted on.
    x_eval, y_eval : optional
        Features and labels the AUC is computed on. When both are omitted the
        AUC is computed on ``x_set`` / ``y_set`` themselves, i.e. on the same
        rows the model was fitted on, which gives an optimistic in-sample
        figure. Pass a held-out split to obtain an out-of-sample AUC.

    Raises
    ------
    ValueError
        If only one of ``x_eval`` / ``y_eval`` is supplied.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    if (x_eval is None) != (y_eval is None):
        raise ValueError("x_eval and y_eval must be provided together")
    if x_eval is None:
        # Default: score on the fitting data.
        x_eval, y_eval = x_set, y_set

    scores = []
    for model_name, mp in model_params.items():
        clf = GridSearchCV(mp['model'], mp['params'], cv=3, return_train_score=False)
        clf.fit(x_set, y_set)

        # Prediction of probabilities (column 1 = positive class)
        y_scores = clf.predict_proba(x_eval)[:, 1]

        # AUC calculation
        auc_score = roc_auc_score(y_eval, y_scores)

        scores.append({
            'model': model_name,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_,
            'auc_score': auc_score
        })

    # Results printing
    for score in scores:
        print(f"Model: {score['model']}, Best Score : {score['best_score']:.2f}, Best Params: {score['best_params']}, AUC: {score['auc_score']:.2f}")


In [None]:
# Run the screening once per split, separating the two reports with a blank line.
# NOTE(review): model_screening fits its grid search on whatever data it is
# given, so the "Validation set" report comes from models fitted on X_test
# itself - it is not a score of train-fitted models on held-out data.
splits = [
    ('Training set:', X_train, y_train),
    ('Validation set:', X_test, y_test),
]
for position, (heading, features, labels) in enumerate(splits):
    if position:
        print()
    print(heading)
    model_screening(features, labels)


Training set:
Model: Logistic Regression, Best Score : 0.79, Best Params: {'C': 10, 'solver': 'liblinear'}, AUC: 0.82
Model: Decision Tree, Best Score : 0.78, Best Params: {'max_depth': 10, 'min_samples_split': 2}, AUC: 0.88
Model: Random Forest, Best Score : 0.78, Best Params: {'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 10}, AUC: 0.87
Model: Gradient Boosting, Best Score : 0.79, Best Params: {'learning_rate': 0.01, 'n_estimators': 50}, AUC: 0.84
Model: AdaBoost, Best Score : 0.80, Best Params: {'learning_rate': 0.1, 'n_estimators': 100}, AUC: 0.84
Model: XGBoost, Best Score : 0.79, Best Params: {'learning_rate': 0.01, 'n_estimators': 50}, AUC: 0.85
Model: K-Nearest Neighbors, Best Score : 0.77, Best Params: {'n_neighbors': 5, 'weights': 'uniform'}, AUC: 0.83
Model: SVM, Best Score : 0.80, Best Params: {'C': 0.1, 'kernel': 'rbf'}, AUC: 0.84
Model: Naive Bayes, Best Score : 0.76, Best Params: {}, AUC: 0.83

Validation set:
Model: Logistic Regression, B

In [None]:
# Load the unlabeled test data and apply the same preprocessing as for training.
testing_file = pd.read_csv('filepath/test.csv')
df_task = pd.DataFrame(testing_file)
df_task.drop(['Name',
         'Embarked',
         'Fare',
         'Ticket',
         'Cabin',
         'Age'], axis = 'columns', inplace=True)

# Reuse the mapping learned on the training data: `transform`, not
# `fit_transform`. Re-fitting on the test set could assign different codes to
# the same category, and would hide categories never seen during training
# (transform raises on those instead).
df_task['sex_encoded'] = encoder.transform(df_task['Sex'])
df_task.drop(['Sex'], axis='columns', inplace=True)

# 'PassengerId' stays in df_task (needed for the submission) but is not a feature.
data_task = df_task.drop(['PassengerId'], axis='columns')

# The model must see exactly the training features, in the same order.
assert list(data_task.columns) == list(data.columns), "test features differ from training features"

In [None]:
# Final model: a random forest fitted on the complete labelled data set
# (`data` / `target`) and applied to the preprocessed test features.
model = RandomForestClassifier(n_estimators=10,
                               criterion='entropy',
                               min_samples_split=5,
                               max_depth=10,
                               # Fixed seed: without it every run grows a different
                               # forest and the submission is not reproducible.
                               random_state=42
                               )


model.fit(data, target)
model.predict(data_task)

array([0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [None]:
# Sanity check: number of predictions produced for the test set.
n_predictions = len(model.predict(data_task))
print(n_predictions)

418


In [None]:
# Assemble the submission table: one row per test passenger, id first.
# The ids come from the test frame itself rather than a hard-coded range, so
# they always line up with the rows that were predicted, and the column is
# named 'PassengerId' exactly as in the source data.
df_final = pd.DataFrame({
    'PassengerId': df_task['PassengerId'].to_numpy(),
    'Survived': model.predict(data_task),
})
path = 'path/submision.csv'
df_final.to_csv(path, index=False)
