In [523]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# New Section

In [551]:
train_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/lab_5/train.csv')
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [552]:
test_df = pd.read_csv('/content/drive/MyDrive/Colab_Notebooks/lab_5/test.csv')
test_df_copy = test_df.copy()

In [553]:
train_df.isnull().sum()

Unnamed: 0,0
PassengerId,0
Survived,0
Pclass,0
Name,0
Sex,0
Age,177
SibSp,0
Parch,0
Ticket,0
Fare,0


In [554]:
test_df_copy.isnull().sum()

Unnamed: 0,0
PassengerId,0
Pclass,0
Name,0
Sex,0
Age,86
SibSp,0
Parch,0
Ticket,0
Fare,1
Cabin,327


In [557]:
def age_category(row):
    if row < 12:
        return 'children'
    elif 12 <= row < 18:
        return 'teenage'
    elif 18 <= row < 60:
        return 'adult'
    elif row >= 60:
        return 'elder'
    else:
        return 'no age'

In [558]:
import re
def get_title(name):
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)  # Return the title
    return ""

In [559]:
def fare_category(fare):
    if fare < 7.91:
        return 'low fare'
    elif 7.91 <= fare < 14.454:
        return 'medium fare'
    elif 14.454 <= fare < 31:
        return 'average fare'
    elif fare >= 31:
        return 'high fare'
    else:
        return 'unknown'

In [560]:
def preprocess_data(df):
  df.loc[:, 'Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
  df['Fare'] = df['Fare'].fillna(df['Fare'].median())
  df['Fare_Category'] = df['Fare'].apply(fare_category)
  df.drop(['Cabin'], axis=1)
  df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
  df['Ticket_Frequency'] = df.groupby('Ticket')['Ticket'].transform('count')
  df['Family_Grouped'] = pd.cut(df['FamilySize'],
                                  bins=[0, 1, 4, 6, 11],
                                  labels=['Alone', 'Small Family', 'Medium Family', 'Large Family'],
                                  include_lowest=True)
  df['Age'] = df['Age'].fillna(df['Age'].median())
  df['Age_Category'] = df['Age'].apply(age_category)
  df['Title'] = df['Name'].apply(get_title)
  df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don',
                                   'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
  df['Title'] = df['Title'].replace('Mlle', 'Miss')
  df['Title'] = df['Title'].replace('Ms', 'Miss')
  df['Title'] = df['Title'].replace('Mme', 'Mrs')

  return df

In [561]:
train = preprocess_data(train_df)
test = preprocess_data(test_df_copy)

In [562]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Fare_Category,FamilySize,Ticket_Frequency,Family_Grouped,Age_Category,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,low fare,2,1,Small Family,adult,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,high fare,2,1,Small Family,adult,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,medium fare,1,1,Alone,adult,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,high fare,2,2,Small Family,adult,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,medium fare,1,1,Alone,adult,Mr


In [573]:
train['FamilySize'].unique()

array([ 2,  1,  5,  3,  7,  6,  4,  8, 11])

In [575]:
def normalize_data(df):
  cont_features = df[['Ticket_Frequency', 'Fare']]
  X_cont = cont_features.values
  scaler = StandardScaler()
  X_cont_norm = scaler.fit_transform(X_cont)
  return X_cont_norm

In [576]:
def one_hot(df):
  # get categorical features
  cat_features = df[['Sex', 'Pclass', 'Embarked', 'Fare_Category', 'Family_Grouped', 'Age_Category', 'Title', 'FamilySize']]
  X_cat = cat_features.values
  # one-hot encode categorical features
  encoder = OneHotEncoder()
  X_cat_encoded = encoder.fit_transform(X_cat).toarray()
  return X_cat_encoded

In [577]:
def combine_data(X_cont_norm, X_cat_encoded):
  X = np.concatenate((X_cont_norm, X_cat_encoded), axis=1)
  return X

In [578]:
norm_train = normalize_data(train)
norm_test = normalize_data(test)

In [579]:
one_hot_train = one_hot(train)
one_hot_test = one_hot(test)

In [580]:
combine_train = combine_data(norm_train, one_hot_train)
combine_test = combine_data(norm_test, one_hot_test)

In [603]:
y = train_df['Survived'] 

X_train, X_val, y_train, y_val = train_test_split(combine_train, y, test_size=0.3, random_state=42)

log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)


y_pred_log = log_model.predict(X_val)

accuracy_logis = accuracy_score(y_val, y_pred_log)
accuracy_logis

0.8171641791044776

In [604]:
test_y_pred_log = log_model.predict(combine_test)

In [605]:
rfc = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                             max_depth=None, max_features='sqrt', max_leaf_nodes=None,
                             min_impurity_decrease=0.0, 
                             min_samples_leaf=1, min_samples_split=2,
                             min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
                             oob_score=False, random_state=None, verbose=0, warm_start=False)
rfc.fit(X_train, y_train)
y_pred_rfc = rfc.predict(X_val)

accuracy_rdf = accuracy_score(y_val, y_pred_rfc)
accuracy_rdf

0.7985074626865671

In [606]:
test_y_pred_rfc = rfc.predict(combine_test)

In [446]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(rfc, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average CV score:", np.mean(cv_scores))

Cross-validation scores: [0.792      0.784      0.792      0.83064516 0.82258065]
Average CV score: 0.8042451612903226


In [447]:
from sklearn.svm import SVC

svc = SVC(probability=True)  # RBF kernel for non-linearity
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_val)

accuracy_score(y_val, y_pred_svc)

0.835820895522388

In [448]:
test_y_pred_svc = svc.predict(combine_test)

In [449]:
from sklearn.neighbors import KNeighborsClassifier
def find_best_k(X, y):

    best_score = -1
    best_k = 2  # Starting from 2 as K=1 can lead to overfitting

    # Loop through possible values for n_neighbors
    for k in range(2, 10):
        knn = KNeighborsClassifier(n_neighbors=k)

        # Perform cross-validation to evaluate the model
        scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy') 
        mean_score = scores.mean()  

        # Check if this is the best score
        if mean_score > best_score:
            best_score = mean_score
            best_k = k

    return best_k, best_score

best_k, best_score = find_best_k(X_train, y_train)
print(f"Best K: {best_k}, Best Score: {best_score}")

Best K: 5, Best Score: 0.8218709677419355


In [450]:
knn = KNeighborsClassifier(n_neighbors = 5)
knn.fit(X_train, y_train)
Y_pred = knn.predict(X_val)
acc_knn = round(knn.score(X_train, y_train) * 100, 2)
acc_knn

87.0

In [451]:
test_y_pred_knn = knn.predict(combine_test)

In [452]:
acc_knn_val = round(knn.score(X_val, y_val) * 100, 2)
print(f"Validation accuracy with K=4: {acc_knn_val}")

knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train, y_train)
acc_knn_best_val = round(knn_best.score(X_val, y_val) * 100, 2)
print(f"Validation accuracy with K=8 (best K): {acc_knn_best_val}")

Validation accuracy with K=4: 81.34
Validation accuracy with K=8 (best K): 81.34


In [457]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred_gbc = gbc.predict(X_val)
acc_gbc = round(gbc.score(X_val, y_val) * 100, 2)
print(f"Gradient Boosting Accuracy: {acc_gbc}")

Gradient Boosting Accuracy: 81.72


In [458]:
test_y_pred_gbc = gbc.predict(combine_test)

In [459]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred_nb = nb.predict(X_val)
acc_nb = round(nb.score(X_val, y_val) * 100, 2)
print(f"Naive Bayes Accuracy: {acc_nb}")

Naive Bayes Accuracy: 81.34


In [460]:
test_y_pred_nb = nb.predict(combine_test)

In [461]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
y_pred_ada = ada.predict(X_val)
acc_ada = round(ada.score(X_val, y_val) * 100, 2)
print(f"AdaBoost Accuracy: {acc_ada}")



AdaBoost Accuracy: 81.72


In [462]:
test_y_pred_ada = ada.predict(combine_test)

In [463]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[
    ('lr', log_model),
    ('rf', rfc),
    ('sv', svc),
    ('knn', knn_best),
    ('gbc', gbc),
    ('nb', nb),
    ('ada', ada)
], voting='soft')

# Train the Voting Classifier
voting_clf.fit(X_train, y_train)

# Predict and evaluate accuracy on validation set
y_pred = voting_clf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Ensemble Model Accuracy: {accuracy:.4f}")

Ensemble Model Accuracy: 0.8172




In [465]:
test_y_pred_voting = voting_clf.predict(combine_test)

In [592]:
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],  
    'Survived':test_y_pred_rfc
})

submission.to_csv('titanic_submission.csv', index=False)

from google.colab import files
files.download('titanic_submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>