In [11]:
#In this Jupyter Notebook, we will be working on the Titanic dataset from the Kaggle competition. The goal of this competition is to create a machine learning model that can predict passenger survival based on the given dataset. We will be using different algorithms like DecisionTreeClassifier, RandomForestClassifier, KNeighborsClassifier, GaussianNB, and GradientBoostingClassifier, and evaluating their performance using the accuracy score.

#The dataset can be found at: https://gist.githubusercontent.com/DariaAlekseeva/299611a0daa6008685f7/raw/b431eb200bcafd2eba68a11ccbc80d051d3eeba9/titanic.csv

Here is an overview of the code:

In [9]:
#For Data 
import pandas as pd
import numpy as np

#For Data Encoding
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

#For feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

#for data spliting for evaluation
from sklearn.model_selection import train_test_split

#for model selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

#for model pevulation
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix


In [10]:
#load dataset

df1 = pd.read_csv('train.csv')
df2 = pd.read_csv('test.csv')

In [7]:
#Data preprocessing 
def preprocess_data(data):
    data['Age'].fillna(data['Age'].median(), inplace=True)
    data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
    data['Fare'].fillna(data['Fare'].median(), inplace=True)
    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1
    data['IsAlone'] = 1
    data['IsAlone'].loc[data['FamilySize'] > 1] = 0
    data.drop(['Cabin', 'Ticket', 'Name', 'PassengerId','SibSp',  'Parch'], axis=1, inplace=True)
    return data


   
    


In [8]:
#fixed all the empty columns 
df1 = preprocess_data(df1) 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['IsAlone'].loc[data['FamilySize'] > 1] = 0


In [9]:
#encoding the categorical features

encoder = LabelEncoder()
encoder.fit(df1['Sex'])
df1['Sex'] = encoder.transform(df1['Sex'])
df1['Embarked'] = encoder.fit_transform(df1['Embarked'])

In [109]:
#scaleing up the features

scaler = StandardScaler()
df1['Age'] = scaler.fit_transform(np.array(df1['Age']).reshape(-1, 1))
df1['Fare'] = scaler.fit_transform(np.array(df1['Fare']).reshape(-1, 1))

In [113]:
#df1.drop(['age_scaled'],axis=1,inplace=True)

In [10]:
# Spletting the dataset
X = df1.drop('Survived', axis=1)
y = df1['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Model traning 

#- DecisionTreeClassifier

dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)

y_pred = dt_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Decision Tree Accuracy:", accuracy)


Decision Tree Accuracy: 0.776536312849162


In [12]:
#- RandomForestClassifier

rf_classifier = RandomForestClassifier(random_state=42)
parameters = {
    'n_estimators': [100, 200, 500],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10]
}

grid_search = GridSearchCV(estimator=rf_classifier, param_grid=parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_

rf_classifier = RandomForestClassifier(**best_params)
rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8044692737430168


In [14]:
#- Support Vector Machine

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Support Vector Machine Model
svc_classifier = SVC(random_state=42)
svc_classifier.fit(X_train_scaled, y_train)

# Model Evaluation
y_pred = svc_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("SVC Accuracy:", accuracy)



: 

: 

In [13]:
#-KNeighborsClassifier

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train_scaled, y_train)

y_pred = knn_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("K-Nearest Neighbors Accuracy:", accuracy)




K-Nearest Neighbors Accuracy: 0.7988826815642458


In [14]:
# Naive Bayes Model
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)


y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Accuracy:", accuracy)


Naive Bayes Accuracy: 0.7821229050279329


In [15]:
#- Gradient Boosting Model
gb_classifier = GradientBoostingClassifier(random_state=42)
gb_classifier.fit(X_train, y_train)

y_pred = gb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Gradient Boosting Classifier Accuracy:", accuracy)


Gradient Boosting Classifier Accuracy: 0.8044692737430168
