In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw competition files from the local `data` directory.
# Forward slashes are used because they resolve on Windows as well as on
# POSIX systems, whereas a backslash-separated path only works on Windows.
train_data = pd.read_csv('./data/train.csv')
test_data = pd.read_csv('./data/test.csv')

train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Count the missing values in every column of the raw training frame.
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
# Columns removed from both frames before modelling.
dropped_columns = ['Name', 'Cabin', 'Ticket']

# Training rows that still contain a missing value are discarded.
# `drop` already returns a new frame, so `train_data` itself is left untouched.
dataframe = train_data.drop(columns=dropped_columns).dropna()

# Test rows must not be discarded: the final submission is built from
# `test_dataframe`, so dropping a row here would silently leave that passenger
# without a prediction. Missing test values are instead filled with statistics
# computed on the cleaned training frame (median for numeric columns, most
# frequent value otherwise).
test_dataframe = test_data.drop(columns=dropped_columns)

fill_values = {}
for column in test_dataframe.columns:
    train_column = dataframe[column]
    if pd.api.types.is_numeric_dtype(train_column):
        fill_values[column] = train_column.median()
    else:
        fill_values[column] = train_column.mode().iloc[0]

test_dataframe = test_dataframe.fillna(fill_values)

In [5]:
# Confirm that the cleaned training frame has no missing values left.
dataframe.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       0
dtype: int64

In [6]:
# Preview the first five rows of the cleaned test frame.
test_dataframe.head(n=5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [7]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, fitted on the TRAINING data only.
# Re-fitting an encoder on the test set would derive the integer codes from
# whatever categories happen to be present there, so the same category could
# receive a different code in train and test. Reusing the fitted encoders
# guarantees an identical mapping; `transform` raises ValueError if the test
# set contains a category that was never seen during fitting.
sex_encoder = LabelEncoder().fit(dataframe['Sex'])
embarked_encoder = LabelEncoder().fit(dataframe['Embarked'])

dataframe['Sex'] = sex_encoder.transform(dataframe['Sex'])
dataframe['Embarked'] = embarked_encoder.transform(dataframe['Embarked'])
test_dataframe['Sex'] = sex_encoder.transform(test_dataframe['Sex'])
test_dataframe['Embarked'] = embarked_encoder.transform(test_dataframe['Embarked'])

dataframe.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,2
1,2,1,1,0,38.0,1,0,71.2833,0
2,3,1,3,0,26.0,0,0,7.925,2
3,4,1,1,0,35.0,1,0,53.1,2
4,5,0,3,1,35.0,0,0,8.05,2


In [8]:
# Feature matrix: every column of the cleaned training frame except the target.
# NOTE(review): this keeps PassengerId as a feature; it looks like a plain row
# identifier, so confirm that including it is intentional.
X = dataframe.drop(columns=['Survived'])
# Target vector: the survival label.
y = dataframe.loc[:, 'Survived']

In [9]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the full cleaned training frame.
model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=200, verbose=0)
model.fit(X, y)

# Training-set accuracy. The value is not displayed because this is not the
# cell's last expression; the original author noted it as roughly 80%.
model.score(X, y)

# Predictions for every row of the prepared test frame.
prediction_from_logistic = model.predict(test_dataframe)

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree with no depth limit on the full cleaned training frame
# (`fit` returns the estimator itself), then predict the prepared test frame.
model_tree = DecisionTreeClassifier(random_state=0).fit(X, y)
prediction_from_tree_model = model_tree.predict(test_dataframe)

In [11]:
# Put both models' test-set predictions side by side for comparison.
predictions_by_model = dict(
    Logistic_Regression=prediction_from_logistic,
    Decision_Tree=prediction_from_tree_model,
)
result_dataframe = pd.DataFrame(data=predictions_by_model)

# Written with the default index column.
# NOTE(review): a later cell writes 'result.csv'; on a case-insensitive
# filesystem that is the same file as this one and would overwrite it.
result_dataframe.to_csv("Result.csv")
result_dataframe.head()

Unnamed: 0,Logistic_Regression,Decision_Tree
0,0,0
1,0,0
2,0,0
3,0,1
4,1,0


In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 25% of the rows for validation. With shuffle=False the split keeps
# the original row order, so the held-out part is the tail of the frame.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=False,
    random_state=0,
)

# Logistic Regression validation: refit on the training split only.
# This rebinds `model`, replacing the estimator that was fitted on all of X.
model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=500, verbose=0)
result_log = model.fit(X_train, y_train).predict(X_test)

print(f"Logistic Regression model accuracy score: {accuracy_score(y_test, result_log)}")
print(f"Logistic Regression model precision score: {precision_score(y_test, result_log, average='macro')}")
confusion_matrix(y_test, result_log)

Logistic Regression model accuracy score: 0.8146067415730337
Logistic Regression model precision score: 0.8057734344863058


array([[88, 20],
       [13, 57]], dtype=int64)

In [14]:
# Decision Tree validation: a depth-limited tree fitted on the training split
# only, scored on the held-out split.
# This rebinds `model_tree`, replacing the tree that was fitted on all of X.
model_tree = DecisionTreeClassifier(random_state=0, max_depth=15)
result_tree = model_tree.fit(X_train, y_train).predict(X_test)

print(f"Decision tree model accuracy score: {accuracy_score(y_test, result_tree)}")
print(f"Decision tree model precision score: {precision_score(y_test, result_tree, average='macro')}")
confusion_matrix(y_test, result_tree)

Decision tree model accuracy score: 0.7921348314606742
Decision tree model precision score: 0.784002722940776


array([[92, 16],
       [21, 49]], dtype=int64)

In [16]:
# Final predictions for the prepared test frame.
# `model_tree` is whichever estimator was bound last; in notebook order that is
# the max_depth=15 tree fitted on X_train only.
# NOTE(review): consider refitting on the full X, y before predicting.
result = model_tree.predict(X=test_dataframe)

In [25]:
# Assemble the submission: each PassengerId in the prepared test frame paired
# with its predicted label, written without the index column.
submission_columns = {
    'PassengerId': test_dataframe['PassengerId'],
    'Survived': result,
}
result_df = pd.DataFrame(submission_columns)
result_df.to_csv('result.csv', index=False)