In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training data from CSV and preview its first rows.
original_titanic_train = pd.read_csv("titanic_train.csv")
original_titanic_train.head()

In [None]:
# Work on a copy so the frame loaded from disk is not modified by the cleaning steps.
cleaned_titanic_train = original_titanic_train.copy()

In [None]:
# Preview the first rows of the working copy.
cleaned_titanic_train.head()

In [None]:
# Column dtypes and non-null counts; used to spot columns with missing values.
cleaned_titanic_train.info()

In [None]:
# Show the rows whose Age is missing.
age_missing_mask = cleaned_titanic_train['Age'].isna()
cleaned_titanic_train[age_missing_mask]

In [None]:
# Mean-impute Age: compute the mean over the known ages, fill the gaps with it,
# then display the number of missing ages that remain.
age_series = cleaned_titanic_train['Age']
average_age = age_series.mean()
cleaned_titanic_train['Age'] = age_series.fillna(value=average_age)
cleaned_titanic_train['Age'].isna().sum()

In [None]:
# Show the rows whose Cabin is missing.
cabin_missing_mask = cleaned_titanic_train['Cabin'].isna()
cleaned_titanic_train[cabin_missing_mask]

In [None]:
# Show the rows whose Embarked is missing.
embarked_missing_mask = cleaned_titanic_train['Embarked'].isna()
cleaned_titanic_train[embarked_missing_mask]

In [None]:
# Number of rows whose PassengerId repeats an earlier row (0 means every ID is unique).
cleaned_titanic_train["PassengerId"].duplicated().sum()

In [None]:
# Frequency of each value in the Survived column.
cleaned_titanic_train["Survived"].value_counts()

In [None]:
# Frequency of each value in the Pclass column.
cleaned_titanic_train["Pclass"].value_counts()

In [None]:
# Frequency of each value in the Sex column.
cleaned_titanic_train["Sex"].value_counts()

In [None]:
# Frequency of each value in the Embarked column (missing values are not counted).
cleaned_titanic_train["Embarked"].value_counts()

In [None]:
# Summary statistics for the numeric columns.
cleaned_titanic_train.describe()

In [None]:
# Family members aboard = siblings/spouses (SibSp) + parents/children (Parch).
# Each column is counted exactly once so the training feature has the same
# definition as the FamilyNum column built for the test set.
cleaned_titanic_train['FamilyNum'] = cleaned_titanic_train['SibSp'] + cleaned_titanic_train['Parch']
cleaned_titanic_train.head()

In [None]:
# Use seaborn's "pastel" colour palette for the plots drawn after this cell.
sns.set_palette("pastel")

In [None]:
# Default figure size (width, height in inches) and automatic tight layout
# for every figure created after this cell.
plt.rcParams.update({
    "figure.figsize": [7.00, 3.50],
    "figure.autolayout": True,
})

In [None]:
# Pie chart of the Survived value frequencies, each slice annotated with its percentage.
survived_count = cleaned_titanic_train['Survived'].value_counts()
survived_label = survived_count.index
survived_fig, survived_ax = plt.subplots()
survived_ax.pie(x=survived_count, labels=survived_label, autopct='%.1f%%')
plt.show()

In [None]:
# Age distribution: histogram on the left panel, box plot on the right.
figure, axes = plt.subplots(nrows=1, ncols=2)
age_hist_ax, age_box_ax = axes
sns.histplot(data=cleaned_titanic_train, x='Age', ax=age_hist_ax)
sns.boxplot(data=cleaned_titanic_train, y='Age', ax=age_box_ax)
plt.show()

In [None]:
# Age histogram split by Survived; semi-transparent bars keep overlapping groups visible.
sns.histplot(
    data=cleaned_titanic_train,
    x='Age',
    hue='Survived',
    alpha=0.4,
)
plt.show()

In [None]:
# Fare distribution on a larger canvas: histogram on the left panel, box plot on the right.
figure, axes = plt.subplots(nrows=1, ncols=2, figsize=[15, 7])
fare_hist_ax, fare_box_ax = axes
sns.histplot(data=cleaned_titanic_train, x='Fare', ax=fare_hist_ax)
sns.boxplot(data=cleaned_titanic_train, y='Fare', ax=fare_box_ax)
plt.show()

In [None]:
# Fare histogram split by Survived, drawn on a single wide axes.
fare_fig, fare_ax = plt.subplots(nrows=1, ncols=1, figsize=[12, 5])
sns.histplot(
    data=cleaned_titanic_train,
    x='Fare',
    hue='Survived',
    alpha=0.4,
    ax=fare_ax,
)
plt.show()

In [None]:
# Pclass: pie chart of the class frequencies (left) and
# per-class passenger counts split by Survived (right).
pclass_count = cleaned_titanic_train['Pclass'].value_counts()
pclass_label = pclass_count.index
figure, axes = plt.subplots(nrows=1, ncols=2)
pclass_pie_ax, pclass_bar_ax = axes
pclass_pie_ax.pie(x=pclass_count, labels=pclass_label)
sns.countplot(data=cleaned_titanic_train, x='Pclass', hue='Survived', ax=pclass_bar_ax)
plt.show()

In [None]:
# Sex: pie chart of the sex frequencies (left) and
# counts per Survived value split by Sex (right).
sex_count = cleaned_titanic_train['Sex'].value_counts()
sex_label = sex_count.index
figure, axes = plt.subplots(nrows=1, ncols=2)
sex_pie_ax, sex_bar_ax = axes
sex_pie_ax.pie(x=sex_count, labels=sex_label)
sns.countplot(data=cleaned_titanic_train, x='Survived', hue='Sex', ax=sex_bar_ax)
plt.show()

In [None]:
# Embarked: pie chart of the port frequencies (left) and
# per-port passenger counts split by Survived (right).
embarked_count = cleaned_titanic_train['Embarked'].value_counts()
embarked_label = embarked_count.index
figure, axes = plt.subplots(nrows=1, ncols=2)
embarked_pie_ax, embarked_bar_ax = axes
embarked_pie_ax.pie(x=embarked_count, labels=embarked_label)
sns.countplot(data=cleaned_titanic_train, x='Embarked', hue='Survived', ax=embarked_bar_ax)
plt.show()

In [None]:
# FamilyNum: pie chart of the family-size frequencies (left) and
# counts per family size split by Survived (right).
familyNum_count = cleaned_titanic_train['FamilyNum'].value_counts()
familyNum_label = familyNum_count.index
figure, axes = plt.subplots(nrows=1, ncols=2)
family_pie_ax, family_bar_ax = axes
family_pie_ax.pie(x=familyNum_count, labels=familyNum_label)
sns.countplot(data=cleaned_titanic_train, x='FamilyNum', hue='Survived', ax=family_bar_ax)
plt.show()

In [None]:
# Re-check column dtypes and non-null counts before modelling.
cleaned_titanic_train.info()

In [None]:
import statsmodels.api as sm

In [None]:
# Separate copy for modelling, so dropping/encoding columns does not alter the cleaned frame.
lr_titanic_train = cleaned_titanic_train.copy()
lr_titanic_train.head()

In [None]:
# Remove the columns that are not used as model inputs.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked']
lr_titanic_train = lr_titanic_train.drop(columns=unused_columns)
lr_titanic_train.head()

In [None]:
# One-hot encode Pclass and Sex as integer 0/1 columns. The first level of each
# variable is dropped so the dummies are not collinear with the intercept.
train_categorical_columns = ['Pclass', 'Sex']
lr_titanic_train = pd.get_dummies(
    lr_titanic_train,
    columns=train_categorical_columns,
    drop_first=True,
    dtype=int,
)
lr_titanic_train.head()

In [None]:
# Response variable for the logistic regression.
y = lr_titanic_train['Survived']

In [None]:
# Predictors = every column except the response; display their pairwise correlations.
X = lr_titanic_train.drop(columns=['Survived'])
X.corr()

In [None]:
# Boolean matrix flagging predictor pairs whose absolute correlation exceeds 0.8.
X.corr().abs().gt(0.8)

In [None]:
# Drop Parch and SibSp from the predictors (FamilyNum is built from these two columns).
X = X.drop(columns=['Parch', 'SibSp'])

In [None]:
# Add a constant column so the model estimates an intercept.
X = sm.add_constant(X)

In [None]:
# Fit a logistic regression of y on X and display the fit summary.
model = sm.Logit(y, X).fit()
model.summary()

In [None]:
# Remove Fare from the predictors before refitting.
X = X.drop(columns=['Fare'])

In [None]:
# Refit the logistic regression on the current X and display the fit summary.
model = sm.Logit(y, X).fit()
model.summary()

In [None]:
# Age: odds ratio = exp(coefficient), i.e. the multiplicative change in the odds
# of survival per one-unit increase. Read from the fitted model rather than
# copied from the summary, so it always reflects the current fit.
np.exp(model.params['Age'])

In [None]:
# FamilyNum: odds ratio = exp(coefficient), i.e. the multiplicative change in the
# odds of survival per one-unit increase. Read from the fitted model rather than
# copied from the summary, so it always reflects the current fit.
np.exp(model.params['FamilyNum'])

In [None]:
# Pclass_2: odds ratio = exp(coefficient), relative to the dropped baseline class.
# Read from the fitted model rather than copied from the summary, so it always
# reflects the current fit.
np.exp(model.params['Pclass_2'])

In [None]:
# Pclass_3: odds ratio = exp(coefficient), relative to the dropped baseline class.
# Read from the fitted model rather than copied from the summary, so it always
# reflects the current fit.
np.exp(model.params['Pclass_3'])

In [None]:
# Sex_male: odds ratio = exp(coefficient), relative to the dropped baseline level.
# Read from the fitted model rather than copied from the summary, so it always
# reflects the current fit.
np.exp(model.params['Sex_male'])

In [None]:
# Load the data to predict on from CSV and preview its first rows.
titanic_test = pd.read_csv("titanic_test.csv")
titanic_test.head()

In [None]:
# Column dtypes and non-null counts of the test data; used to spot missing values.
titanic_test.info()

In [None]:
# Fill missing test ages with the mean of the known test ages, then display
# how many missing ages remain.
# NOTE(review): the training data was imputed with its own mean; consider
# whether the training mean should be reused here instead - confirm intent.
test_age = titanic_test['Age']
titanic_test['Age'] = test_age.fillna(value=test_age.mean())
titanic_test['Age'].isna().sum()

In [None]:
# Declare the full set of levels for each categorical column so that one-hot
# encoding yields the same dummy columns as in training, even if a level is
# absent from the test data.
# Pclass is numeric when read from the CSV, so its categories must be the
# integers 1/2/3: values that do not match a listed category become NaN, which
# would turn every Pclass dummy into 0.
titanic_test['Pclass'] = pd.Categorical(titanic_test['Pclass'], categories=[1, 2, 3])
titanic_test['Sex'] = pd.Categorical(titanic_test['Sex'], categories=['female', 'male'])
titanic_test['Embarked'] = pd.Categorical(titanic_test['Embarked'], categories=['C', 'Q', 'S'])

In [None]:
# One-hot encode Pclass and Sex as integer 0/1 columns, dropping the first
# level of each variable.
test_categorical_columns = ['Pclass', 'Sex']
titanic_test = pd.get_dummies(
    titanic_test,
    columns=test_categorical_columns,
    drop_first=True,
    dtype=int,
)
titanic_test.head()

In [None]:
# Fitted coefficients; their labels show which columns the test features must provide.
model.params

In [None]:
# Family members aboard for each test passenger: SibSp plus Parch.
titanic_test['FamilyNum'] = titanic_test['SibSp'].add(titanic_test['Parch'])
titanic_test.head()

In [None]:
# Select the predictor columns for the test data, then add the constant column
# for the intercept.
test_feature_columns = ['Age', 'FamilyNum', 'Pclass_2', 'Pclass_3', 'Sex_male']
X_test = sm.add_constant(titanic_test[test_feature_columns])

In [None]:
# Model prediction for every row of the test design matrix.
predicted_value = model.predict(X_test)
predicted_value

In [None]:
# Threshold the predictions at 0.5 to get a boolean per row.
predicted_value > 0.5