https://www.kaggle.com/code/mnassrib/titanic-logistic-regression-with-python

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn import preprocessing
import matplotlib.pyplot as plt

# Default font size for matplotlib figures.
# NOTE(review): the seaborn theme call later in the file may reset font sizes — confirm this setting survives.
plt.rcParams["font.size"] = 14

In [3]:
import seaborn as sns
# Apply the seaborn theme once: white background with grid lines, and remap
# matplotlib's single-letter colour codes to the seaborn palette. A preceding
# sns.set(style="white") call was dropped because this call replaced its
# settings immediately.
sns.set(style="whitegrid", color_codes=True)

In [4]:
import warnings
# Install a blanket "ignore" filter so no warning is printed from here on.
# NOTE(review): this also hides deprecation and convergence warnings raised by
# the libraries used below — consider narrowing the filter.
warnings.simplefilter(action='ignore')

In [5]:
# Locations of the train/test CSV files, relative to the working directory.
input_train_path, input_test_path = './train.csv', './test.csv'

In [6]:
# Load the training table first, then the test table, into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (input_train_path, input_test_path)
)

In [7]:
# Preview the first rows of the training table.
train_df.head()

In [8]:
# Report how many rows the training table holds.
print('The number of samples into the train data is {}.'.format(len(train_df)))

In [9]:
# Preview the first rows of the test table.
test_df.head()

In [10]:
# Report how many rows the test table holds.
print('The number of samples into the test data is {}.'.format(len(test_df)))

## data quality and missing data assessment

In [11]:
# Count the missing values in each training column.
train_df.isna().sum()

In [12]:
# Independent copy of the four columns whose correlation is inspected further down.
related_features_df = train_df.loc[:, ['Pclass', 'Age', 'SibSp', 'Parch']].copy()
related_features_df

In [13]:
# Summary statistics of the training table.
train_df.describe()

In [14]:
# Pairwise Pearson correlation between the selected features.
related_features_df.corr(method='pearson')

### Missing Age data

In [15]:
# Share of training rows with no recorded "Age", as a percentage.
print('Percent of missing "Age" records is %.2f%%' % (100 * (train_df['Age'].isna().sum() / len(train_df))))

In [16]:
# Normalised histogram of the recorded ages with a density curve drawn over it.
ax = train_df["Age"].hist(bins=20, density=True, color='teal', alpha=0.6)
train_df["Age"].plot.density(color='teal')
ax.set_xlabel('Age')
plt.xlim(0, 100)
plt.show()

In [17]:
# Mean and median of the recorded ages; missing entries are skipped (the pandas default).
print('The mean of "Age" is %.2f' % train_df["Age"].mean())
print('The median of "Age" is %.2f' % train_df["Age"].median())

In [18]:
# Draw one random age from the rows where "Age" is present.
train_df['Age'].dropna().sample()

### Missing Cabin data

In [19]:
# Share of training rows with no recorded "Cabin", as a percentage.
print('Percent of missing "Cabin" records is %.2f%%' % (100 * (train_df['Cabin'].isna().sum() / len(train_df))))

### Missing Embarked values

In [20]:
# Share of training rows with no recorded "Embarked", as a percentage.
print('Percent of missing "Embarked" records is %.2f%%' % (100 * (train_df['Embarked'].isna().sum() / len(train_df))))

In [21]:
# Passenger counts per port of embarkation: printed first, then drawn as a bar chart.
print(
    'Boarded passengers grouped by port of embarkation (C = Cherbourg, Q = Queenstown, S = Southampton):',
    train_df['Embarked'].value_counts(),
    sep='\n',
)
sns.countplot(data=train_df, x='Embarked', palette='Set2')
plt.show()

In [22]:
# Report the port code with the highest passenger count.
print('The most common boarding port of embarkation is %s.' % (train_df['Embarked'].value_counts().idxmax(),))

## adjustment of data

In [23]:
# Work on a copy so the raw training frame stays available for the
# raw-vs-adjusted comparison plot.
train_data = train_df.copy()
# Impute missing ages by carrying the previous row's age forward. The trailing
# bfill only matters when the very first rows are missing, which a forward
# fill alone would leave as NaN. The result is assigned back to the column:
# calling fillna(..., inplace=True) on a selected column acts on a temporary
# object under pandas Copy-on-Write and would leave the frame unchanged.
# Alternative: median imputation via train_df["Age"].median().
# TODO: try filling the gaps with randomly sampled observed ages instead.
train_data['Age'] = train_data['Age'].ffill().bfill()
# Missing ports of embarkation take the most frequent port in the training data.
train_data['Embarked'] = train_data['Embarked'].fillna(train_df['Embarked'].value_counts().idxmax())
# Cabin is dropped rather than imputed.
train_data = train_data.drop(columns='Cabin')
train_data.head()

In [24]:
# Overlay the age distribution before (teal) and after (orange) imputation.
plt.figure(figsize=(15, 8))
ax = train_df["Age"].hist(bins=15, density=True, stacked=True, color='teal', alpha=0.6)
train_df["Age"].plot.density(color='teal')
ax = train_data["Age"].hist(bins=15, density=True, stacked=True, color='orange', alpha=0.5)
train_data["Age"].plot.density(color='orange')
ax.legend(['Raw Age', 'Adjusted Age'])
ax.set_xlabel('Age')
plt.xlim(-10, 85)
plt.show()

In [25]:
# Flag passengers travelling alone: 1 when SibSp + Parch is not above zero, else 0.
train_data['TravelAlone'] = np.where(train_data["SibSp"].add(train_data["Parch"]).gt(0), 0, 1)
# SibSp and Parch stay in the frame; the drops below are disabled.
# train_data.drop('SibSp', axis=1, inplace=True)
# train_data.drop('Parch', axis=1, inplace=True)

In [26]:
# One-hot encode Pclass, Embarked and Sex, then remove Sex_female and the
# identifier/text columns in a single drop.
training = pd.get_dummies(train_data, columns=["Pclass", "Embarked", "Sex"]).drop(
    columns=['Sex_female', 'PassengerId', 'Name', 'Ticket']
)

final_train = training
final_train.head()

## Test data processing

In [27]:
# Count the missing values in each test column.
test_df.isna().sum()

In [28]:
# Build the test feature frame on a copy of the raw test table.
test_data = test_df.copy()
# Forward-fill missing ages; the trailing bfill covers a missing value in the
# first rows, which forward fill alone would leave as NaN. Results are
# assigned back to the column: fillna(..., inplace=True) on a selected column
# acts on a temporary object under pandas Copy-on-Write and would leave the
# frame unchanged.
test_data["Age"] = test_data["Age"].ffill().bfill()
# Missing fares take the median fare of the training data.
test_data["Fare"] = test_data["Fare"].fillna(train_df["Fare"].median())
test_data = test_data.drop(columns='Cabin')

# 1 when the passenger has no siblings/spouse/parents/children aboard, else 0.
test_data['TravelAlone'] = np.where((test_data["SibSp"] + test_data["Parch"]) > 0, 0, 1)

test_data = test_data.drop(columns=['SibSp', 'Parch'])

# One-hot encode the categorical columns, then remove Sex_female and the
# identifier/text columns. drop() keeps the remaining columns in their
# existing order, which the prediction step later in the file relies on.
testing = pd.get_dummies(test_data, columns=["Pclass", "Embarked", "Sex"])
testing = testing.drop(columns=['Sex_female', 'PassengerId', 'Name', 'Ticket'])

final_test = testing
final_test.head()

In [29]:
# Check that no missing values remain in the processed test features.
final_test.isna().sum()

## Exploratory Data Analysis

### Exploration of Age

In [37]:
# Compare the age density of survivors (Survived == 1) with that of
# non-survivors (Survived == 0) on shared axes.
plt.figure(figsize=(15, 8))
# fill=True replaces shade=True, which seaborn has deprecated; the deprecation
# warning is not visible in this file because warnings are silenced globally.
ax = sns.kdeplot(final_train.loc[final_train["Survived"] == 1, "Age"], color="darkturquoise", fill=True)
sns.kdeplot(final_train.loc[final_train["Survived"] == 0, "Age"], color="lightcoral", fill=True)
plt.legend(['Survived', 'Died'])
plt.title('Density Plot of Age for Surviving Population and Deceased Population')
ax.set(xlabel='Age')
plt.xlim(-10, 85)
plt.show()

In [38]:
# Mean survival rate for every distinct age, drawn as one bar per age.
plt.figure(figsize=(20, 8))
avg_survival_byage = final_train.groupby('Age', as_index=False)['Survived'].mean()
g = sns.barplot(data=avg_survival_byage, x='Age', y='Survived', color="LightSeaGreen")
plt.show()

In [39]:
# Binary flag on both frames: 1 for passengers aged 16 or younger, else 0.
final_train['IsChildren'] = final_train['Age'].le(16).astype(int)
final_test['IsChildren'] = final_test['Age'].le(16).astype(int)

## Logistic Regression

### feature selection

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

In [59]:
# Feature columns fed to the model, followed by the design matrix and target.
cols = [
    "Age", "Fare", "TravelAlone",
    "Pclass_1", "Pclass_2", "Pclass_3",
    "Embarked_C", "Embarked_Q", "Embarked_S",
    "Sex_male", "IsChildren",
]
train_x = final_train.loc[:, cols]
train_y = final_train.loc[:, 'Survived']

In [60]:
# Fit a logistic regression with default hyper-parameters on the training features.
# NOTE(review): warnings are silenced globally in this file, so a convergence
# warning would go unnoticed — confirm the solver converges.
LR = LogisticRegression().fit(train_x, train_y)
LR

In [61]:
# Display the processed test features.
final_test

In [62]:
# Display the training feature matrix.
train_x

In [64]:
# Predict on exactly the columns the model was trained on, in the same order.
# Passing the whole frame relied on final_test happening to match `cols`, and
# it failed on a re-run once 'Survived' (and later 'PassengerId') had been
# added to the frame.
final_test['Survived'] = LR.predict(final_test[cols])

In [65]:
# Re-attach the passenger ids from the raw test table; the assignment aligns
# on the row index.
final_test['PassengerId'] = test_df['PassengerId']

In [66]:
# Keep only the id and prediction columns, write them to submission.csv
# without the index, and show the last rows.
submission = final_test.loc[:, ['PassengerId', 'Survived']]
submission.to_csv("submission.csv", index=False)
submission.tail()