In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns 
import matplotlib.pyplot as plt



# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the Titanic training split inside the Kaggle input mount.
train_set = '/kaggle/input/titanic/train.csv'

# Parse the CSV into the raw frame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=train_set)
df.head(n=5)

In [None]:
# Summary statistics of the raw training frame.
df.describe()

In [None]:
# Per-column dtype and non-null count; used to spot the columns with missing values.
df.info()

The first thing one notices when looking at the dataset is the presence of missing values in the following columns: *Cabin*, *Embarked* and *Age*. We therefore drop the *Cabin* column, because it has too many missing values, and drop the passengers whose age is not recorded in the dataset (the few passengers with a missing *Embarked* value are dropped by the same step).

In [None]:
# Remove the columns that are not used as model inputs; the raw frame `df` is left untouched.
df_preprocessed = df.drop(['Cabin', 'Name', 'Ticket'], axis='columns')

In [None]:
# Keep only the rows in which every remaining column has a value.
df_preprocessed = df_preprocessed.dropna(axis=0, how='any')

In [None]:
# Horizontal box plot of the ticket fares.
df_preprocessed['Fare'].plot.box(vert=False, figsize=(14, 6))

In [None]:
# Age histograms, one panel per sex, coloured by survival outcome.
# The bins and the plotted rows both come from df_preprocessed, so the figure
# shows exactly the cleaned data used in the rest of the analysis (the original
# plotted the raw `df` against bins computed on the cleaned frame).
bins = np.linspace(df_preprocessed.Age.min(), df_preprocessed.Age.max(), 10)
g = sns.FacetGrid(df_preprocessed, col="Sex", hue="Survived", palette="Set1", col_wrap=2)
g.map(plt.hist, 'Age', bins=bins, ec="k")

# With col_wrap the grid axes form a flat array; attach the hue legend to the last panel.
g.axes[-1].legend()
plt.show()

In [None]:
# Share of survivors / non-survivors within each sex.
df_preprocessed.groupby(by=['Sex']).Survived.value_counts(normalize=True)

Here one can see, on the preprocessed data, that women seem to survive better than men, and that younger men are also more likely to survive. Now let's convert male to 0 and female to 1:

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on the column selection: that chained in-place
# call operates on an intermediate object, triggers a pandas warning, and does
# not update df_preprocessed when copy-on-write is active.
sex_codes = {'male': 0, 'female': 1}
# `.get(v, v)` leaves already-encoded values untouched, so re-running this cell is harmless.
df_preprocessed['Sex'] = df_preprocessed['Sex'].map(lambda v: sex_codes.get(v, v)).astype(int)
df_preprocessed.head()

We will also one-hot encode the values of the *Embarked* column (the embarkation point of each passenger).

In [None]:
# One indicator column per distinct Embarked value, appended to the frame;
# the original categorical column is then removed.
embarked_dummies = pd.get_dummies(df_preprocessed['Embarked'])
df_preprocessed = df_preprocessed.join(embarked_dummies).drop(columns=['Embarked'])
df_preprocessed.head()

In [None]:
# Pairwise correlation matrix of the preprocessed columns (reused by the heat map cell).
corr = df_preprocessed.corr()

corr

In [None]:
# Draw the correlation matrix as a colour grid, labelling both axes with the column names.
fig = plt.figure(figsize=(8, 8))
plt.matshow(corr, cmap='RdBu', fignum=fig.number)
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, rotation='vertical')
plt.yticks(tick_positions, corr.columns);

In [None]:
# Feature columns, listed once and reused for both the design matrix and the plot legends.
names = ['PassengerId', 'Pclass', 'Sex', 'Age', 'Parch', 'Fare', 'C', 'Q', 'S']  # variable names
X = df_preprocessed[names]
# Binary target: the Survived column.
y = df_preprocessed['Survived']

In [None]:
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from time import time
from sklearn.svm import l1_min_c

# Standardise every feature (fit the scaler on X, then transform X) in one call.
# NOTE(review): the scaler is fitted on all rows before the train/test split made
# in a later cell, so test rows influence the scaling statistics — confirm this is acceptable.
X = preprocessing.StandardScaler().fit_transform(X)

In [None]:
# Grid of C values: 30 log-spaced multiples (1x to 1e4x) of the l1_min_c bound.
cs = l1_min_c(X, y, loss='log') * np.logspace(0, 4, 30)

print("Computing regularization path ...")
start = time()
# A single L1-penalised estimator is refitted for each C; warm_start reuses the
# previous solution as the starting point of the next fit.
clf = LogisticRegression(
    penalty='l1',
    solver='saga',
    tol=1e-6,
    max_iter=int(1e6),
    warm_start=True,
)
coefs_, beta_l1norm = [], []
for c in cs:
    clf.set_params(C=c).fit(X, y)
    weights = clf.coef_.ravel().copy()
    coefs_.append(weights)
    # L1 norm of the coefficient vector obtained for this C.
    beta_l1norm.append(np.sum(np.abs(weights)))
print("This took %0.3fs" % (time() - start))

# One row per C value, one column per feature.
betas = np.array(coefs_)

In [None]:
# Display the lasso path: each coefficient plotted against the L1 norm of the
# whole coefficient vector (one curve per feature, one marker per C value).
plt.figure(figsize=(12,6))
plt.plot(beta_l1norm, betas, marker='o')
plt.xlabel('l1 norm of beta')
plt.ylabel('Coefficients')
plt.title('Logistic Regression Path')
plt.axis('tight')
plt.legend(names, fontsize=14)
# plt.grid expects a boolean; the string 'On' only worked because it is truthy.
plt.grid(True)
plt.show()

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for the final evaluation; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=4)

print("Computing K-fold CV ...")
# K fold cross validation  (K=5)
start = time()
# Candidate C values (C is the INVERSE of the L1 penalty strength):
# 50 log-spaced multiples (1x to 100x) of the l1_min_c bound computed on the training rows.
cs = l1_min_c(X_train, y_train, loss='log') * np.logspace(0, 2, 50)
# max_iter matches the regularization-path estimator: the default of 100 iterations
# is too few for saga to reach tol=1e-6. random_state pins saga's data shuffling
# so the selected C is reproducible.
model = LogisticRegressionCV(
    Cs=cs,
    cv=5,
    penalty='l1',
    solver='saga',
    tol=1e-6,
    max_iter=int(1e6),
    random_state=4,
).fit(X_train, y_train)
print("This took %0.3fs" % (time() - start))

In [None]:
# `model` is now tuned with the C value selected by cross-validation.
# Note: model.C_ holds C, the inverse of the penalty strength (not lambda itself);
# the variable keeps its historical name.
lambda_cv = model.C_[0]
# L1 norm of the CV-selected coefficients. Stored under its own name so the
# per-C list `beta_l1norm` used by the regularization-path plot is not overwritten.
beta_l1norm_cv = np.sum(np.abs(model.coef_))

print('CV estimates:')
print('- C = {:.3f} (inverse of the penalty strength), which yields ||beta||_1 = {:.3f}\n'.format(lambda_cv, beta_l1norm_cv))
print('CV weights for standardized variables:')
betas_cv = pd.DataFrame.from_records(model.coef_, columns=names, index=['Weights'])
# The intercept must come from the CV-tuned `model`; `clf` is the separate
# regularization-path estimator and its intercept belongs to a different fit.
betas_cv['intercept'] = model.intercept_
betas_cv.head()

In [None]:
from sklearn.metrics import accuracy_score
# Predict the held-out rows with the CV-tuned model and report the fraction classified correctly.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("\nResults for the Test data set: %0.3f" % test_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [None]:
# Confusion matrix analysis

cl = [0, 1]  # the two values of the Survived target
# Predict with the CV-tuned `model`. The original used `clf`, the
# regularization-path estimator, which was fitted on ALL rows (test rows
# included) and therefore gives an optimistic, unrelated evaluation.
y_pred = model.predict(X_test)
# labels=cl keeps the matrix 2x2 (matching the frame below) even if a class is absent.
cm = confusion_matrix(y_test, y_pred, labels=cl)

# Rows are the actual classes, columns the predicted classes.
df_cm = pd.DataFrame(cm, index=cl, columns=cl)
plt.figure(figsize = (5,4))
sns.heatmap(df_cm, annot=True, fmt='g', cmap='Blues')
plt.xlabel('Predicted classes')
plt.ylabel('Actual classes')
plt.title('Confusion matrix')
plt.show()

In [None]:
# Per-class precision, recall and F1 (average=None returns one score per class),
# computed from the current y_pred and collected in a table indexed by class label.
p, r, fs = (
    scorer(y_test, y_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
)
metric_columns = {'Precision': p, 'Recall': r, 'F1 Score': fs}
df_metrics = pd.DataFrame(metric_columns, index=cl)
df_metrics