# Exercises
1. Use the scikit-learn train_test_split() method (documentation here) to split the titanic data into a training data set (say, two thirds of the data) and a test data set (one third of the data).

In [57]:
import numpy as np
import seaborn as sns
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [11]:
titanic_df = sns.load_dataset('titanic')
titanic_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [14]:
clas = pd.get_dummies(titanic_df['class'])
clas

Unnamed: 0,First,Second,Third
0,False,False,True
1,True,False,False
2,False,False,True
3,True,False,False
4,False,False,True
...,...,...,...
886,False,True,False
887,True,False,False
888,False,False,True
889,True,False,False


In [15]:
gender = pd.get_dummies(titanic_df['sex'])
gender

Unnamed: 0,female,male
0,False,True
1,True,False
2,True,False
3,True,False
4,False,True
...,...,...
886,False,True
887,True,False
888,True,False
889,False,True


In [36]:
X = pd.concat([clas.First, clas.Second, gender.female, titanic_df.age], axis=1)
X = sm.add_constant(X)

# Convert boolean values to integers (0 and 1)
X[['First', 'Second', 'female']] = X[['First', 'Second', 'female']].astype(int)

X

Unnamed: 0,const,First,Second,female,age
0,1.0,0,0,0,22.0
1,1.0,1,0,1,38.0
2,1.0,0,0,1,26.0
3,1.0,1,0,1,35.0
4,1.0,0,0,0,35.0
...,...,...,...,...,...
886,1.0,0,1,0,27.0
887,1.0,1,0,1,19.0
888,1.0,0,0,1,
889,1.0,1,0,0,26.0


In [37]:
y = titanic_df['survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [51]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
    )

In [59]:
X_train

Unnamed: 0,const,First,Second,female,age
331,1.0,1,0,0,45.5
733,1.0,0,1,0,23.0
382,1.0,0,0,0,32.0
704,1.0,0,0,0,26.0
813,1.0,0,0,1,6.0
...,...,...,...,...,...
106,1.0,0,0,1,21.0
270,1.0,1,0,0,
860,1.0,0,0,0,41.0
435,1.0,1,0,1,14.0


In [60]:
X_test

Unnamed: 0,const,First,Second,female,age
709,1.0,0,0,0,
439,1.0,0,1,0,31.0
840,1.0,0,0,0,20.0
720,1.0,0,1,1,6.0
39,1.0,0,0,1,14.0
...,...,...,...,...,...
433,1.0,0,0,0,17.0
773,1.0,0,0,0,
25,1.0,0,0,1,38.0
84,1.0,0,1,1,17.0


In [61]:
y_train

331    0
733    0
382    0
704    0
813    0
      ..
106    1
270    0
860    0
435    1
102    0
Name: survived, Length: 712, dtype: int64

In [62]:
y_test

709    1
439    0
840    0
720    1
39     1
      ..
433    0
773    0
25     1
84     1
10     1
Name: survived, Length: 179, dtype: int64

2. Re-train the OLS model above using your newly-created training data set. Using the test data set, test the model's quality, e.g., with respect to:

   * Accuracy (using a standard 50% binary classification threshold)
   * Area under the receiving operator characteristic curve (we covered this in Calculus II).

In [52]:
model = sm.Logit(y_train, X_train, missing='drop') # some rows contain NaN

In [53]:
result = model.fit()

Optimization terminated successfully.
         Current function value: 0.467030
         Iterations 6


In [54]:
result.summary()

0,1,2,3
Dep. Variable:,survived,No. Observations:,572.0
Model:,Logit,Df Residuals:,567.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 29 Feb 2024",Pseudo R-squ.:,0.3083
Time:,09:59:06,Log-Likelihood:,-267.14
converged:,True,LL-Null:,-386.22
Covariance Type:,nonrobust,LLR p-value:,2.306e-50

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-1.4563,0.273,-5.343,0.000,-1.990,-0.922
First,2.2230,0.301,7.381,0.000,1.633,2.813
Second,1.2521,0.266,4.700,0.000,0.730,1.774
female,2.4833,0.226,10.972,0.000,2.040,2.927
age,-0.0277,0.008,-3.368,0.001,-0.044,-0.012


In [56]:
beta = result.params
beta

const    -1.456299
First     2.222981
Second    1.252119
female    2.483344
age      -0.027736
dtype: float64

In [65]:
y_pred_prob = result.predict(X_test)

In [66]:
y_pred = (y_pred_prob > 0.5).astype(int)

In [68]:
# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy score
print("Accuracy:", accuracy)

Accuracy: 0.7597765363128491
