In [1]:
%matplotlib inline

In [3]:
import numpy as np
import pandas as pd

In [4]:
titanic_train = pd.read_csv("train_data.csv")
char_cabin = titanic_train["Cabin"].astype(str)
#titanic_train["Cabin"]
new_Cabin = np.array([cabin[0] for cabin in char_cabin])
titanic_train["Cabin"] = pd.Categorical(new_Cabin) 

new_age_var =  np.where(titanic_train["Age"].isnull(),
                       28,
                       titanic_train["Age"])
titanic_train["Age"] = new_age_var

In [5]:
from sklearn import linear_model
from sklearn import preprocessing 

In [6]:
# Initialize label encoder
label_encoder  = preprocessing.LabelEncoder()

#Convert Sex variable to numeric
encoded_sex = label_encoder.fit_transform(titanic_train["Sex"])
log_model = linear_model.LogisticRegression()
#Initialize logistic regression model
log_model.fit(X = pd.DataFrame(encoded_sex),
             y = titanic_train["Survived"])

#trained model intercept
print(log_model.intercept_)

#trained model coefficients
print(log_model.coef_)

[1.00027876]
[[-2.43010712]]


In [7]:
#Make predictions
preds= log_model.predict_proba(X = pd.DataFrame(encoded_sex))
preds = pd.DataFrame(preds)
preds.columns = ["Death_prob","Survival_prob"]
pd.crosstab(titanic_train["Sex"],preds.loc[:,"Survival_prob"])

Survival_prob,0.19312542897248655,0.7311133823315542
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0,314
male,577,0


In [8]:
#more x variables to numeric
encoded_class = label_encoder.fit_transform(titanic_train["Pclass"])
encoded_cabin =  label_encoder.fit_transform(titanic_train["Cabin"])

train_features = pd.DataFrame([encoded_class,encoded_cabin,encoded_sex,titanic_train["Age"]]).T
train_features.head()

Unnamed: 0,0,1,2,3
0,2.0,8.0,1.0,22.0
1,0.0,2.0,0.0,38.0
2,2.0,8.0,0.0,26.0
3,0.0,2.0,0.0,35.0
4,2.0,8.0,1.0,35.0


In [9]:
log_model = linear_model.LogisticRegression()
log_model.fit(X = train_features,
             y = titanic_train["Survived"])
print(log_model.intercept_)
print(log_model.coef_) #for the four independent variables

[3.32716302]
[[-0.90790164 -0.06426483 -2.43179802 -0.0265924 ]]


In [10]:
#predictions
prediction = log_model.predict(X = train_features)
pd.crosstab(prediction,titanic_train["Survived"])

Survived,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,463,98
1,86,244


In [11]:
accuracy = (463+244)/890
print(accuracy)

0.7943820224719101


In [12]:
#using sklearn to find accuracy
log_model.score(X=train_features, y = titanic_train["Survived"])

0.7934904601571269

In [13]:
from sklearn import metrics
metrics.confusion_matrix(y_true=titanic_train["Survived"],y_pred=prediction)
print(metrics.classification_report(y_true=titanic_train["Survived"],y_pred=prediction))

             precision    recall  f1-score   support

          0       0.83      0.84      0.83       549
          1       0.74      0.71      0.73       342

avg / total       0.79      0.79      0.79       891



In [14]:
#test data
titanic_test = pd.read_csv("test_data.csv")
char_cabin = titanic_test["Cabin"].astype(str)
new_Cabin = np.array([cabin[0] for cabin in char_cabin])
titanic_test["Cabin"] = pd.Categorical(new_Cabin)
new_age_var = np.where(titanic_test["Age"].isnull(),
                      28,
                      titanic_test["Age"])
titanic_test["Age"] = new_age_var

In [15]:
encoded_sex = label_encoder.fit_transform(titanic_test["Sex"])
encoded_class = label_encoder.fit_transform(titanic_test["Pclass"])
encoded_cabin = label_encoder.fit_transform(titanic_test["Cabin"])

test_features = pd.DataFrame([encoded_class,encoded_cabin,encoded_sex,titanic_test["Age"]]).T

In [16]:
#prediction
test_preds = log_model.predict(X=test_features)
submission =  pd.DataFrame({"PassengerId":titanic_test["PassengerId"],
                          "Survived":test_preds})
submission.to_csv("logistic_regression.csv",index=False)