In [1]:
import numpy as np
import pandas as pd

In [2]:
titanic_train = pd.read_csv("Titanic_x_y_train.csv")
titanic_train.shape

(668, 11)

In [3]:
titanic_train.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [4]:
#cleaning data
#name, ticket, fare, cabin can't detemine survival of person so we delete these columns
del titanic_train["Name"]
del titanic_train["Ticket"]
del titanic_train["Fare"]
del titanic_train["Cabin"]

titanic_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked,Survived
0,2,female,29.0,1,0,S,1
1,3,male,,0,0,S,0
2,2,male,39.0,0,0,S,0
3,3,female,29.0,0,4,S,0
4,3,male,25.0,0,0,S,0


In [5]:
titanic_train.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Survived
count,668.0,536.0,668.0,668.0,668.0
mean,2.296407,29.70056,0.528443,0.407186,0.402695
std,0.831638,14.240257,1.080327,0.854695,0.490808
min,1.0,0.67,0.0,0.0,0.0
25%,2.0,21.0,0.0,0.0,0.0
50%,3.0,29.0,0.0,0.0,0.0
75%,3.0,38.25,1.0,0.0,1.0
max,3.0,80.0,8.0,6.0,1.0


In [6]:
#checking Nan values in the dataset
titanic_train.isnull().sum()

Pclass        0
Sex           0
Age         132
SibSp         0
Parch         0
Embarked      1
Survived      0
dtype: int64

In [7]:
#function to convert gender string to number male=1, female=0
def sexToGender_num(s):
    if(s == "male"):
        return 1
    else:
        return 0    

In [8]:
#converting Sex column from string to numeric value
titanic_train["Gender"] = titanic_train["Sex"].apply(sexToGender_num)
del titanic_train["Sex"]
titanic_train.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Embarked,Survived,Gender
0,2,29.0,1,0,S,1,0
1,3,,0,0,S,0,1
2,2,39.0,0,0,S,0,1
3,3,29.0,0,4,S,0,0
4,3,25.0,0,0,S,0,1
5,3,34.5,0,0,C,0,1
6,1,35.0,1,0,S,1,0
7,3,23.0,0,0,S,0,1
8,2,,0,0,S,1,1
9,3,50.0,0,0,S,0,1


In [9]:
titanic_train["Embarked"].mode()

0    S
dtype: object

In [10]:
#filling NaN in Embarked column by the most repeating value
titanic_train.Embarked.fillna(titanic_train["Embarked"].mode()[0], inplace = True)

In [11]:
titanic_train.isnull().sum()

Pclass        0
Age         132
SibSp         0
Parch         0
Embarked      0
Survived      0
Gender        0
dtype: int64

In [12]:
#converting Embarked to numeric value
def EmbarkedToNumEmbark(s):
    if(s == "S"):
        return 0
    elif(s == "C"):
        return 1
    else:
        return 2

In [13]:
titanic_train["Embark"] = titanic_train["Embarked"].apply(EmbarkedToNumEmbark)
del titanic_train["Embarked"]
titanic_train.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Survived,Gender,Embark
0,2,29.0,1,0,1,0,0
1,3,,0,0,0,1,0
2,2,39.0,0,0,0,1,0
3,3,29.0,0,4,0,0,0
4,3,25.0,0,0,0,1,0
5,3,34.5,0,0,0,1,1
6,1,35.0,1,0,1,0,0
7,3,23.0,0,0,0,1,0
8,2,,0,0,1,1,0
9,3,50.0,0,0,0,1,0


In [14]:
#calculating the mean age of survived and not survived people
meanAge_S = titanic_train[titanic_train["Survived"] == 1]["Age"].mean()
print(meanAge_S)
meanAge_NS = titanic_train[titanic_train["Survived"] == 0]["Age"].mean()
print(meanAge_NS)

28.138157894736842
30.857142857142858


In [15]:
#filling NaN or Null values in Age column
titanic_train["AGE"] = np.where(pd.isnull(titanic_train["Age"]) & titanic_train["Survived"]==1, meanAge_S, titanic_train["Age"])
titanic_train["AGE"].fillna(meanAge_NS, inplace = True)
del titanic_train["Age"]

In [16]:
titanic_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Survived,Gender,Embark,AGE
0,2,1,0,1,0,0,29.0
1,3,0,0,0,1,0,30.857143
2,2,0,0,0,1,0,39.0
3,3,0,4,0,0,0,29.0
4,3,0,0,0,1,0,25.0


In [17]:
titanic_train.describe()

Unnamed: 0,Pclass,SibSp,Parch,Survived,Gender,Embark,AGE
count,668.0,668.0,668.0,668.0,668.0,668.0,668.0
mean,2.296407,0.528443,0.407186,0.402695,0.639222,0.348802,29.762222
std,0.831638,1.080327,0.854695,0.490808,0.480586,0.614333,12.766452
min,1.0,0.0,0.0,0.0,0.0,0.0,0.67
25%,2.0,0.0,0.0,0.0,0.0,0.0,23.0
50%,3.0,0.0,0.0,0.0,1.0,0.0,30.0
75%,3.0,1.0,0.0,1.0,1.0,1.0,35.0
max,3.0,8.0,6.0,1.0,1.0,2.0,80.0


In [18]:
titanic_train.isnull().sum()

Pclass      0
SibSp       0
Parch       0
Survived    0
Gender      0
Embark      0
AGE         0
dtype: int64

In [25]:
titanic_train["Survive"] = titanic_train["Survived"]
del titanic_train["Survived"]
titanic_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Gender,Embark,AGE,Survive
0,2,1,0,0,0,29.0,1
1,3,0,0,1,0,30.857143,0
2,2,0,0,1,0,39.0,0
3,3,0,4,0,0,29.0,0
4,3,0,0,1,0,25.0,0


In [26]:
titanic_trainData = titanic_train.values
x_train = titanic_trainData[:,:-1]
y_train = titanic_trainData[:,-1]
print(x_train.shape)
print(y_train.shape)

(668, 6)
(668,)


In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
clf01 = LogisticRegression()
clf01.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
from sklearn.metrics import confusion_matrix, classification_report

In [31]:
y_train_pred = clf01.predict(x_train)

In [32]:
confusion_matrix(y_train, y_train_pred)

array([[349,  50],
       [ 84, 185]], dtype=int64)

In [35]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.87      0.84       399
         1.0       0.79      0.69      0.73       269

   micro avg       0.80      0.80      0.80       668
   macro avg       0.80      0.78      0.79       668
weighted avg       0.80      0.80      0.80       668



In [55]:
clf02 = LogisticRegression(C=2, max_iter=1000, solver="saga", multi_class="multinomial")
clf02.fit(x_train, y_train)

LogisticRegression(C=2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='multinomial',
          n_jobs=None, penalty='l2', random_state=None, solver='saga',
          tol=0.0001, verbose=0, warm_start=False)

In [56]:
y2_train_pred = clf02.predict(x_train)

In [57]:
confusion_matrix(y_train, y2_train_pred)

array([[343,  56],
       [ 79, 190]], dtype=int64)

In [58]:
print(classification_report(y_train, y2_train_pred))

              precision    recall  f1-score   support

         0.0       0.81      0.86      0.84       399
         1.0       0.77      0.71      0.74       269

   micro avg       0.80      0.80      0.80       668
   macro avg       0.79      0.78      0.79       668
weighted avg       0.80      0.80      0.80       668



In [60]:
titanic_test = pd.read_csv("Test_titanic_x_test.csv")
titanic_test.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [62]:
titanic_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
count,223.0,178.0,223.0,223.0,223.0
mean,2.345291,29.694775,0.506726,0.304933,32.622551
std,0.850047,15.398053,1.1697,0.634108,61.062047
min,1.0,0.42,0.0,0.0,0.0
25%,2.0,19.25,0.0,0.0,7.8792
50%,3.0,27.0,0.0,0.0,12.475
75%,3.0,37.75,1.0,0.0,30.0354
max,3.0,71.0,8.0,2.0,512.3292


In [63]:
del titanic_test["Name"]
del titanic_test["Ticket"]
del titanic_test["Fare"]
del titanic_test["Cabin"]

titanic_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,2,male,8.0,1,1,S
1,1,female,49.0,0,0,S
2,3,male,,0,0,Q
3,2,female,24.0,2,1,S
4,1,male,36.0,0,0,S


In [64]:
titanic_test["Gender"] = titanic_test["Sex"].apply(sexToGender_num)
del titanic_test["Sex"]
titanic_test.head(10)

Unnamed: 0,Pclass,Age,SibSp,Parch,Embarked,Gender
0,2,8.0,1,1,S,1
1,1,49.0,0,0,S,0
2,3,,0,0,Q,1
3,2,24.0,2,1,S,0
4,1,36.0,0,0,S,1
5,1,71.0,0,0,C,1
6,3,16.0,0,0,S,1
7,3,,0,0,Q,1
8,3,18.0,0,0,S,0
9,3,,0,0,S,1


In [65]:
titanic_test.isnull().sum()

Pclass       0
Age         45
SibSp        0
Parch        0
Embarked     1
Gender       0
dtype: int64

In [68]:
titanic_test["Embarked"].fillna(titanic_test["Embarked"].mode()[0], inplace = True)

In [69]:
titanic_test.isnull().sum()

Pclass       0
Age         45
SibSp        0
Parch        0
Embarked     0
Gender       0
dtype: int64

In [70]:
titanic_test["Embark"] = titanic_test["Embarked"].apply(EmbarkedToNumEmbark)
del titanic_test["Embarked"]
titanic_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Gender,Embark
count,223.0,178.0,223.0,223.0,223.0,223.0
mean,2.345291,29.694775,0.506726,0.304933,0.672646,0.399103
std,0.850047,15.398053,1.1697,0.634108,0.470303,0.695804
min,1.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,19.25,0.0,0.0,0.0,0.0
50%,3.0,27.0,0.0,0.0,1.0,0.0
75%,3.0,37.75,1.0,0.0,1.0,1.0
max,3.0,71.0,8.0,2.0,1.0,2.0


In [73]:
meanAge = titanic_test["Age"].mean()
meanAge

29.694775280898877

In [74]:
titanic_test["Age"].fillna(meanAge, inplace = True)
titanic_test.describe()

Unnamed: 0,Pclass,Age,SibSp,Parch,Gender,Embark
count,223.0,223.0,223.0,223.0,223.0,223.0
mean,2.345291,29.694775,0.506726,0.304933,0.672646,0.399103
std,0.850047,13.749153,1.1697,0.634108,0.470303,0.695804
min,1.0,0.42,0.0,0.0,0.0,0.0
25%,2.0,22.0,0.0,0.0,0.0,0.0
50%,3.0,29.694775,0.0,0.0,1.0,0.0
75%,3.0,35.0,1.0,0.0,1.0,1.0
max,3.0,71.0,8.0,2.0,1.0,2.0


In [75]:
titanic_test["AGE"] = titanic_test["Age"]
del titanic_test["Age"]
titanic_test.head()

Unnamed: 0,Pclass,SibSp,Parch,Gender,Embark,AGE
0,2,1,1,1,0,8.0
1,1,0,0,0,0,49.0
2,3,0,0,1,2,29.694775
3,2,2,1,0,0,24.0
4,1,0,0,1,0,36.0


In [76]:
x_test = titanic_test.values
y_test_pred = clf02.predict(x_test)

In [78]:
print(y_test_pred.shape)
y_test_pred

(223,)


array([0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1.,
       1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0.,
       1., 0., 1., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0.,
       0., 1., 0., 0., 1., 0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 1., 1.,
       1., 1., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1.,
       0., 1., 0., 0., 0., 1., 0., 0., 1., 1., 1., 1., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       0., 0.])

In [79]:
np.savetxt("Titanic_y_result.csv", y_test_pred, delimiter=",", fmt="%d")