In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LogisticRegression

# Loading data

In [2]:
# import training and testing data
training_data = pd.read_csv("C:\\Users\\hp\\Desktop\\CN-DSML\\logistic_regression\\0000000000002429_training_titanic_x_y_train.csv")

In [232]:
# import training and testing data
testing_data = pd.read_csv("C:\\Users\\hp\\Desktop\\CN-DSML\\logistic_regression\\0000000000002429_test_titanic_x_test.csv")

# Data Preprocessing and Cleaning

In [233]:
training_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29.0,1,0,228414,26.0,,S,1
1,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S,0
2,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39.0,0,0,250655,26.0,,S,0
3,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29.0,0,4,349909,21.075,,S,0
4,3,"Sutehall, Mr. Henry Jr",male,25.0,0,0,SOTON/OQ 392076,7.05,,S,0


In [234]:
testing_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,2,"Davies, Master. John Morgan Jr",male,8.0,1,1,C.A. 33112,36.75,,S
1,1,"Leader, Dr. Alice (Farnham)",female,49.0,0,0,17465,25.9292,D17,S
2,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
3,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Chr...",female,24.0,2,1,243847,27.0,,S
4,1,"McGough, Mr. James Robert",male,36.0,0,0,PC 17473,26.2875,E25,S


In [235]:
# delete column of name and ticket
training_data.drop("Name",inplace = True,axis = 1)
training_data.drop("Ticket",inplace = True,axis = 1)
training_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,female,29.0,1,0,26.0,,S,1
1,3,male,,0,0,8.05,,S,0
2,2,male,39.0,0,0,26.0,,S,0
3,3,female,29.0,0,4,21.075,,S,0
4,3,male,25.0,0,0,7.05,,S,0


In [236]:
# convert male to 0 and female to 1
def g(s):
    if s == "male":
        return 0
    elif s == "female":
        return 1

In [237]:
training_data["Sex"] = training_data["Sex"].apply(g)
training_data.head(2)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Survived
0,2,1,29.0,1,0,26.0,,S,1
1,3,0,,0,0,8.05,,S,0


In [238]:
training_data.isnull().sum()

Pclass        0
Sex           0
Age         132
SibSp         0
Parch         0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

In [239]:
training_data["Age"].fillna(training_data["Age"].mean(),inplace = True)
training_data.isnull().sum()

Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Fare          0
Cabin       514
Embarked      1
Survived      0
dtype: int64

In [240]:
training_data.loc[training_data['Cabin'].isnull(), 'Cabin']=0
training_data.loc[training_data['Cabin']!=0, 'Cabin']=1

In [241]:
training_data.head()
training_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    1
Survived    0
dtype: int64

In [242]:
training_data["Embarked"].fillna("S",inplace = True)
training_data.isnull().sum()

Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
Survived    0
dtype: int64

In [243]:
# convert Embarked to number where S = 1,Q=2,C = 3
def start(p):
    if p == "S":
        return 1
    elif p == "Q":
        return 2
    else:
        return 3

In [244]:
training_data["Embarked"] = training_data["Embarked"].apply(start)

In [245]:
training_data.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
count,668.0,668.0,668.0,668.0,668.0,668.0,668.0,668.0
mean,2.296407,0.360778,29.70056,0.528443,0.407186,32.064552,1.473054,0.402695
std,0.831638,0.480586,12.753571,1.080327,0.854695,45.320835,0.805263,0.490808
min,1.0,0.0,0.67,0.0,0.0,0.0,1.0,0.0
25%,2.0,0.0,23.0,0.0,0.0,7.925,1.0,0.0
50%,3.0,0.0,29.70056,0.0,0.0,14.75,1.0,0.0
75%,3.0,1.0,35.0,1.0,0.0,31.275,2.0,1.0
max,3.0,1.0,80.0,8.0,6.0,512.3292,3.0,1.0


In [246]:
X_train = training_data.values[:,:-1]
Y_train = training_data.values[:,-1].astype('int')

In [247]:
# working on testing data 

# delete column of name and ticket
testing_data.drop("Name",inplace = True,axis = 1)
testing_data.drop("Ticket",inplace = True,axis = 1)

# convert male to 0 and female to 1
testing_data["Sex"] = testing_data["Sex"].apply(g)

# fill nan of age column with age mean value
testing_data["Age"].fillna(testing_data["Age"].mean(),inplace = True)

# convert cabin column value with 1 and nan with 0
testing_data.loc[testing_data['Cabin'].isnull(), 'Cabin']=0
testing_data.loc[testing_data['Cabin']!=0, 'Cabin']=1

# convert Embarked to number where S = 1,Q=2,C = 3
testing_data["Embarked"].fillna("S",inplace = True)
testing_data["Embarked"] = testing_data["Embarked"].apply(start)

x_test = testing_data.values

# Logistic Regression main function 

In [248]:
clf = LogisticRegression(solver='saga', max_iter=10000000, tol=0.00001)

In [249]:
clf.fit(X_train,Y_train)

LogisticRegression(max_iter=10000000, solver='saga', tol=1e-05)

# Getting Y predictions for test data


In [250]:
clf.score(X_train,Y_train)

0.7904191616766467

In [251]:
y_pred = clf.predict(x_test)

In [252]:
# np array convert in to csv file:-
np.savetxt("Logistic_regression_new.csv", y_pred,delimiter=',')