In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [4]:
dataset = pd.read_csv('train.csv')

In [6]:
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [7]:
age = dataset['Age']

In [8]:
age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [9]:
#since age consists of NaN or null values, we have to remove it

#remove null or nan from age.
def lw(cols):
    age = cols[0]
    Pclass = cols[1]
    if pd.isnull(age):
        if Pclass == 1:
            return 38
        elif Pclass == 2:
            return 30
        elif Pclass == 3:
            return 25
        else:
            return 30
    else:
        return age


In [10]:
dataset['Age']=dataset[['Age','Pclass']].apply(lw,axis=1) 

In [11]:
age = dataset['Age']

In [18]:
x = dataset[['Age','Sex','Pclass','Parch','SibSp','Embarked']] 

In [13]:
x

Unnamed: 0,Age,Sex,Pclass,Parch,SibSp,Embarked
0,22.0,male,3,0,1,S
1,38.0,female,1,0,1,C
2,26.0,female,3,0,0,S
3,35.0,female,1,0,1,S
4,35.0,male,3,0,0,S
...,...,...,...,...,...,...
886,27.0,male,2,0,0,S
887,19.0,female,1,0,0,S
888,25.0,female,3,2,1,S
889,26.0,male,1,0,0,C


In [14]:
# since we can see that sex, pclass, parch, sibsp, embarked. all these are categorical values. 
# applying dummy variable functionality and removing multi-colinearity .

sex = dataset['Sex']
sex = pd.get_dummies(sex, drop_first=True)


#even pclass is a numerical categorical variable 
pclass = dataset['Pclass']  
pclass = pd.get_dummies(pclass,drop_first = True) 

#sibblings
sibsp = dataset['SibSp']
sibsp = pd.get_dummies(sibsp, drop_first=True)

#parent child
parch = dataset['Parch']
parch = pd.get_dummies(parch, drop_first=True)

embarked = dataset['Embarked']
embarked = pd.get_dummies(embarked,drop_first = True)

In [15]:
X = pd.concat([age,sex,parch,sibsp,pclass,embarked],axis = 1)

In [16]:
X

Unnamed: 0,Age,male,1,2,3,4,5,6,1.1,2.1,3.1,4.1,5.1,8,2.2,3.2,Q,S
0,22.0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1
1,38.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,26.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
3,35.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
4,35.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
887,19.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
888,25.0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,1
889,26.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [19]:
y = dataset['Survived']

In [20]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [21]:
model = LogisticRegression()

In [22]:
model.fit(X,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
#getting the test data ....
datasettest = pd.read_csv('train.csv')

In [27]:
#perform same things with this dataset ... 
datasettest['Age'] =  datasettest[['Age','Pclass']].apply(lw,axis=1) 
age = datasettest['Age']

sex = datasettest['Sex']
sex = pd.get_dummies(sex, drop_first=True)


#even pclass is a numerical categorical variable 
pclass = datasettest['Pclass']  
pclass = pd.get_dummies(pclass,drop_first = True) 

#sibblings
sibsp = datasettest['SibSp']
sibsp = pd.get_dummies(sibsp, drop_first=True)

#parent child
parch = datasettest['Parch']
parch = pd.get_dummies(parch, drop_first=True)

embarked = datasettest['Embarked']
embarked = pd.get_dummies(embarked,drop_first = True)

In [28]:
X_test = pd.concat([age,sex,parch,sibsp,pclass,embarked],axis = 1)

In [29]:
y_pred = model.predict(X_test)

In [30]:
y_pred

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,

In [31]:
y_test = datasettest['Survived']

In [32]:
y_test

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [33]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[486,  63],
       [100, 242]], dtype=int64)

In [35]:
record = 486+242+100+63

In [36]:
accurate = 486+242

In [37]:
error = 100+63

In [None]:
accuracy_rate = accurate / record * 100