In [1]:
import numpy as np
import random
import math
import pandas as pd
from warnings import catch_warnings, warn
from scipy import stats
import statistics

In [2]:
#loading tes and train files as pandas dataframe
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

In [3]:
train_dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [4]:
test_dataset.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [5]:
#loading a dataset into pandas dataframe 
titanic_dataset = pd.concat([train_dataset, test_dataset], ignore_index=True)

In [6]:
#count of titanic dataset
titanic_dataset.count()

PassengerId    1309
Survived        891
Pclass         1309
Name           1309
Sex            1309
Age            1046
SibSp          1309
Parch          1309
Ticket         1309
Fare           1308
Cabin           295
Embarked       1307
dtype: int64

In [7]:
#checking columns which has null values in titanic_dataset
titanic_dataset.isna().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [8]:
#filling null values for embarked with mode values 
titanic_dataset['Embarked'] = titanic_dataset['Embarked'].fillna(titanic_dataset['Embarked'].mode())

In [9]:
#normalizing the Fare column for titanic dataset
titanic_dataset['Fare']=titanic_dataset['Fare'].fillna(titanic_dataset['Fare'].mean())
max_fare=titanic_dataset['Fare'][:].max()
titanic_dataset['Fare']=titanic_dataset['Fare'][:]/max_fare

In [10]:
#validating Fare column normalization in titanic dataset
titanic_dataset[titanic_dataset['Fare']>1].shape[0]

0

In [11]:
#converting the categorical fields to numerical value
titanic_dataset['Sex'] = titanic_dataset['Sex'].astype('category').cat.codes
titanic_dataset['Embarked'] = titanic_dataset['Embarked'].astype('category').cat.codes

In [12]:
#validating column datatypes
titanic_dataset.dtypes

PassengerId      int64
Survived       float64
Pclass           int64
Name            object
Sex               int8
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked          int8
dtype: object

In [13]:
#selecting required columns from titanic dataset
# titanic_dataset_selected = titanic_dataset[['PassengerId','Survived','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
titanic_dataset_selected = titanic_dataset[['PassengerId','Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

In [14]:
#splitting the dataset into test and train 
titanic_dataset_train = titanic_dataset_selected[~titanic_dataset_selected['Age'].isna()]
titanic_dataset_test = titanic_dataset_selected[titanic_dataset_selected['Age'].isna()]

In [15]:
#printing sample rows for train
titanic_dataset_train[:10]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,0.014151,2
1,2,1,0,38.0,1,0,0.139136,0
2,3,3,0,26.0,0,0,0.015469,2
3,4,1,0,35.0,1,0,0.103644,2
4,5,3,1,35.0,0,0,0.015713,2
6,7,1,1,54.0,0,0,0.101229,2
7,8,3,1,2.0,3,1,0.041136,2
8,9,3,0,27.0,0,2,0.021731,2
9,10,2,0,14.0,1,0,0.058694,0
10,11,3,0,4.0,1,1,0.032596,2


In [16]:
#validating age column in train
titanic_dataset_train[titanic_dataset_train['Age'].isna()].shape[0]

0

In [17]:
#printing sample rows for test
titanic_dataset_test[:10]

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,6,3,1,,0,0,0.01651,1
17,18,2,1,,0,0,0.025374,2
19,20,3,0,,0,0,0.014102,0
26,27,3,1,,0,0,0.014102,0
28,29,3,0,,0,0,0.015379,1
29,30,3,1,,0,0,0.015412,2
31,32,1,0,,1,0,0.28599,0
32,33,3,0,,0,0,0.015127,1
36,37,3,1,,0,0,0.01411,0
42,43,3,1,,0,0,0.015412,0


In [18]:
#validating age column in test
titanic_dataset_test[~titanic_dataset_test['Age'].isna()].shape[0]

0

In [19]:
#converting to numpy 2-d array for training and testing
# titanic_dataset_train_np = titanic_dataset_train[['Survived','Pclass','Sex','SibSp','Parch','Fare','Embarked']].to_numpy()
titanic_dataset_train_np = titanic_dataset_train[['Pclass','Sex','SibSp','Parch','Fare','Embarked']].to_numpy()
titanic_dataset_train_labels = titanic_dataset_train['Age'].to_numpy() 
# titanic_dataset_test_np = titanic_dataset_test[['Survived','Pclass','Sex','SibSp','Parch','Fare','Embarked']].to_numpy()
titanic_dataset_test_np = titanic_dataset_test[['Pclass','Sex','SibSp','Parch','Fare','Embarked']].to_numpy()

In [20]:
def compute_linear(X,W,b):
    return np.dot(X,W)+b

In [21]:
def compute_cost(X,Y,W,b):
    row_count,column_count=X.shape
    J_wb=0
    for row in range(row_count):
        J_wb+=(compute_linear(X[row],W,b)-Y[row])**2
    return J_wb/(2*row_count)

In [22]:
def compute_differential_parameters(X,Y,W,b):
    row_count,column_count=X.shape
    dj_dw=np.zeros(X.shape[1])
    dj_db=0
    for row in range(row_count):
        f_wb=compute_linear(X[row],W,b)
        err=f_wb-Y[row]
        dj_db+=err
        for column in range(column_count):
            with catch_warnings(record=True) as w:
                dj_dw[column]+=(err*X[row,column])
            if w:
              print(w)
              print("f_wb",f_wb,"err :",err,"X[row,column] :",X[row,column],"err*X[row,column] :",err*X[row,column],"dj_dw[column] :",dj_dw[column],"row :",row,"column :",column)
    dj_db/=row_count
    dj_dw/=row_count
    return dj_db,dj_dw



In [23]:
def compute_gradient_descent(X,Y,W,b,a):
    J_wb=compute_cost(X,Y,W,b)
    for i in range(10000):
        dj_db,dj_dw=compute_differential_parameters(X,Y,W,b)
        W=W-(a*dj_dw)
        b=b-(a*dj_db)
    J_wb=compute_cost(X,Y,W,b)
    print(J_wb)
    return b,W

In [24]:
#training logistic regression for titanic dataset
#initializing weights and bias
w=np.zeros(titanic_dataset_train_np.shape[1])
b=0.0
#setting learning rate
a=0.1
trained_b,trained_w=compute_gradient_descent(titanic_dataset_train_np,titanic_dataset_train_labels,w,b,a)

79.8940743401509


In [25]:
#predicting age
predicted_age=compute_linear(titanic_dataset_test_np,trained_w,trained_b)

In [26]:
#assigning age column to test dataframe
titanic_dataset_test['Age'] = predicted_age

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_dataset_test['Age'] = predicted_age


In [27]:
#mean age in the predicted age
mean_age = predicted_age.mean()

In [28]:
#creating a list without negative results
list_without_neg=[]
for element in range(len(predicted_age)):
    if predicted_age[element]<0:
        print(predicted_age[element],element)
    else:
        list_without_neg.append(predicted_age[element])

-2.095111280743474 35
-2.095111280743474 161
-2.095111280743474 173
-2.095111280743474 214


In [29]:
len(list_without_neg)

259

In [30]:
mean_age = statistics.mean(list_without_neg)

In [31]:
#converting negative values to 1
titanic_dataset_test.loc[titanic_dataset_test['Age'] < 0, 'Age'] = mean_age

In [32]:
#validating age column in test dataframe
titanic_dataset_test[titanic_dataset_test['Age']<0].shape[0]

0

In [33]:
#printing sample rows of test and train
titanic_dataset_test

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
5,6,3,1,26.866071,0,0,0.016510,1
17,18,2,1,34.502011,0,0,0.025374,2
19,20,3,0,23.598653,0,0,0.014102,0
26,27,3,1,26.328571,0,0,0.014102,0
28,29,3,0,24.134648,0,0,0.015379,1
...,...,...,...,...,...,...,...,...
1299,1300,3,0,24.134236,0,0,0.015070,1
1301,1302,3,0,24.134312,0,0,0.015127,1
1304,1305,3,1,27.399303,0,0,0.015713,2
1307,1308,3,1,27.399303,0,0,0.015713,2


In [34]:
#printing sample rows of test and train
titanic_dataset_train

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,1,22.0,1,0,0.014151,2
1,2,1,0,38.0,1,0,0.139136,0
2,3,3,0,26.0,0,0,0.015469,2
3,4,1,0,35.0,1,0,0.103644,2
4,5,3,1,35.0,0,0,0.015713,2
...,...,...,...,...,...,...,...,...
1300,1301,3,0,3.0,1,1,0.026887,2
1302,1303,1,0,37.0,1,0,0.175668,1
1303,1304,3,0,28.0,0,0,0.015176,2
1305,1306,1,0,39.0,0,0,0.212559,0


In [35]:
#unioning train and test datasets
titanic_dataset_concated=pd.concat([titanic_dataset_train, titanic_dataset_test])
titanic_dataset_merge = pd.merge(left = titanic_dataset,right = titanic_dataset_concated,left_on = 'PassengerId', right_on = 'PassengerId',how = 'inner')

In [39]:
#count of titanic dataset
titanic_dataset_merge.columns

Index(['PassengerId', 'Survived', 'Pclass_x', 'Name', 'Sex_x', 'Age_x',
       'SibSp_x', 'Parch_x', 'Ticket', 'Fare_x', 'Cabin', 'Embarked_x',
       'Pclass_y', 'Sex_y', 'Age_y', 'SibSp_y', 'Parch_y', 'Fare_y',
       'Embarked_y'],
      dtype='object')

In [41]:
#Selecting required columns from titanic_dataset_merge
titanic_dataset_merge_selected = titanic_dataset_merge[['PassengerId','Survived','Pclass_y', 'Sex_y', 'Age_y', 'SibSp_y', 'Parch_y', 'Fare_y',
       'Embarked_y']]
titanic_dataset_merge_selected = titanic_dataset_merge_selected.rename(columns={'Pclass_y': 'Pclass', 'Sex_y': 'Sex', 'Age_y': 'Age','SibSp_y': 'SibSp','Parch_y': 'Parch','Fare_y': 'Fare','Embarked_y': 'Embarked'})

In [55]:
#ROUNDING THE AGE 
titanic_dataset_merge_selected['Age']=titanic_dataset_merge_selected['Age'].round()

In [57]:
#printing sample rows 
titanic_dataset_merge_selected

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0.0,3,1,22.0,1,0,0.014151,2
1,2,1.0,1,0,38.0,1,0,0.139136,0
2,3,1.0,3,0,26.0,0,0,0.015469,2
3,4,1.0,1,0,35.0,1,0,0.103644,2
4,5,0.0,3,1,35.0,0,0,0.015713,2
...,...,...,...,...,...,...,...,...,...
1304,1305,,3,1,27.0,0,0,0.015713,2
1305,1306,,1,0,39.0,0,0,0.212559,0
1306,1307,,3,1,38.0,0,0,0.014151,2
1307,1308,,3,1,27.0,0,0,0.015713,2


In [59]:
#writing to disk in csv format
titanic_dataset_merge_selected[:891].to_csv('train_2.csv',header = True , index = False)
titanic_dataset_merge_selected[891:].to_csv('test_2.csv',header = True , index = False)