In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler # 1-hot encoding (converting catagorical data into binary form)
from sklearn.model_selection import train_test_split #to split the dataset into training adn testing
from sklearn.linear_model import LogisticRegression #used for training logistic regression
from sklearn.metrics import accuracy_score,precision_score, recall_score #accuracy, precision and recall metrics
from sklearn.model_selection import cross_val_score, StratifiedKFold #validation
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.utils.class_weight import compute_class_weight # for computing class weights for imbalanced classes

In [4]:
# loading the dataset
df = pd.read_csv('train.csv')
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
    """
        OBJECTIVE : our aim is to create a model that will predict the survival rate basedd on age, sex, pclass, etc. 
    """

df['Survived'].isna() #you can see that the values are in binary form i.e., 0-> not survived and 1-> survived. for this type of prediction we can use logistic regression.

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Survived, Length: 891, dtype: bool

In [7]:
# EXPLORATORY DATA ANALYSIS

df.isnull().sum()# in the dataset, you can see that columns 'Age', 'Cabin' and 'Embarked' are missing some data values.


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [8]:
# for 'Age' column, we take average age of all the other passengers with the same 'Pclass'
def impute_age(col):
    Age = col[0]
    Pclass = col[1]
    if pd.isnull(Age):
        return df[df['Pclass'] == Pclass]['Age'].mean()
    else:
        return Age
df['Age'] = df[['Age', 'Pclass']].apply(impute_age, axis = 1)
df.isnull().sum()

  Age = col[0]
  Pclass = col[1]


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [9]:
# for 'Cabin' column, we can see that more than 80% of the data is missing, so its better to drop the column.
df.drop(columns= ['Cabin'], inplace = True)
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [10]:
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)
df.isnull().sum()

# now there are no missing values.

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace = True)


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [11]:
# now you can see that for columns like 'sex', the values are either 'male' or 'female'. but the logestic regression model doesn't understand words, so we convert it to a numberic value.

for i in ['Sex', 'Pclass', 'Embarked']:
    dummies  = pd.get_dummies(df[i], prefix= i)
    df = pd.concat([df, dummies], axis =1)

for i in ['Sex_male', 'Sex_female', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S'] : 
    df[i] = df[i].map({True : 1, False : 0})


In [12]:
df.columns


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Embarked', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
       'Embarked_S'],
      dtype='object')

In [13]:
# dropping unnecessary columns
df.drop(columns = ['PassengerId', 'Pclass', 'Name','Sex', 'Ticket', 'Embarked', 'Fare'], inplace = True)
df.head(3).T

Unnamed: 0,0,1,2
Survived,0.0,1.0,1.0
Age,22.0,38.0,26.0
SibSp,1.0,1.0,0.0
Parch,0.0,0.0,0.0
Sex_female,0.0,1.0,1.0
Sex_male,1.0,0.0,0.0
Pclass_1,0.0,1.0,0.0
Pclass_2,0.0,0.0,0.0
Pclass_3,1.0,0.0,1.0
Embarked_C,0.0,1.0,0.0


In [14]:
# SPLITTING THE DATASET INTO TRAINING AND TESTING
X = df[['Age', 'SibSp', 'Parch', 'Sex_female', 'Sex_male',
       'Pclass_1', 'Pclass_2', 'Pclass_3']]

Y = df['Survived']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state= 45)


In [15]:
# TRAINING THE LOGISTIC REGRESSION MODEL
logistic_regression = LogisticRegression(max_iter=500)
logistic_regression.fit(x_train, y_train)


In [16]:
predicted_value = logistic_regression.predict(x_test)

In [17]:
predicted_value

array([0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 0], dtype=int64)

In [18]:
# testing accuracy, precision and recall

accuracy = accuracy_score(y_test, predicted_value)
print('accuracy = ',accuracy)
# accuracy = correct predictions/total predictions
# NOTE: generally 'accuracy' is not useful for imbalanced datasets.


# precision -> measure of correctness achieved in true predictions
# precision = predictions actually positive/total predicted positive
precision = precision_score(y_test, predicted_value)
print('precision = ',precision)


# recall = predictions actually positive/total actual positive
recall = recall_score(y_test, predicted_value)
print('recall = ',recall)



accuracy =  0.8603351955307262
precision =  0.7936507936507936
recall =  0.8064516129032258


## NOTE : you can also use l1, l2 and elasticnet regularization in logistic regression

In [19]:
# accuracy =  0.8435754189944135
# precision =  0.765625
# recall =  0.7903225806451613


# accuracy =  0.8603351955307262
# precision =  0.7936507936507936
# recall =  0.8064516129032258

## cost-sensitive / weighted logistic regression : 
    optimizing logistic regression for imbalanced classes so that majority class does not got more importance than the minority class.

    here, the model is penalized less for errors made on the majority class and penalized more for errors made on the minority class. 

In [20]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,0,1,0,0,1,0,0,1
1,1,38.0,1,0,1,0,1,0,0,1,0,0
2,1,26.0,0,0,1,0,0,0,1,0,0,1
3,1,35.0,1,0,1,0,1,0,0,0,0,1
4,0,35.0,0,0,0,1,0,0,1,0,0,1


In [21]:
df.Survived.value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [22]:
x = df.drop('Survived', axis = 1)
y = df.Survived

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, stratify = y, shuffle = True, random_state = 1)

In [25]:
# scaling the dataset

scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

In [41]:
# model with no weight optimization 
model = LogisticRegression()
model.fit(xtrain, ytrain)

cv = StratifiedKFold(n_splits=8, shuffle=True)
print("cross val score on test set = ", 
      cross_val_score(model, xtest, ytest, cv = cv, scoring = 'accuracy').mean())

print("cross val score on train set = ", 
      cross_val_score(model, xtrain, ytrain, cv = cv, scoring = 'accuracy').mean())


cross val score on test set =  0.7996541501976284
cross val score on train set =  0.8047752808988764


The scikit-learn library provides an implementation of the best practice heuristic for the class weighting.

It is implemented via the compute_class_weight() function and is calculated as:

n_samples / (n_classes * n_samples_with_class)


In [42]:
# balancing the class weights

weights = compute_class_weight(class_weight='balanced', classes = [0, 1], y = y)
print(weights)

model = LogisticRegression(max_iter = 300, class_weight={0 : weights[0], 1 : weights[1]})
model.fit(xtrain, ytrain)

cv = StratifiedKFold(n_splits=8, shuffle=True)
print("cross val score on test set = ", 
      cross_val_score(model, xtest, ytest, cv = cv, scoring = 'accuracy').mean())

print("cross val score on train set = ", 
      cross_val_score(model, xtrain, ytrain, cv = cv, scoring = 'accuracy').mean())


[0.81147541 1.30263158]


cross val score on test set =  0.7705039525691699
cross val score on train set =  0.7907303370786518


hyperparameter tuning class weights

In [45]:
model = LogisticRegression()

params = {
    'class_weight' : [{0 : 100, 1 : 1}, {0 : 10, 1 : 1}, {0 : 1, 1 : 1}, {0 : 1, 1 : 10}, {0 : 1, 1 : 100}]
}

cv = StratifiedKFold(n_splits=8, shuffle=True)
gs = GridSearchCV(model, param_grid=params, cv = cv, scoring='accuracy')
result = gs.fit(xtrain, ytrain)

In [46]:
result.best_score_

0.8061797752808988

In [47]:
result.best_params_

{'class_weight': {0: 1, 1: 1}}

In [50]:
for i, j in zip(result.cv_results_['params'], result.cv_results_['mean_test_score']) : 
    print(i,"\t mean score = ", j)

{'class_weight': {0: 100, 1: 1}} 	 mean score =  0.6151685393258427
{'class_weight': {0: 10, 1: 1}} 	 mean score =  0.726123595505618
{'class_weight': {0: 1, 1: 1}} 	 mean score =  0.8061797752808988
{'class_weight': {0: 1, 1: 10}} 	 mean score =  0.5168539325842697
{'class_weight': {0: 1, 1: 100}} 	 mean score =  0.3946629213483146
