In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

data = pd.read_csv('titanic.csv')

In [2]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
data['PassengerId'].unique()

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [5]:
data.drop(['PassengerId'],axis =1,inplace = True)

In [6]:
data.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
data['Pclass'].unique()

array([3, 1, 2], dtype=int64)

In [8]:
data.drop(['Name'],axis =1 ,inplace = True)

In [9]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,female,35.0,1,0,113803,53.1,C123,S
4,0,3,male,35.0,0,0,373450,8.05,,S


In [10]:
data.drop(['Ticket'],axis =1,inplace = True)

In [11]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
data['Embarked'].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [14]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [15]:
data.dropna(subset=['Embarked'], inplace=True)

In [16]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      0
dtype: int64

In [17]:
data['Cabin']= data['Cabin'].fillna(data['Cabin'].mode()[0])

In [18]:
data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin         0
Embarked      0
dtype: int64

In [19]:
data['Age']= data['Age'].fillna(data['Age'].mean())

In [20]:
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

In [21]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,B96 B98,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,B96 B98,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,B96 B98,S


In [22]:
label = LabelEncoder()
data['Sex'] = label.fit_transform(data['Sex'])

In [23]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,B96 B98,S
1,1,1,0,38.0,1,0,71.2833,C85,C
2,1,3,0,26.0,0,0,7.925,B96 B98,S
3,1,1,0,35.0,1,0,53.1,C123,S
4,0,3,1,35.0,0,0,8.05,B96 B98,S


In [24]:
data['Embarked'] = label.fit_transform(data['Embarked'])

In [25]:
data.tail()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
886,0,2,1,27.0,0,0,13.0,B96 B98,2
887,1,1,0,19.0,0,0,30.0,B42,2
888,0,3,0,29.642093,1,2,23.45,B96 B98,2
889,1,1,1,26.0,0,0,30.0,C148,0
890,0,3,1,32.0,0,0,7.75,B96 B98,1


In [26]:
data['Cabin'].unique()

array(['B96 B98', 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'C83', 'F33', 'F G73',
       'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101', 'F E69',
       'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4', 'A32',
       'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35', 'C87',
       'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19', 'B49',
       'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54', 'B57 B59 B63 B66',
       'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40', 'T', 'C128',
       'D37', 'B35', 'E50', 'C82', 'E10', 'E44', 'A34', 'C104', 'C111',
       'C92', 'E38', 'D21', 'E12', 'E63', 'A14', 'B37', 'C30', 'D20',
       'B79', 'E25', 'D46', 'B73', 'C95', 'B38', 'B39', 'B22', 'C86',
       'C70', 'A16', 'C101', 'C68', 'A10', 'E68', 'B41', 'A20', 'D19',
       'D50', 'D9', 'A23', 'B50', 'A26', 'D48', 'E58', 'C126', 'B71',
       'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63', 'C62 C64', 'E24',
       'C90

In [27]:
data['Cabin'] = data['Cabin'].astype(str).str[0]

In [28]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,B,2
1,1,1,0,38.0,1,0,71.2833,C,0
2,1,3,0,26.0,0,0,7.925,B,2
3,1,1,0,35.0,1,0,53.1,C,2
4,0,3,1,35.0,0,0,8.05,B,2


In [29]:
data['Cabin'].unique()

array(['B', 'C', 'E', 'G', 'D', 'A', 'F', 'T'], dtype=object)

In [30]:
data['Cabin'].value_counts()['T']

1

In [31]:
data.index[data['Cabin']=='T'].tolist()

[339]

In [32]:
data = data[data['Cabin'] != 'T']

In [33]:
data['Cabin'].unique()

array(['B', 'C', 'E', 'G', 'D', 'A', 'F'], dtype=object)

In [34]:
data['Cabin'].value_counts()

Cabin
B    732
C     59
D     33
E     32
A     15
F     13
G      4
Name: count, dtype: int64

In [35]:
data = data[data['Cabin'] != 'G']

In [36]:
data['Cabin'].value_counts()

Cabin
B    732
C     59
D     33
E     32
A     15
F     13
Name: count, dtype: int64

In [37]:
data['Cabin'] = label.fit_transform(data['Cabin'])

In [38]:
data['Cabin'].value_counts()

Cabin
1    732
2     59
3     33
4     32
0     15
5     13
Name: count, dtype: int64

In [39]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 884 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  884 non-null    int64  
 1   Pclass    884 non-null    int64  
 2   Sex       884 non-null    int32  
 3   Age       884 non-null    float64
 4   SibSp     884 non-null    int64  
 5   Parch     884 non-null    int64  
 6   Fare      884 non-null    float64
 7   Cabin     884 non-null    int32  
 8   Embarked  884 non-null    int32  
dtypes: float64(2), int32(3), int64(4)
memory usage: 58.7 KB


In [40]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,1,22.0,1,0,7.25,1,2
1,1,1,0,38.0,1,0,71.2833,2,0
2,1,3,0,26.0,0,0,7.925,1,2
3,1,1,0,35.0,1,0,53.1,2,2
4,0,3,1,35.0,0,0,8.05,1,2


In [41]:
data.drop(['Parch','Embarked'],axis =1 ,inplace=True)

In [42]:
import numpy as np
data['Fare'] = np.log1p(data['Fare'])

In [43]:
X = data.drop(['Survived'],axis = 1)
Y = data['Survived']

In [44]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size = 0.2,random_state=42)

In [45]:
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Cabin
410,3,1,29.642093,0,2.185579,1
440,2,0,45.000000,1,3.305054,1
365,3,1,30.000000,0,2.110213,1
360,3,1,40.000000,1,3.363842,1
83,1,1,28.000000,0,3.873282,1
...,...,...,...,...,...,...
108,3,1,38.000000,0,2.185579,1
274,3,0,29.642093,0,2.169054,1
867,1,1,31.000000,0,3.941500,0
441,3,1,20.000000,0,2.351375,1


In [46]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
model = LogisticRegression()
model.fit(X_train,Y_train)

In [60]:
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

In [61]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, log_loss, roc_auc_score

In [62]:
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
logloss = log_loss(Y_test, y_prob)
roc_auc = roc_auc_score(Y_test, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"Log Loss: {logloss:.4f}")
print(f"ROC-AUC Score: {roc_auc:.4f}")

Accuracy: 0.8136
Precision: 0.7375
Recall: 0.8310
F1-score: 0.7815
Log Loss: 0.4286
ROC-AUC Score: 0.8771


In [63]:
def predict_survival(model, input_features):
    input_array = np.array(input_features).reshape(1, -1)
    prediction = model.predict(input_array)[0]
    return "Survived" if prediction == 1 else "Not Survived"

In [67]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Fare,Cabin
0,3,1,22.0,1,2.110213,1
1,1,0,38.0,1,4.280593,2
2,3,0,26.0,0,2.188856,1
3,1,0,35.0,1,3.990834,2
4,3,1,35.0,0,2.202765,1


In [66]:
X.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Cabin'], dtype='object')

In [70]:
survivor_features = [1, 0, 25, 1, 100, 1] 
result = predict_survival(model, survivor_features)
result

'Survived'