In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from warnings import filterwarnings
filterwarnings(action='ignore')

In [22]:
pd.set_option('display.max_columns',10,'display.width',1000)
#Training data
train = pd.read_csv(r"C:\Users\Ehsaan\Downloads\titanic\train.csv")
train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)

#Testing data
test = pd.read_csv(r"C:\Users\Ehsaan\Downloads\titanic\test.csv")

train.head()

train['Sex'] = train['Sex'].map({'male': 0, 'female': 1}).astype(int)
test['Sex'] = test['Sex'].map({'male': 0, 'female': 1}).astype(int)

In [23]:
train.shape

(891, 8)

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  889 non-null    object 
dtypes: float64(2), int32(1), int64(4), object(1)
memory usage: 52.3+ KB


In [25]:
train.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [26]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [27]:
#Description of dataset
train.describe(include="all")

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0,889
unique,,,,,,,,3
top,,,,,,,,S
freq,,,,,,,,644
mean,0.383838,2.308642,0.352413,29.699118,0.523008,0.381594,32.204208,
std,0.486592,0.836071,0.47799,14.526497,1.102743,0.806057,49.693429,
min,0.0,1.0,0.0,0.42,0.0,0.0,0.0,
25%,0.0,2.0,0.0,20.125,0.0,0.0,7.9104,
50%,0.0,3.0,0.0,28.0,0.0,0.0,14.4542,
75%,1.0,3.0,1.0,38.0,1.0,0.0,31.0,


In [28]:
train['Age'].fillna(train['Age'].median(), inplace=True)

In [29]:
print(train[['Sex', 'Pclass']].isnull().sum())

test['Age'].fillna(test['Age'].median(), inplace=True)
print(test[['Sex', 'Pclass']].isnull().sum())

Sex       0
Pclass    0
dtype: int64
Sex       0
Pclass    0
dtype: int64


In [30]:
features = ['Age','Sex','Pclass']

X = train[features]
y = train['Survived']

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [32]:
model = LogisticRegression(max_iter = 1000)

In [33]:
model.fit(X_train, y_train)

In [34]:
# Select the correct feature columns
features = ['Age', 'Sex', 'Pclass']

# Create a SimpleImputer to fill missing values in ALL columns
imputer = SimpleImputer(strategy='mean')

# Prepare the test data—ensure categorical columns like 'Sex' are encoded as numbers if not already done
X_test = test[features]

# Impute ALL columns with missing data
X_test_imputed = imputer.fit_transform(X_test)

# Now X_test_imputed is a NumPy array; if you need a DataFrame, convert it
X_test_imputed_df = pd.DataFrame(X_test_imputed, columns=features)

# Use X_test_imputed_df for prediction
final_preds = model.predict(X_test_imputed_df)


In [35]:
val_preds = model.predict(X_train)

In [36]:
print('Validation Accuracy:', accuracy_score(y_train, val_preds))

Validation Accuracy: 0.7879213483146067


In [37]:
final_preds = model.predict(test[['Age', 'Sex', 'Pclass']])

In [38]:
print(final_preds[:10])

[0 0 0 0 1 0 1 0 1 0]


In [39]:
output = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Sex' : test['Sex'],
    'Age' : test['Age'],
    'Survived': final_preds
})

In [40]:
print(output[:50])

    PassengerId  Sex   Age  Survived
0           892    0  34.5         0
1           893    1  47.0         0
2           894    0  62.0         0
3           895    0  27.0         0
4           896    1  22.0         1
5           897    0  14.0         0
6           898    1  30.0         1
7           899    0  26.0         0
8           900    1  18.0         1
9           901    0  21.0         0
10          902    0  27.0         0
11          903    0  46.0         0
12          904    1  23.0         1
13          905    0  63.0         0
14          906    1  47.0         1
15          907    1  24.0         1
16          908    0  35.0         0
17          909    0  21.0         0
18          910    1  27.0         1
19          911    1  45.0         0
20          912    0  55.0         0
21          913    0   9.0         0
22          914    1  27.0         1
23          915    0  21.0         1
24          916    1  48.0         1
25          917    0  50.0         0
2