In [148]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

## Importing data into pandas dataframe

In [149]:
train_df = pd.read_csv("data/train.csv")
test_df = pd.read_csv("data/test.csv")

In [150]:
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [151]:
test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Splitting the ticket info to two categories and normalizing the name

In [152]:
processed_train_df = pd.DataFrame()
processed_test_df = pd.DataFrame()

def preprocess(df):
    df = df.copy()
    
    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])
    
    def ticket_number(x):
        return x.split(" ")[-1]
        
    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])
    
    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)                     
    return df
    
processed_train_df = preprocess(train_df)
processed_test_df = preprocess(test_df)

In [153]:
processed_train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE


In [154]:
processed_test_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,892,3,Kelly Mr James,male,34.5,0,0,330911,7.8292,,Q,330911,NONE
1,893,3,Wilkes Mrs James Ellen Needs,female,47.0,1,0,363272,7.0,,S,363272,NONE
2,894,2,Myles Mr Thomas Francis,male,62.0,0,0,240276,9.6875,,Q,240276,NONE
3,895,3,Wirz Mr Albert,male,27.0,0,0,315154,8.6625,,S,315154,NONE
4,896,3,Hirvonen Mrs Alexander Helga E Lindqvist,female,22.0,1,1,3101298,12.2875,,S,3101298,NONE


now we have 2 dataframes for training and testing the model

## Preparing the train dataset

In [155]:
train_data = processed_train_df.drop(columns=["PassengerId", "Survived", "Ticket"], inplace=False)
train_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,3,Braund Mr Owen Harris,male,22.0,1,0,7.25,,S,21171,A/5
1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,71.2833,C85,C,17599,PC
2,3,Heikkinen Miss Laina,female,26.0,0,0,7.925,,S,3101282,STON/O2.
3,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,53.1,C123,S,113803,NONE
4,3,Allen Mr William Henry,male,35.0,0,0,8.05,,S,373450,NONE


In [156]:
dummies = pd.get_dummies(train_data["Sex"])
train_data = pd.concat([train_data, dummies], axis=1)
train_data.drop(["Sex"], axis=1, inplace=True)
train_data.head()

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_number,Ticket_item,female,male
0,3,Braund Mr Owen Harris,22.0,1,0,7.25,,S,21171,A/5,False,True
1,1,Cumings Mrs John Bradley Florence Briggs Thayer,38.0,1,0,71.2833,C85,C,17599,PC,True,False
2,3,Heikkinen Miss Laina,26.0,0,0,7.925,,S,3101282,STON/O2.,True,False
3,1,Futrelle Mrs Jacques Heath Lily May Peel,35.0,1,0,53.1,C123,S,113803,NONE,True,False
4,3,Allen Mr William Henry,35.0,0,0,8.05,,S,373450,NONE,False,True


In [157]:
dummies = pd.get_dummies(train_data["Embarked"])
train_data = pd.concat([train_data, dummies], axis=1)
train_data.drop("Embarked", axis=1, inplace=True)
train_data.head()

Unnamed: 0,Pclass,Name,Age,SibSp,Parch,Fare,Cabin,Ticket_number,Ticket_item,female,male,C,Q,S
0,3,Braund Mr Owen Harris,22.0,1,0,7.25,,21171,A/5,False,True,False,False,True
1,1,Cumings Mrs John Bradley Florence Briggs Thayer,38.0,1,0,71.2833,C85,17599,PC,True,False,True,False,False
2,3,Heikkinen Miss Laina,26.0,0,0,7.925,,3101282,STON/O2.,True,False,False,False,True
3,1,Futrelle Mrs Jacques Heath Lily May Peel,35.0,1,0,53.1,C123,113803,NONE,True,False,False,False,True
4,3,Allen Mr William Henry,35.0,0,0,8.05,,373450,NONE,False,True,False,False,True


In [158]:
train_data["male"] = train_data["male"].astype(int)
train_data["female"] = train_data["female"].astype(int)
train_data["C"] = train_data["C"].astype(int)
train_data["Q"] = train_data["Q"].astype(int)
train_data["S"] = train_data["S"].astype(int)


In [159]:
def count_cabins(cabin):
    if pd.isna(cabin):  
        return 0
    else:
        return cabin.count(' ') + 1  
    
train_data['num_cabins'] = train_data['Cabin'].apply(count_cabins)

In [160]:
def has_ticket_item(item):
    if item == "NONE":
        return 0
    else:
        return 1
    
train_data['has_ticket_item'] = train_data['Ticket_item'].apply(has_ticket_item)

In [161]:
def ticket_number_to_int(x):
    if x == "LINE":
        return 0
    else:
        return int(x)
    
train_data["Ticket_number"] = train_data["Ticket_number"].apply(ticket_number_to_int)
train_data["Ticket_number"] = train_data["Ticket_number"].astype(int)

In [162]:
for column in train_data.columns:
    if train_data[column].dtype == 'int64':
        train_data[column] = train_data[column].astype('float64')

### Dropping the name

In [163]:
train_data.drop(["Name"], inplace= True, axis= 1)

In [164]:
train_data = train_data[["Pclass", "female", "male", "Age", "SibSp", "Parch", "Fare", "num_cabins", "C", "Q", "S", "Ticket_number", "has_ticket_item"]]

Replacing the Nan values in age columns

In [165]:
nan_count = train_data.isna().sum()

print("Number of NaN values in each column:")
print(nan_count)

Number of NaN values in each column:
Pclass               0
female               0
male                 0
Age                177
SibSp                0
Parch                0
Fare                 0
num_cabins           0
C                    0
Q                    0
S                    0
Ticket_number        0
has_ticket_item      0
dtype: int64


In [166]:
train_data.fillna(train_data["Age"].mean(), inplace=True)

# Results

In [167]:
train_data.head()

Unnamed: 0,Pclass,female,male,Age,SibSp,Parch,Fare,num_cabins,C,Q,S,Ticket_number,has_ticket_item
0,3.0,0.0,1.0,22.0,1.0,0.0,7.25,0.0,0.0,0.0,1.0,21171.0,1.0
1,1.0,1.0,0.0,38.0,1.0,0.0,71.2833,1.0,1.0,0.0,0.0,17599.0,1.0
2,3.0,1.0,0.0,26.0,0.0,0.0,7.925,0.0,0.0,0.0,1.0,3101282.0,1.0
3,1.0,1.0,0.0,35.0,1.0,0.0,53.1,1.0,0.0,0.0,1.0,113803.0,0.0
4,3.0,0.0,1.0,35.0,0.0,0.0,8.05,0.0,0.0,0.0,1.0,373450.0,0.0


In [168]:
train_data.dtypes

Pclass             float64
female             float64
male               float64
Age                float64
SibSp              float64
Parch              float64
Fare               float64
num_cabins         float64
C                  float64
Q                  float64
S                  float64
Ticket_number      float64
has_ticket_item    float64
dtype: object

In [169]:
train_data_survived = processed_train_df["Survived"]
train_data_survived = train_data_survived.astype("int64")

train_data_survived.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Now, we can create the dataset and model

In [170]:
def survival_model(features, target):
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.4)
    
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)
    
    y_pred = model.predict(X_test_scaled)
    
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    print("Accuracy:", accuracy)
    print("Classification Report:\n", report)
    
    return model

In [171]:

model = survival_model(train_data.values, train_data_survived)

Accuracy: 0.7871148459383753
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.87      0.83       218
           1       0.76      0.66      0.71       139

    accuracy                           0.79       357
   macro avg       0.78      0.76      0.77       357
weighted avg       0.79      0.79      0.78       357



# Testing the accuracy of model with unseen data (test_df)

### Applying same steps for the test data

In [172]:
test_data = processed_test_df.drop(columns=["PassengerId", "Ticket"], inplace= False)
test_data.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Ticket_number,Ticket_item
0,3,Kelly Mr James,male,34.5,0,0,7.8292,,Q,330911,NONE
1,3,Wilkes Mrs James Ellen Needs,female,47.0,1,0,7.0,,S,363272,NONE
2,2,Myles Mr Thomas Francis,male,62.0,0,0,9.6875,,Q,240276,NONE
3,3,Wirz Mr Albert,male,27.0,0,0,8.6625,,S,315154,NONE
4,3,Hirvonen Mrs Alexander Helga E Lindqvist,female,22.0,1,1,12.2875,,S,3101298,NONE


In [173]:
dummies = pd.get_dummies(test_data["Sex"])
test_data = pd.concat([test_data, dummies], axis=1)
test_data.drop(["Sex"], axis=1, inplace=True)

dummies = pd.get_dummies(test_data["Embarked"])
test_data = pd.concat([test_data, dummies], axis=1)
test_data.drop("Embarked", axis=1, inplace=True)

test_data["male"] = test_data["male"].astype(int)
test_data["female"] = test_data["female"].astype(int)
test_data["C"] = test_data["C"].astype(int)
test_data["Q"] = test_data["Q"].astype(int)
test_data["S"] = test_data["S"].astype(int)

def count_cabins(cabin):
    if pd.isna(cabin):  
        return 0
    else:
        return cabin.count(' ') + 1  
    
test_data['num_cabins'] = test_data['Cabin'].apply(count_cabins)

def has_ticket_item(item):
    if item == "NONE":
        return 0
    else:
        return 1
    
test_data['has_ticket_item'] = test_data['Ticket_item'].apply(has_ticket_item)

def ticket_number_to_int(x):
    if x == "LINE":
        return 0
    else:
        return int(x)
    
test_data["Ticket_number"] = test_data["Ticket_number"].apply(ticket_number_to_int)
test_data["Ticket_number"] = test_data["Ticket_number"].astype(int)

for column in test_data.columns:
    if test_data[column].dtype == 'int64':
        test_data[column] = test_data[column].astype('float64')

name_data = test_data["Name"]
test_data.drop(["Name"], inplace= True, axis= 1)

test_data.fillna(train_data["Age"].mean(), inplace=True)

In [174]:
test_data = test_data[["Pclass", "female", "male", "Age", "SibSp", "Parch", "Fare", "num_cabins", "C", "Q", "S", "Ticket_number", "has_ticket_item"]]
test_data.head()

Unnamed: 0,Pclass,female,male,Age,SibSp,Parch,Fare,num_cabins,C,Q,S,Ticket_number,has_ticket_item
0,3.0,0.0,1.0,34.5,0.0,0.0,7.8292,0.0,0.0,1.0,0.0,330911.0,0.0
1,3.0,1.0,0.0,47.0,1.0,0.0,7.0,0.0,0.0,0.0,1.0,363272.0,0.0
2,2.0,0.0,1.0,62.0,0.0,0.0,9.6875,0.0,0.0,1.0,0.0,240276.0,0.0
3,3.0,0.0,1.0,27.0,0.0,0.0,8.6625,0.0,0.0,0.0,1.0,315154.0,0.0
4,3.0,1.0,0.0,22.0,1.0,1.0,12.2875,0.0,0.0,0.0,1.0,3101298.0,0.0


In [175]:
predictions = model.predict(test_data.values)

for name, prediction in zip(name_data, predictions):
    print(f"name: {name[:10]}..., survival prediction {prediction}")


name: Kelly Mr J..., survival prediction 0
name: Wilkes Mrs..., survival prediction 0
name: Myles Mr T..., survival prediction 0
name: Wirz Mr Al..., survival prediction 0
name: Hirvonen M..., survival prediction 0
name: Svensson M..., survival prediction 0
name: Connolly M..., survival prediction 0
name: Caldwell M..., survival prediction 0
name: Abrahim Mr..., survival prediction 0
name: Davies Mr ..., survival prediction 0
name: Ilieff Mr ..., survival prediction 0
name: Jones Mr C..., survival prediction 0
name: Snyder Mrs..., survival prediction 0
name: Howard Mr ..., survival prediction 0
name: Chaffee Mr..., survival prediction 0
name: del Carlo ..., survival prediction 0
name: Keane Mr D..., survival prediction 0
name: Assaf Mr G..., survival prediction 0
name: Ilmakangas..., survival prediction 0
name: Assaf Khal..., survival prediction 0
name: Rothschild..., survival prediction 0
name: Olsen Mast..., survival prediction 0
name: Flegenheim..., survival prediction 0
name: Willi

In [176]:
save_df = pd.DataFrame(predictions)
save_df.to_csv('predictions.csv', index=False)