In [48]:
import numpy as np
import pandas as pd
import os, json, re

from collections import Counter

In [49]:
df = pd.read_csv("data/train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [50]:
df.shape

(891, 12)

In [51]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [52]:
X = df.drop("Survived", axis = 1)
y = df["Survived"]

# data cleaning

In [53]:
X.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## gender to numeric

In [54]:
X["Sex"] = X["Sex"].map({'male': 1, "female": 0})
X.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,S


# name column has no use

In [55]:
X = X.drop("Name", axis = 1)
X.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,1,22.0,1,0,A/5 21171,7.25,,S
1,2,1,0,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,0,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,0,35.0,1,0,113803,53.1,C123,S
4,5,3,1,35.0,0,0,373450,8.05,,S


## embarked column to numeric

In [56]:
X["Embarked"] = X["Embarked"].map({"S": 3, "Q": 1, "C": 2})
X

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,1,22.0,1,0,A/5 21171,7.2500,,3.0
1,2,1,0,38.0,1,0,PC 17599,71.2833,C85,2.0
2,3,3,0,26.0,0,0,STON/O2. 3101282,7.9250,,3.0
3,4,1,0,35.0,1,0,113803,53.1000,C123,3.0
4,5,3,1,35.0,0,0,373450,8.0500,,3.0
...,...,...,...,...,...,...,...,...,...,...
886,887,2,1,27.0,0,0,211536,13.0000,,3.0
887,888,1,0,19.0,0,0,112053,30.0000,B42,3.0
888,889,3,0,,1,2,W./C. 6607,23.4500,,3.0
889,890,1,1,26.0,0,0,111369,30.0000,C148,2.0


# Cabin analysis

In [88]:
def process_cabin(c: str) -> tuple:
    if pd.isna(c):
        return None, None
    
    match = re.match(r"([A-Z])(\d+)$", c) # c29
    if match:
        alphabet_part = match.group(1)
        numeric_part = match.group(2)
        return alphabet_part, int(numeric_part) # c29 = c, 29
    
    else:
        if re.match(r"[A-Z]$", c): # single letter, C
            return c, 0
        
        else:
            if len(c.split(" ")) > 1: # C23 C25 C27 or F G29
                temp_cabin_class = []
                temp_cabin_number = []
                for sp in c.split(" "):
                    sp.strip()
                    cabin_class, cabin_number = process_cabin(sp)
                    temp_cabin_class.append(cabin_class)
                    temp_cabin_number.append(cabin_number)
                a_set = set(temp_cabin_class)
                # print(a_set)
                if len(a_set) == 1:
                    return a_set.pop(), int(np.median(temp_cabin_number))
                else:
                    return "z", temp_cabin_number[-1] # F G29

In [89]:
print(process_cabin("F E34"))
print(process_cabin("C23 C25 C27"))
print(process_cabin("C"))
print(process_cabin("C123"))

('z', 34)
('C', 25)
('C', 0)
('C', 123)


In [77]:
X["Cabin"].isna().sum(), X.shape

(687, (891, 10))

In [60]:
X["Pclass"].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [61]:
cabin_analysis_df = X.dropna(subset=["Cabin"]).copy()
cabin_analysis_df.shape

(204, 10)

In [62]:
cabin_setup = {}
for i in range(1, 4): #pclass
    all_cabin_classes = []
    pclass_df = cabin_analysis_df[cabin_analysis_df["Pclass"] == i]
    # cabins = pclass_df["Cabin"].value_counts().keys()
    pclass_cabins = pclass_df["Cabin"]
    # print(cabins)
    for cabin in pclass_cabins:
        cabin_class, _ = process_cabin(cabin)
        all_cabin_classes.append(cabin_class)
        
    cabin_setup[i] = Counter(all_cabin_classes)

In [63]:
cabin_setup

{1: Counter({'C': 59, 'E': 25, 'A': 15, 'B': 47, 'D': 29, 'T': 1}),
 2: Counter({'D': 4, 'F': 8, 'E': 4}),
 3: Counter({'G': 4, 'z': 4, 'E': 3, 'F': 1})}

In [64]:
all_c_c = []
for val in cabin_analysis_df["Cabin"]:
    c_c, _ = process_cabin(val)
    all_c_c.append(c_c)
cabin_analysis_df["cabin_class"] = all_c_c

In [65]:
cabin_analysis_df.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_class
1,2,1,0,38.0,1,0,PC 17599,71.2833,C85,2.0,C
3,4,1,0,35.0,1,0,113803,53.1,C123,3.0,C
6,7,1,1,54.0,0,0,17463,51.8625,E46,3.0,E
10,11,3,0,4.0,1,1,PP 9549,16.7,G6,3.0,G
11,12,1,0,58.0,0,0,113783,26.55,C103,3.0,C


In [67]:
cabin_analysis_df.groupby(["Pclass", "cabin_class"])["Fare"].mean()

Pclass  cabin_class
1       A               39.623887
        B              113.505764
        C              100.151341
        D               63.324286
        E               55.740168
        T               35.500000
2       D               13.166675
        E               11.587500
        F               23.750000
3       E               11.000000
        F                7.750000
        G               13.581250
        z               11.327075
Name: Fare, dtype: float64

In [68]:
[_ for _ in cabin_analysis_df["Cabin"] if len(_.split(" "))>1]

['C23 C25 C27',
 'F G73',
 'C23 C25 C27',
 'D10 D12',
 'B58 B60',
 'F E69',
 'C22 C26',
 'B58 B60',
 'C22 C26',
 'B57 B59 B63 B66',
 'C23 C25 C27',
 'B96 B98',
 'B96 B98',
 'C23 C25 C27',
 'C22 C26',
 'B51 B53 B55',
 'F G63',
 'C62 C64',
 'F G73',
 'B57 B59 B63 B66',
 'B96 B98',
 'B82 B84',
 'B96 B98',
 'B51 B53 B55']

In [83]:
def fill_missing_cabin_class(df, fare_col='Fare', pclass_col='Pclass', cabin_class_col='cabin_class'):
    """
    Fill missing cabin class values based on fare and pclass similarity to known patterns.
    """
    df_filled = df.copy()
    
    # Calculate mean fare for each pclass-cabin_class combination
    fare_means = df.groupby([pclass_col, cabin_class_col])[fare_col].mean()
    
    # Find rows with missing cabin class
    missing_mask = df[cabin_class_col].isna()
    
    # For each missing row, find the best match
    for idx in df[missing_mask].index:
        passenger_fare = df.loc[idx, fare_col]
        passenger_pclass = df.loc[idx, pclass_col]
        
        # Get all cabin classes for this pclass
        try:
            available_classes = fare_means[passenger_pclass].index
            
            # Calculate absolute difference between passenger fare and mean fare for each cabin class
            fare_differences = {}
            for cabin_class in available_classes:
                mean_fare = fare_means[passenger_pclass][cabin_class]
                fare_differences[cabin_class] = abs(passenger_fare - mean_fare)
            
            # Find cabin class with smallest fare difference
            if fare_differences:
                best_match = min(fare_differences, key=fare_differences.get)
                df_filled.loc[idx, cabin_class_col] = best_match
            
        except KeyError:
            # No data for this pclass, skip or assign default
            continue
    
    return df_filled

In [78]:
X['cabin_class'] = X['Cabin'].apply(process_cabin).apply(lambda x: x[0] if x is not None else None)

In [80]:
X.isna().sum()

PassengerId      0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
cabin_class    687
dtype: int64

In [85]:
X_filled = fill_missing_cabin_class(X)
X_filled

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_class
0,1,3,1,22.0,1,0,A/5 21171,7.2500,,3.0,F
1,2,1,0,38.0,1,0,PC 17599,71.2833,C85,2.0,C
2,3,3,0,26.0,0,0,STON/O2. 3101282,7.9250,,3.0,F
3,4,1,0,35.0,1,0,113803,53.1000,C123,3.0,C
4,5,3,1,35.0,0,0,373450,8.0500,,3.0,F
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,1,27.0,0,0,211536,13.0000,,3.0,D
887,888,1,0,19.0,0,0,112053,30.0000,B42,3.0,B
888,889,3,0,,1,2,W./C. 6607,23.4500,,3.0,G
889,890,1,1,26.0,0,0,111369,30.0000,C148,2.0,C


# cabin number

In [91]:
cabin_analysis_df["cabin_number"] = cabin_analysis_df["Cabin"]\
    .apply(process_cabin)\
        .apply(lambda x: x[1] if x is not None else None)

In [92]:
cabin_analysis_df

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_class,cabin_number
1,2,1,0,38.0,1,0,PC 17599,71.2833,C85,2.0,C,85
3,4,1,0,35.0,1,0,113803,53.1000,C123,3.0,C,123
6,7,1,1,54.0,0,0,17463,51.8625,E46,3.0,E,46
10,11,3,0,4.0,1,1,PP 9549,16.7000,G6,3.0,G,6
11,12,1,0,58.0,0,0,113783,26.5500,C103,3.0,C,103
...,...,...,...,...,...,...,...,...,...,...,...,...
871,872,1,0,47.0,1,1,11751,52.5542,D35,3.0,D,35
872,873,1,1,33.0,0,0,695,5.0000,B51 B53 B55,3.0,B,53
879,880,1,0,56.0,0,1,11767,83.1583,C50,2.0,C,50
887,888,1,0,19.0,0,0,112053,30.0000,B42,3.0,B,42


In [118]:
# sharing in pclass 1
target = cabin_analysis_df[cabin_analysis_df["Pclass"] == 1]
target_dict_count = target["cabin_number"].value_counts()
target_list = list(target_dict_count.to_dict().values())
len([x for x in target_list if x > 1])

50

In [119]:
len(target)

176

In [None]:
# with siblings or parents in pclass 1
cabin_analysis_df[(cabin_analysis_df["Pclass"] == 1) & ((cabin_analysis_df["SibSp"] > 0) | (cabin_analysis_df["Parch"] > 0))].shape

(98, 12)

In [124]:
X_filled["cabin_number"] = X_filled["Cabin"].apply(process_cabin).apply(lambda x: x[1] if x is not None else None)

In [129]:
X_filled.dtypes

PassengerId       int64
Pclass            int64
Sex               int64
Age             float64
SibSp             int64
Parch             int64
Ticket           object
Fare            float64
Cabin            object
Embarked        float64
cabin_class      object
cabin_number    float64
dtype: object

In [128]:
X_filled

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,cabin_class,cabin_number
0,1,3,1,22.0,1,0,A/5 21171,7.2500,,3.0,F,
1,2,1,0,38.0,1,0,PC 17599,71.2833,C85,2.0,C,85.0
2,3,3,0,26.0,0,0,STON/O2. 3101282,7.9250,,3.0,F,
3,4,1,0,35.0,1,0,113803,53.1000,C123,3.0,C,123.0
4,5,3,1,35.0,0,0,373450,8.0500,,3.0,F,
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,1,27.0,0,0,211536,13.0000,,3.0,D,
887,888,1,0,19.0,0,0,112053,30.0000,B42,3.0,B,42.0
888,889,3,0,,1,2,W./C. 6607,23.4500,,3.0,G,
889,890,1,1,26.0,0,0,111369,30.0000,C148,2.0,C,148.0


In [None]:
import random

def assign_missing_cabins_by_pclass_range(df, pclass_col='Pclass', cabin_col='cabin_number'):
    """
    Assign missing cabin numbers randomly within each Pclass's existing range.
    Only fills NaN values, leaves existing values unchanged.
    """
    df_filled = df.copy()
    
    # Get cabin number ranges for each Pclass (only non-NaN values)
    pclass_ranges = {}
    
    for pclass in df[pclass_col].unique():
        # Get existing cabin numbers for this Pclass (excluding NaN)
        pclass_cabins = df[(df[pclass_col] == pclass) & (df[cabin_col].notna())][cabin_col]
        
        if len(pclass_cabins) > 0:
            # Since cabin_number is float64, directly get min/max
            pclass_ranges[pclass] = (int(pclass_cabins.min()), int(pclass_cabins.max()))
    
    print("Cabin ranges by Pclass:")
    for pclass, (min_cabin, max_cabin) in pclass_ranges.items():
        print(f"Pclass {pclass}: {min_cabin} - {max_cabin}")
    
    # Assign missing values only
    for pclass in pclass_ranges.keys():
        # Get rows with missing cabin numbers for this Pclass
        missing_mask = (df_filled[pclass_col] == pclass) & (df_filled[cabin_col].isna())
        missing_indices = df_filled[missing_mask].index
        
        if len(missing_indices) > 0:
            min_cabin, max_cabin = pclass_ranges[pclass]
            
            # Generate random cabin numbers within range
            random_cabins = [float(random.randint(min_cabin, max_cabin)) for _ in range(len(missing_indices))]
            
            # Assign the random cabin numbers (keep as float to match dtype)
            df_filled.loc[missing_indices, cabin_col] = random_cabins
            
            print(f"Assigned {len(missing_indices)} missing cabins for Pclass {pclass}")
    
    return df_filled

df_with_cabins = assign_missing_cabins_by_pclass_range(X_filled)

print("\nBefore:")
print(f"Missing cabin_number: {X_filled['cabin_number'].isna().sum()}")
print("\nAfter:")
print(f"Missing cabin_number: {df_with_cabins['cabin_number'].isna().sum()}")

Cabin ranges by Pclass:
Pclass 3: 6 - 121
Pclass 1: 0 - 148
Pclass 2: 0 - 101
Assigned 479 missing cabins for Pclass 3
Assigned 40 missing cabins for Pclass 1
Assigned 168 missing cabins for Pclass 2

Before:
Missing cabin_number: 687

After:
Missing cabin_number: 0


In [133]:
df_with_cabins.drop("Cabin", axis=1, inplace=True)

In [134]:
df_with_cabins

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_class,cabin_number
0,1,3,1,22.0,1,0,A/5 21171,7.2500,3.0,F,6.0
1,2,1,0,38.0,1,0,PC 17599,71.2833,2.0,C,85.0
2,3,3,0,26.0,0,0,STON/O2. 3101282,7.9250,3.0,F,44.0
3,4,1,0,35.0,1,0,113803,53.1000,3.0,C,123.0
4,5,3,1,35.0,0,0,373450,8.0500,3.0,F,15.0
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,1,27.0,0,0,211536,13.0000,3.0,D,36.0
887,888,1,0,19.0,0,0,112053,30.0000,3.0,B,42.0
888,889,3,0,,1,2,W./C. 6607,23.4500,3.0,G,75.0
889,890,1,1,26.0,0,0,111369,30.0000,2.0,C,148.0


In [135]:
df_with_cabins['cabin_class'] = pd.Categorical(df_with_cabins['cabin_class']).codes

In [137]:
df_with_cabins.drop("PassengerId", axis=1, inplace=True)

In [138]:
df_with_cabins

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_class,cabin_number
0,3,1,22.0,1,0,A/5 21171,7.2500,3.0,5,6.0
1,1,0,38.0,1,0,PC 17599,71.2833,2.0,2,85.0
2,3,0,26.0,0,0,STON/O2. 3101282,7.9250,3.0,5,44.0
3,1,0,35.0,1,0,113803,53.1000,3.0,2,123.0
4,3,1,35.0,0,0,373450,8.0500,3.0,5,15.0
...,...,...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,211536,13.0000,3.0,3,36.0
887,1,0,19.0,0,0,112053,30.0000,3.0,1,42.0
888,3,0,,1,2,W./C. 6607,23.4500,3.0,6,75.0
889,1,1,26.0,0,0,111369,30.0000,2.0,2,148.0


## family member

In [None]:
df_with_cabins["family_members"] = df_with_cabins["SibSp"] + df_with_cabins["Parch"]
# Some children travelled only with a nanny, therefore parch=0 for them.

## ticket

In [140]:
[ticket_detail for ticket_detail in df_with_cabins["Ticket"] if len(ticket_detail.split(" ")) > 1]

['A/5 21171',
 'PC 17599',
 'STON/O2. 3101282',
 'PP 9549',
 'A/5. 2151',
 'PC 17601',
 'PC 17569',
 'C.A. 24579',
 'PC 17604',
 'A./5. 2152',
 'SC/Paris 2123',
 'S.C./A.4. 23567',
 'A/4. 39886',
 'PC 17572',
 'C.A. 31026',
 'C.A. 34651',
 'CA 2144',
 'PC 17605',
 'C.A. 29395',
 'S.P. 3464',
 'C.A. 33111',
 'CA 2144',
 'S.O.C. 14879',
 'SO/C 14885',
 'W./C. 6608',
 'SOTON/OQ 392086',
 'W.E.P. 5734',
 'C.A. 2315',
 'PC 17754',
 'PC 17759',
 'STON/O 2. 3101294',
 'PC 17558',
 'S.O.C. 14879',
 'A4. 54510',
 'C 17369',
 'SOTON/O.Q. 3101307',
 'A/5. 3337',
 'C.A. 29178',
 'SC/PARIS 2133',
 'PC 17593',
 'STON/O2. 3101279',
 'C.A. 33112',
 'W./C. 6608',
 'S.O.P. 1166',
 'A.5. 11206',
 'A/5. 851',
 'Fa 265302',
 'PC 17597',
 'SOTON/OQ 392090',
 'CA. 2343',
 'C.A. 33595',
 'PC 17318',
 'STON/O 2. 3101280',
 'PC 17595',
 'CA. 2343',
 'SC/PARIS 2131',
 'PC 17610',
 'PC 17569',
 'CA. 2343',
 'A/5 3540',
 'SOTON/O.Q. 3101311',
 'F.C.C. 13528',
 'A/5 21174',
 'STON/O2. 3101283',
 'W/C 14208',
 'SOTO

In [142]:
df_with_cabins["Ticket"] = df_with_cabins["Ticket"].apply(lambda x: x.split(" ")[0] if len(x.split(" "))>1 else "general")

In [143]:
df_with_cabins["Ticket"] = pd.Categorical(df_with_cabins["Ticket"]).codes

In [146]:
df_with_cabins.dtypes

Pclass              int64
Sex                 int64
Age               float64
SibSp               int64
Parch               int64
Ticket               int8
Fare              float64
Embarked          float64
cabin_class          int8
cabin_number      float64
family_members      int64
dtype: object

In [147]:
df_with_cabins.isna().sum()

Pclass              0
Sex                 0
Age               177
SibSp               0
Parch               0
Ticket              0
Fare                0
Embarked            2
cabin_class         0
cabin_number        0
family_members      0
dtype: int64

In [151]:
temp = df.copy()
len(temp["Ticket"].unique()), temp.shape

(681, (891, 12))

In [159]:
common_tickets = []
for i, ticket_tuple in enumerate(temp["Ticket"].value_counts().items()):
    if ticket_tuple[1] > 1:
        print(f"{i}. Ticket: {ticket_tuple[0]}, Count: {ticket_tuple[1]}")
        common_tickets.append(ticket_tuple[0])

0. Ticket: 347082, Count: 7
1. Ticket: CA. 2343, Count: 7
2. Ticket: 1601, Count: 7
3. Ticket: 3101295, Count: 6
4. Ticket: CA 2144, Count: 6
5. Ticket: 347088, Count: 6
6. Ticket: S.O.C. 14879, Count: 5
7. Ticket: 382652, Count: 5
8. Ticket: LINE, Count: 4
9. Ticket: PC 17757, Count: 4
10. Ticket: 17421, Count: 4
11. Ticket: 349909, Count: 4
12. Ticket: 113760, Count: 4
13. Ticket: 4133, Count: 4
14. Ticket: 113781, Count: 4
15. Ticket: W./C. 6608, Count: 4
16. Ticket: 2666, Count: 4
17. Ticket: 19950, Count: 4
18. Ticket: 347077, Count: 4
19. Ticket: C.A. 31921, Count: 3
20. Ticket: PC 17572, Count: 3
21. Ticket: C.A. 34651, Count: 3
22. Ticket: 363291, Count: 3
23. Ticket: F.C.C. 13529, Count: 3
24. Ticket: 345773, Count: 3
25. Ticket: 248727, Count: 3
26. Ticket: 24160, Count: 3
27. Ticket: 29106, Count: 3
28. Ticket: SC/Paris 2123, Count: 3
29. Ticket: 35273, Count: 3
30. Ticket: 371110, Count: 3
31. Ticket: 230080, Count: 3
32. Ticket: PC 17760, Count: 3
33. Ticket: 239853, Count

In [166]:
for ticket in common_tickets:
    # print(f"Ticket: {ticket}")
    common_price_list = temp[temp["Ticket"] == ticket]["Fare"]
    print(f"Ticket: {ticket}, Common Prices: {common_price_list.tolist()}")
    if len(set(common_price_list)) > 1:
        print(f"Different fares for ticket {ticket}: {set(common_price_list)}")

Ticket: 347082, Common Prices: [31.275, 31.275, 31.275, 31.275, 31.275, 31.275, 31.275]
Ticket: CA. 2343, Common Prices: [69.55, 69.55, 69.55, 69.55, 69.55, 69.55, 69.55]
Ticket: 1601, Common Prices: [56.4958, 56.4958, 56.4958, 56.4958, 56.4958, 56.4958, 56.4958]
Ticket: 3101295, Common Prices: [39.6875, 39.6875, 39.6875, 39.6875, 39.6875, 39.6875]
Ticket: CA 2144, Common Prices: [46.9, 46.9, 46.9, 46.9, 46.9, 46.9]
Ticket: 347088, Common Prices: [27.9, 27.9, 27.9, 27.9, 27.9, 27.9]
Ticket: S.O.C. 14879, Common Prices: [73.5, 73.5, 73.5, 73.5, 73.5]
Ticket: 382652, Common Prices: [29.125, 29.125, 29.125, 29.125, 29.125]
Ticket: LINE, Common Prices: [0.0, 0.0, 0.0, 0.0]
Ticket: PC 17757, Common Prices: [227.525, 227.525, 227.525, 227.525]
Ticket: 17421, Common Prices: [110.8833, 110.8833, 110.8833, 110.8833]
Ticket: 349909, Common Prices: [21.075, 21.075, 21.075, 21.075]
Ticket: 113760, Common Prices: [120.0, 120.0, 120.0, 120.0]
Ticket: 4133, Common Prices: [25.4667, 25.4667, 25.4667, 

In [169]:
df_with_cabins["old_Ticket_column"] = temp["Ticket"]

In [171]:
df_with_cabins[df_with_cabins["old_Ticket_column"] == "347082"]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_class,cabin_number,family_members,old_Ticket_column
13,3,1,39.0,1,5,42,31.275,3.0,6,44.0,6,347082
119,3,0,2.0,4,2,42,31.275,3.0,6,61.0,6,347082
541,3,0,9.0,4,2,42,31.275,3.0,6,72.0,6,347082
542,3,0,11.0,4,2,42,31.275,3.0,6,55.0,6,347082
610,3,0,39.0,1,5,42,31.275,3.0,6,53.0,6,347082
813,3,0,6.0,4,2,42,31.275,3.0,6,65.0,6,347082
850,3,1,4.0,4,2,42,31.275,3.0,6,64.0,6,347082


In [172]:
df_with_cabins.isna().sum()

Pclass                 0
Sex                    0
Age                  177
SibSp                  0
Parch                  0
Ticket                 0
Fare                   0
Embarked               2
cabin_class            0
cabin_number           0
family_members         0
old_Ticket_column      0
dtype: int64

# Age

In [176]:
age_analysis = temp[temp["Age"].isna() & temp["SibSp"] + temp["Parch"] > 0]
age_analysis["Ticket"].value_counts()

Ticket
376564        2
371110        2
W./C. 6607    2
370365        2
367230        2
PC 17569      1
19996         1
2689          1
65304         1
17464         1
65303         1
386525        1
PC 17604      1
370371        1
PC 17611      1
2665          1
367229        1
113505        1
17453         1
Name: count, dtype: int64

In [177]:
age_analysis

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
31,32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
46,47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q
109,110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q
166,167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55.0,E33,S
186,187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q
214,215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
240,241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C
241,242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q
334,335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinshei...",female,,1,0,PC 17611,133.65,,S
347,348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S


In [193]:
df[(df["Pclass"] == 3) & (df["Fare"] >= 15.0) & (df["Fare"] <= 20.0) & (df["Name"].str.contains("Mrs")) & (df["Parch"] == 0)]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
18,19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,,S
85,86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...",female,33.0,3,0,3101278,15.85,,S
142,143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda ...",female,24.0,1,0,STON/O2. 3101279,15.85,,S
186,187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q
347,348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S
431,432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S
559,560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.0,1,0,345572,17.4,,S
617,618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26.0,1,0,A/5. 3336,16.1,,S


In [198]:
df.groupby("Pclass")["Age"].mean()

Pclass
1    38.233441
2    29.877630
3    25.140620
Name: Age, dtype: float64

In [238]:
# Fill missing age values with mean age by Pclass
df_with_cabins['Age'] = df_with_cabins.groupby('Pclass')['Age'].transform(lambda x: x.fillna(x.mean()))

# embarked

In [228]:
df_with_cabins[df_with_cabins["Embarked"].isna()]["Embarked"]

61    NaN
829   NaN
Name: Embarked, dtype: float64

In [233]:
df_with_cabins.loc[df_with_cabins["Embarked"].isna(), "Embarked"] = 3  # S

In [239]:
df_with_cabins.isna().sum()

Pclass               0
Sex                  0
Age                  0
SibSp                0
Parch                0
Ticket               0
Fare                 0
Embarked             0
cabin_class          0
cabin_number         0
family_members       0
old_Ticket_column    0
dtype: int64

In [227]:
df_with_cabins[df_with_cabins["Embarked"].isna()]

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,cabin_class,cabin_number,family_members,old_Ticket_column
61,1,0,38.0,0,0,42,80.0,,1,28.0,0,113572
829,1,0,62.0,0,0,42,80.0,,1,28.0,0,113572


In [202]:
df[df["PassengerId"]<70]["Embarked"].value_counts()

Embarked
S    42
C    18
Q     8
Name: count, dtype: int64

In [210]:
[e for t, e in zip(list(df[df["PassengerId"]<70]["Ticket"]), list(df[df["PassengerId"]<70]["Embarked"])) if len(t.split(" ")) == 1]

['S',
 'S',
 'Q',
 'S',
 'S',
 'S',
 'C',
 'S',
 'S',
 'S',
 'S',
 'Q',
 'S',
 'S',
 'C',
 'S',
 'S',
 'Q',
 'S',
 'S',
 'S',
 'C',
 'S',
 'Q',
 'S',
 'Q',
 'S',
 'C',
 'S',
 'C',
 'S',
 'S',
 'C',
 'Q',
 'Q',
 'Q',
 'C',
 'S',
 'S',
 'S',
 'C',
 'S',
 'C',
 'C',
 nan,
 'S',
 'S',
 'C',
 'S']

In [218]:
[(e,f, p) for t, e, f, p in zip(list(df[(df["PassengerId"]<80) & (df["PassengerId"]>50)]["Ticket"]), \
    list(df[(df["PassengerId"]<80) & (df["PassengerId"]>50)]["Embarked"]),\
        list(df[(df["PassengerId"]<80) & (df["PassengerId"]>50)]["Fare"]),\
            list(df[(df["PassengerId"]<80) & (df["PassengerId"]>50)]["Pclass"])) if len(t.split(" ")) == 1]

[('S', 39.6875, 3),
 ('S', 26.0, 2),
 ('C', 61.9792, 1),
 ('S', 35.5, 1),
 ('C', 7.2292, 3),
 ('C', 7.2292, 3),
 (nan, 80.0, 1),
 ('S', 83.475, 1),
 ('S', 27.9, 3),
 ('C', 15.2458, 3),
 ('S', 7.925, 3),
 ('S', 8.6625, 3),
 ('C', 14.4542, 3),
 ('S', 56.4958, 3),
 ('S', 7.65, 3),
 ('S', 7.8958, 3),
 ('S', 8.05, 3),
 ('S', 29.0, 2)]

In [222]:
df_with_cabins.groupby(["Pclass", "cabin_class", "Embarked"])["Fare"].mean()

Pclass  cabin_class  Embarked
1       0            2.0          38.513025
                     3.0          41.535410
        1            2.0         163.892748
                     3.0          94.343674
        2            1.0          90.000000
                     2.0          97.541658
                     3.0         101.630442
        3            2.0          83.232708
                     3.0          51.253068
        4            2.0          81.919057
                     3.0          46.953409
        7            2.0          28.568033
                     3.0          23.709615
2       3            2.0          14.080550
                     3.0          13.248558
        4            1.0          12.350000
                     2.0          12.000000
                     3.0           8.890441
        5            2.0          33.460840
                     3.0          30.032051
3       4            3.0           9.995988
        5            1.0           7.715357
  

# training

In [241]:
df_with_cabins.drop("old_Ticket_column", axis=1, inplace=True)

In [242]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

categorical_cols = df_with_cabins.select_dtypes(include=['object']).columns
for col in categorical_cols:
    if col != 'target': 
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])


# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_with_cabins)

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Train model
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train, y_train)

# # Cross-validation
# cv_scores = cross_val_score(log_reg, X_train, y_train, cv=5)
# print(f"Cross-validation scores: {cv_scores}")
# print(f"Mean CV score: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

In [245]:
# test
# Test Data Preprocessing Pipeline
import numpy as np
import pandas as pd
import re
import random
from collections import Counter
from sklearn.preprocessing import StandardScaler

# Load test data
df_test = pd.read_csv("data/test.csv")
print(f"Test data shape: {df_test.shape}")

# Keep PassengerId for final submission
passenger_ids = df_test["PassengerId"].copy()

# Remove PassengerId from features (same as training)
X_test = df_test.drop("PassengerId", axis=1)

print(f"Missing values in test data:")
print(X_test.isna().sum())

# 1. Gender to numeric
X_test["Sex"] = X_test["Sex"].map({'male': 1, "female": 0})

# 2. Drop Name column
X_test = X_test.drop("Name", axis=1)

# 3. Embarked column to numeric (handle missing values first)
X_test["Embarked"] = X_test["Embarked"].fillna("S")  # Fill missing with 'S' (most common)
X_test["Embarked"] = X_test["Embarked"].map({"S": 3, "Q": 1, "C": 2})

# 4. Cabin processing (using your existing function)
def process_cabin(c: str) -> tuple:
    if pd.isna(c):
        return None, None
    
    match = re.match(r"([A-Z])(\d+)$", c) # c29
    if match:
        alphabet_part = match.group(1)
        numeric_part = match.group(2)
        return alphabet_part, int(numeric_part) # c29 = c, 29
    
    else:
        if re.match(r"[A-Z]$", c): # single letter, C
            return c, 0
        
        else:
            if len(c.split(" ")) > 1: # C23 C25 C27 or F G29
                temp_cabin_class = []
                temp_cabin_number = []
                for sp in c.split(" "):
                    sp.strip()
                    cabin_class, cabin_number = process_cabin(sp)
                    temp_cabin_class.append(cabin_class)
                    temp_cabin_number.append(cabin_number)
                a_set = set(temp_cabin_class)
                if len(a_set) == 1:
                    return a_set.pop(), int(np.median(temp_cabin_number))
                else:
                    return "z", temp_cabin_number[-1] # F G29

# Extract cabin class and cabin number
X_test['cabin_class'] = X_test['Cabin'].apply(process_cabin).apply(lambda x: x[0] if x is not None else None)
X_test["cabin_number"] = X_test["Cabin"].apply(process_cabin).apply(lambda x: x[1] if x is not None else None)

# 5. Fill missing cabin class using your existing function
def fill_missing_cabin_class(df, fare_col='Fare', pclass_col='Pclass', cabin_class_col='cabin_class'):
    """
    Fill missing cabin class values based on fare and pclass similarity to known patterns.
    """
    df_filled = df.copy()
    
    # Calculate mean fare for each pclass-cabin_class combination
    fare_means = df.groupby([pclass_col, cabin_class_col])[fare_col].mean()
    
    # Find rows with missing cabin class
    missing_mask = df[cabin_class_col].isna()
    
    # For each missing row, find the best match
    for idx in df[missing_mask].index:
        passenger_fare = df.loc[idx, fare_col]
        passenger_pclass = df.loc[idx, pclass_col]
        
        # Get all cabin classes for this pclass
        try:
            available_classes = fare_means[passenger_pclass].index
            
            # Calculate absolute difference between passenger fare and mean fare for each cabin class
            fare_differences = {}
            for cabin_class in available_classes:
                mean_fare = fare_means[passenger_pclass][cabin_class]
                fare_differences[cabin_class] = abs(passenger_fare - mean_fare)
            
            # Find cabin class with smallest fare difference
            if fare_differences:
                best_match = min(fare_differences, key=fare_differences.get)
                df_filled.loc[idx, cabin_class_col] = best_match
            
        except KeyError:
            # No data for this pclass, skip or assign default
            continue
    
    return df_filled

X_test_filled = fill_missing_cabin_class(X_test)

# 6. Fill missing cabin numbers using your existing function
def assign_missing_cabins_by_pclass_range(df, pclass_col='Pclass', cabin_col='cabin_number'):
    """
    Assign missing cabin numbers randomly within each Pclass's existing range.
    Only fills NaN values, leaves existing values unchanged.
    """
    df_filled = df.copy()
    
    # Get cabin number ranges for each Pclass (only non-NaN values)
    pclass_ranges = {}
    
    for pclass in df[pclass_col].unique():
        # Get existing cabin numbers for this Pclass (excluding NaN)
        pclass_cabins = df[(df[pclass_col] == pclass) & (df[cabin_col].notna())][cabin_col]
        
        if len(pclass_cabins) > 0:
            # Since cabin_number is float64, directly get min/max
            pclass_ranges[pclass] = (int(pclass_cabins.min()), int(pclass_cabins.max()))
    
    # If no ranges available (all missing), use training data ranges
    if not pclass_ranges:
        # Default ranges based on training data patterns
        pclass_ranges = {1: (2, 148), 2: (9, 101), 3: (4, 95)}
        print("Using default cabin ranges from training data")
    
    # Assign missing values only
    for pclass in pclass_ranges.keys():
        # Get rows with missing cabin numbers for this Pclass
        missing_mask = (df_filled[pclass_col] == pclass) & (df_filled[cabin_col].isna())
        missing_indices = df_filled[missing_mask].index
        
        if len(missing_indices) > 0:
            min_cabin, max_cabin = pclass_ranges[pclass]
            
            # Generate random cabin numbers within range
            random_cabins = [float(random.randint(min_cabin, max_cabin)) for _ in range(len(missing_indices))]
            
            # Assign the random cabin numbers (keep as float to match dtype)
            df_filled.loc[missing_indices, cabin_col] = random_cabins
    
    return df_filled

X_test_with_cabins = assign_missing_cabins_by_pclass_range(X_test_filled)

# Drop original Cabin column
X_test_with_cabins.drop("Cabin", axis=1, inplace=True)

# 7. Convert cabin_class to categorical codes
X_test_with_cabins['cabin_class'] = pd.Categorical(X_test_with_cabins['cabin_class']).codes

# 8. Create family_members feature
X_test_with_cabins["family_members"] = X_test_with_cabins["SibSp"] + X_test_with_cabins["Parch"]

# 9. Process Ticket column
X_test_with_cabins["Ticket"] = X_test_with_cabins["Ticket"].apply(
    lambda x: x.split(" ")[0] if len(x.split(" ")) > 1 else "general"
)
X_test_with_cabins["Ticket"] = pd.Categorical(X_test_with_cabins["Ticket"]).codes

# 10. Fill missing Age values with mean age by Pclass
X_test_with_cabins['Age'] = X_test_with_cabins.groupby('Pclass')['Age'].transform(
    lambda x: x.fillna(x.mean())
)

# 11. Fill missing Fare values (test data might have missing fares)
if X_test_with_cabins['Fare'].isna().any():
    X_test_with_cabins['Fare'] = X_test_with_cabins.groupby('Pclass')['Fare'].transform(
        lambda x: x.fillna(x.mean())
    )

print(f"\nMissing values after preprocessing:")
print(X_test_with_cabins.isna().sum())

print(f"\nFinal test data shape: {X_test_with_cabins.shape}")
print(f"Test data columns: {list(X_test_with_cabins.columns)}")

# Make sure test data has same columns as training data
print(f"Training data columns: {list(df_with_cabins.columns)}")

# Reorder columns to match training data
X_test_final = X_test_with_cabins[df_with_cabins.columns]

print(f"\nFinal test data shape after column alignment: {X_test_final.shape}")

# Scale the test data using the same scaler fitted on training data
# Note: You should use the same scaler that was fitted on training data
scaler = StandardScaler()
# First fit on training data (you should use your existing fitted scaler)
scaler.fit(df_with_cabins)
X_test_scaled = scaler.transform(X_test_final)

print(f"Test data successfully preprocessed and scaled!")
print(f"Ready for prediction with shape: {X_test_scaled.shape}")

# Now you can make predictions
# predictions = log_reg.predict(X_test_scaled)
# predictions_proba = log_reg.predict_proba(X_test_scaled)[:, 1]

# Create submission dataframe
# submission = pd.DataFrame({
#     'PassengerId': passenger_ids,
#     'Survived': predictions
# })
# submission.to_csv('submission.csv', index=False)

Test data shape: (418, 11)
Missing values in test data:
Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

Missing values after preprocessing:
Pclass            0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              0
Embarked          0
cabin_class       0
cabin_number      0
family_members    0
dtype: int64

Final test data shape: (418, 11)
Test data columns: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked', 'cabin_class', 'cabin_number', 'family_members']
Training data columns: ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Embarked', 'cabin_class', 'cabin_number', 'family_members']

Final test data shape after column alignment: (418, 11)
Test data successfully preprocessed and scaled!
Ready for prediction with shape: (418, 11)


In [246]:
log_reg.predict(X_test_scaled)

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [247]:
pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': log_reg.predict(X_test_scaled)
}).to_csv('submission.csv', index=False)