In [29]:
import numpy as np 
import pandas as pd

In [30]:
df = pd.read_csv('titanic_custom_100rows.csv', usecols=['Survived', 'cabin', 'number', 'Ticket'])
df.head()

Unnamed: 0,Survived,cabin,number,Ticket
0,0,D12,1,d22222
1,1,f33,2,f33333
2,0,22,3,2
3,0,w22,4,w22222
4,0,3,d,333333


In [31]:
df['number'].unique()

array(['1', '2', '3', '4', 'd', 'f', 'g', 'h', '5'], dtype=object)

In [32]:
df['number_numerical'] = pd.to_numeric(df['number'], errors='coerce', downcast='integer')
df['number_categorical'] = np.where(df['number_numerical'].isnull(), df['number'], np.nan)
df['number_numerical'] = df['number_numerical'].fillna(0)
df['number_categorical'] = df['number_categorical'].fillna('Missing')
df

Unnamed: 0,Survived,cabin,number,Ticket,number_numerical,number_categorical
0,0,D12,1,d22222,1.0,Missing
1,1,f33,2,f33333,2.0,Missing
2,0,22,3,2,3.0,Missing
3,0,w22,4,w22222,4.0,Missing
4,0,3,d,333333,0.0,d
5,1,f44,f,f44444,0.0,f
6,0,d33,g,d33333,0.0,g
7,0,d33,h,d33333,0.0,h
8,0,ss33,3,ss3333333333,3.0,Missing
9,1,ff33,f,ff3333333333,0.0,f


In [33]:
df['cabin'].unique()

array(['D12', 'f33', '22', 'w22', '3', 'f44', 'd33', 'ss33', 'ff33',
       's22'], dtype=object)

In [34]:
df['cabin_num'] = df['cabin'].str.extract(r'(\d+)')
df['cabin_cat'] = df['cabin'].str[0]
df

Unnamed: 0,Survived,cabin,number,Ticket,number_numerical,number_categorical,cabin_num,cabin_cat
0,0,D12,1,d22222,1.0,Missing,12,D
1,1,f33,2,f33333,2.0,Missing,33,f
2,0,22,3,2,3.0,Missing,22,2
3,0,w22,4,w22222,4.0,Missing,22,w
4,0,3,d,333333,0.0,d,3,3
5,1,f44,f,f44444,0.0,f,44,f
6,0,d33,g,d33333,0.0,g,33,d
7,0,d33,h,d33333,0.0,h,33,d
8,0,ss33,3,ss3333333333,3.0,Missing,33,s
9,1,ff33,f,ff3333333333,0.0,f,33,f


In [35]:
df['Ticket'].unique()

array(['d22222', 'f33333', '2', 'w22222', '333333', 'f44444', 'd33333',
       'ss3333333333', 'ff3333333333', 's22222'], dtype=object)

In [36]:
# extract the last bit of ticket as number
df['ticket_num'] = df['Ticket'].apply(lambda s: s.split()[-1])
df['ticket_num'] = pd.to_numeric(
    df['ticket_num'],
    errors='coerce',
    downcast='integer'
)

# extract the first part of ticket as category
df['ticket_cat'] = df['Ticket'].apply(lambda s: s.split()[0])
df['ticket_cat'] = np.where(
    df['ticket_cat'].str.isdigit(),
    np.nan,
    df['ticket_cat']
)

df['ticket_num'] = df['ticket_num'].fillna(0)
df['ticket_cat'] = df['ticket_cat'].fillna('Missing') 

df

Unnamed: 0,Survived,cabin,number,Ticket,number_numerical,number_categorical,cabin_num,cabin_cat,ticket_num,ticket_cat
0,0,D12,1,d22222,1.0,Missing,12,D,0.0,d22222
1,1,f33,2,f33333,2.0,Missing,33,f,0.0,f33333
2,0,22,3,2,3.0,Missing,22,2,2.0,Missing
3,0,w22,4,w22222,4.0,Missing,22,w,0.0,w22222
4,0,3,d,333333,0.0,d,3,3,333333.0,Missing
5,1,f44,f,f44444,0.0,f,44,f,0.0,f44444
6,0,d33,g,d33333,0.0,g,33,d,0.0,d33333
7,0,d33,h,d33333,0.0,h,33,d,0.0,d33333
8,0,ss33,3,ss3333333333,3.0,Missing,33,s,0.0,ss3333333333
9,1,ff33,f,ff3333333333,0.0,f,33,f,0.0,ff3333333333
