In [107]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import seaborn as sns



In [108]:
df: pd.DataFrame = sns.load_dataset("titanic")

df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [109]:
def joinstr(iter):

    return " ".join(
        i.__str__() for i in iter
    )


agg: pd.DataFrame = df.loc[:, df.columns.isin(["survived","pclass"])].agg(joinstr, axis=1)

agg.head()

0    0 3
1    1 1
2    1 3
3    1 1
4    0 3
dtype: object

In [110]:
agg.value_counts()

0 3    372
1 1    136
1 3    119
0 2     97
1 2     87
0 1     80
dtype: int64

In [111]:
train_indices, test_indices, _a, _b = train_test_split(
    df.index,
    agg,
    test_size=0.2,
    random_state=42
    #stratify=df.loc[:, df.columns.isin(["x","y","z"])].agg(" ".join, axis=1).values
)

print(f"{type(train_indices)}, {type(test_indices)}")

print(f"{train_indices.shape} {test_indices.shape}")

<class 'pandas.core.indexes.numeric.Int64Index'>, <class 'pandas.core.indexes.numeric.Int64Index'>
(712,) (179,)


In [112]:
df.loc[train_indices, :].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
331,0,1,male,45.5,0,0,28.5,S,First,man,True,C,Southampton,no,True
733,0,2,male,23.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
382,0,3,male,32.0,0,0,7.925,S,Third,man,True,,Southampton,no,True
704,0,3,male,26.0,1,0,7.8542,S,Third,man,True,,Southampton,no,False
813,0,3,female,6.0,4,2,31.275,S,Third,child,False,,Southampton,no,False


In [113]:
agg.loc[train_indices].value_counts()

0 3    302
1 1     99
1 3     96
0 2     78
1 2     73
0 1     64
dtype: int64

In [114]:
train_vals = dict(
    (val, agg.loc[train_indices].value_counts()[val]/train_indices.shape[0])
    for val in agg.unique()
)

for k, v in train_vals.items():
    print(f"{k}: {v}")

0 3: 0.4241573033707865
1 1: 0.13904494382022473
1 3: 0.1348314606741573
0 1: 0.0898876404494382
1 2: 0.10252808988764045
0 2: 0.10955056179775281


In [115]:
df.loc[test_indices, :].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
709,1,3,male,,1,1,15.2458,C,Third,man,True,,Cherbourg,yes,False
439,0,2,male,31.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
840,0,3,male,20.0,0,0,7.925,S,Third,man,True,,Southampton,no,True
720,1,2,female,6.0,0,1,33.0,S,Second,child,False,,Southampton,yes,False
39,1,3,female,14.0,1,0,11.2417,C,Third,child,False,,Cherbourg,yes,False


In [116]:
agg.loc[test_indices].value_counts()

0 3    70
1 1    37
1 3    23
0 2    19
0 1    16
1 2    14
dtype: int64

In [117]:
test_vals = dict(
    (val, agg.loc[test_indices].value_counts()[val]/test_indices.shape[0])
    for val in agg.unique()
)

for k, v in test_vals.items():
    print(f"{k}: {v}")

0 3: 0.39106145251396646
1 1: 0.20670391061452514
1 3: 0.12849162011173185
0 1: 0.0893854748603352
1 2: 0.0782122905027933
0 2: 0.10614525139664804
