In [11]:
import pandas as pd

from sklearn.experimental import enable_iterative_imputer
from sklearn.utils import resample
from sklearn import ensemble, impute, model_selection, preprocessing, tree, linear_model, feature_selection
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

In [2]:
def tweak_titanic(df):
    return df.drop(['name', 'ticket', 'home.dest', 'boat', 'body', 'cabin'], axis=1) \
             .pipe(pd.get_dummies, drop_first=True)

def get_train_test_X_y(df, y_col, test_size=0.3, std_cols=None):
    y = df[y_col]
    x = df.drop(y_col, axis=1)

    num_cols = [
        "pclass",
        "age",
        "sibsp",
        "parch",
        "fare"]

    X_train, X_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=test_size, random_state=42)
    fi = impute.IterativeImputer()

    X_train.loc[:, num_cols]=fi.fit_transform(X_train[num_cols])
    X_test.loc[:, num_cols] = fi.transform(X_test[num_cols])

    if std_cols:
        std = preprocessing.StandardScaler()
        X_train.loc[:, std_cols] = std.fit_transform(X_train[std_cols])
        X_test.loc[:, std_cols] = std.transform(X_test[std_cols])

    return X_train, X_test, y_train, y_test

In [3]:
df = pd.read_csv('titanic3.csv')
ti_df = tweak_titanic(df)
std_cols = "pclass,age,sibsp,fare".split(",")
X_train, X_test, y_train, y_test = get_train_test_X_y(
    ti_df, "survived", std_cols=std_cols
)

X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set 

In [4]:
y.value_counts()

0    809
1    500
Name: survived, dtype: int64

In [5]:
# Upsample
mask = df['survived']==1
surv_df = df[mask]
death_df = df[~mask]

df_upsample = resample(
    surv_df,
    replace=True,
    n_samples=len(death_df),
    random_state=42)

df2 = pd.concat([death_df, df_upsample])
df2['survived'].value_counts()

0    809
1    809
Name: survived, dtype: int64

In [9]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_resample(X, y)
pd.Series(y_ros).value_counts()

0    809
1    809
Name: survived, dtype: int64

In [12]:
sm = SMOTE(random_state=42)
X_sm, y_sm = sm.fit_resample(X, y)
pd.Series(y_sm).value_counts()

0    809
1    809
Name: survived, dtype: int64

In [13]:
# This method is similar to SMOTE but it generates different number of samples depending on an estimate of the local distribution of the class to be oversampled.

ady = ADASYN(random_state=42)
X_ady, y_ady = ady.fit_resample(X, y)
pd.Series(y_ady).value_counts()

0    809
1    777
Name: survived, dtype: int64