In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.2f}'.format

In [3]:
url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv"
df = pd.read_csv(url)

In [4]:
print(df.shape)
df.sample(20)

(891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
469,1,3,female,0.75,2,1,19.26,C,Third,child,False,,Cherbourg,yes,False
818,0,3,male,43.0,0,0,6.45,S,Third,man,True,,Southampton,no,True
219,0,2,male,30.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
281,0,3,male,28.0,0,0,7.85,S,Third,man,True,,Southampton,no,True
738,0,3,male,,0,0,7.9,S,Third,man,True,,Southampton,no,True
282,0,3,male,16.0,0,0,9.5,S,Third,man,True,,Southampton,no,True
443,1,2,female,28.0,0,0,13.0,S,Second,woman,False,,Southampton,yes,True
290,1,1,female,26.0,0,0,78.85,S,First,woman,False,,Southampton,yes,True
409,0,3,female,,3,1,25.47,S,Third,woman,False,,Southampton,no,False
149,0,2,male,42.0,0,0,13.0,S,Second,man,True,,Southampton,no,True


In [5]:
df.nunique()

survived         2
pclass           3
sex              2
age             88
sibsp            7
parch            7
fare           248
embarked         3
class            3
who              3
adult_male       2
deck             7
embark_town      3
alive            2
alone            2
dtype: int64

In [6]:
df.isnull().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
df.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [8]:
print(len(df[df['survived'] == 1]))
print(len(df[df['survived'] == 0]))

342
549


### Заполнение пропусков и удаление не нужных фичей

In [9]:
df["age"] = df.groupby(["sex", "pclass"])["age"].transform(
    lambda x: x.fillna(x.median())
)

In [10]:
dfFM = df.copy()

In [11]:
dfFM.drop(columns=["class", 'embark_town', "alive", "deck"], inplace=True)

In [12]:
dfFM['sex'].replace({"male": 1, "female": 0}, inplace=True)

In [13]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
dfFM[['embarked']] = imputer.fit_transform(dfFM[['embarked']])

In [14]:
dfFM.isnull().sum()

survived      0
pclass        0
sex           0
age           0
sibsp         0
parch         0
fare          0
embarked      0
who           0
adult_male    0
alone         0
dtype: int64

### Pipeline

In [15]:
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import OrdinalEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.model_selection import train_test_split
# from category_encoders import TargetEncoder
# from sklearn.ensemble import RandomForestClassifier
#
#
# X, y = dfFM.drop(columns="survived"), dfFM["survived"]
#
# numeric_features = ["age", "fare", "sibsp", "parch", "adult_male", "alone"]
# categorical_features = ["pclass", "embarked", "who"]
# bin_categorical_features = ["sex"]
#
# bin_pipe = Pipeline([
#     ("encoder", OrdinalEncoder()),
# ])
#
# cat_pipe = Pipeline([
#     ("encoder", TargetEncoder()),
# ])
#
# preprocessor = ColumnTransformer([
#     ('num', 'passthrough', numeric_features),  # просто передает как есть
#     ('bin', bin_pipe, bin_categorical_features),
#     ('cat', cat_pipe, categorical_features)
# ])
#
# model = Pipeline([
#     ("preprocessor", preprocessor),
#     ("classifier", RandomForestClassifier())
# ])
#
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# model.fit(X_train, y_train)

In [16]:
dfFM.to_csv("dfFM.csv")

### Without Pipeline

In [43]:
from sklearn.model_selection import train_test_split

X, y = dfFM.drop(columns="survived"), dfFM["survived"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_features = ["age", "fare", "sibsp", "parch", "adult_male", "alone"]
cat_features = ["pclass", "embarked", "who"]
bin_cat_features = ["sex"]

In [44]:
from category_encoders import TargetEncoder

encoder = TargetEncoder(cols=cat_features)
X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

In [22]:
# from sklearn.preprocessing import OrdinalEncoder
#
# encoder = OrdinalEncoder()
# X_train = encoder.fit_transform(X_train[bin_cat_features])
# X_test = encoder.transform(X_test[bin_cat_features])

In [45]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [46]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

y_pred = model.predict(X_train)
print(accuracy_score(y_train, y_pred))
print(precision_score(y_train, y_pred))
print(recall_score(y_train, y_pred))
print(f1_score(y_train, y_pred))
print(roc_auc_score(y_train, y_pred))
print()
y_pred = model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(precision_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

0.9831460674157303
0.9887640449438202
0.967032967032967
0.9777777777777777
0.9800996270244562

0.8212290502793296
0.7846153846153846
0.7391304347826086
0.7611940298507462
0.8059288537549407


In [None]:
# 0.7893258426966292
# 0.7430830039525692
# 0.6886446886446886
# 0.714828897338403
# 0.7702904536617521
#
# 0.776536312849162
# 0.7377049180327869
# 0.6521739130434783
# 0.6923076923076923
# 0.7533596837944663