In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
from datetime import date
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import MinMaxScaler, LabelEncoder,StandardScaler,RobustScaler,OneHotEncoder

def load_titanic():
    data = pd.read_csv("titanic.csv")
    return data 

df = load_titanic()
df = df.drop("PassengerId",axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
df.columns = [col.lower() for col in df.columns]

In [3]:
def grab_col_names(df,cat_th=10,car_th=20):
    num_cols = [col for col in df.columns if np.issubdtype(df[col].dtype, np.number)]
    cat_cols = [col for col in df.columns if col not in num_cols]
    num_but_cat =  [col for col in num_cols if df[col].nunique()<cat_th]
    cat_but_car = [col for col in cat_cols if df[col].nunique()>car_th]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"cat_cols = {len(cat_cols)}")
    print(f"num_cols ={len(num_cols)}")
    print(f"cat_but_car = {len(cat_but_car)}")
    print(f"num_but_cat={len(num_but_cat)}")
        
    return cat_cols, num_cols,cat_but_car

cat_cols, num_cols,cat_but_car=grab_col_names(df)

cat_cols = 6
num_cols =2
cat_but_car = 3
num_but_cat=4


In [4]:
df.isnull().sum()

survived      0
pclass        0
name          0
sex           0
age         177
sibsp         0
parch         0
ticket        0
fare          0
cabin       687
embarked      2
dtype: int64

In [5]:
df.groupby("embarked")["survived"].mean()

embarked
C    0.553571
Q    0.389610
S    0.336957
Name: survived, dtype: float64

In [6]:
def missing_vs_target(df,target):
    na_cols = [col for col in df.columns if df[col].isnull().sum()>0]
    temp_df=df.copy()
    for col in na_cols:
        temp_df[col+"_NA_Flag"] = np.where(temp_df[col].isnull(),1,0)
    na_flags=[col for col in temp_df.columns if "_NA_" in col]
    for col in na_flags:
        print(pd.DataFrame({"Target_Mean":temp_df.groupby(col)[target].mean(),
                            "Count":temp_df.groupby(col)[target].count()}))
        print("-----------------------------------------------")

missing_vs_target(df,"survived")

             Target_Mean  Count
age_NA_Flag                    
0               0.406162    714
1               0.293785    177
-----------------------------------------------
               Target_Mean  Count
cabin_NA_Flag                    
0                 0.666667    204
1                 0.299854    687
-----------------------------------------------
                  Target_Mean  Count
embarked_NA_Flag                    
0                    0.382452    889
1                    1.000000      2
-----------------------------------------------


In [7]:
df["embarked"].fillna("C",inplace=True)

In [8]:
df["new_cabin"] = df["cabin"].notnull().astype(int)

In [9]:
encoders = {}  # Sütun isimlerini ve onlara karşılık gelen encoderları saklayacak bir sözlük oluştur

# Verideki her kategorik sütunu dönüştür
for col in ["sex","embarked"]:
    encoder = LabelEncoder()
    
    # Eğer sütun NaN değerler içeriyorsa
    if df[col].isna().any():
        # NaN değerlerini geçici değerle doldur
        df[col].fillna('NaN', inplace=True)
        
        # LabelEncoder uygula
        df[col] = encoder.fit_transform(df[col])
        
        # Geçici 'NaN' değerlerini np.nan ile tekrar doldur
        df[col].replace({encoder.transform(['NaN'])[0]: np.nan}, inplace=True)
    else:
        # NaN değerler yoksa doğrudan LabelEncoder uygula
        df[col] = encoder.fit_transform(df[col])

    # Bu encoder'ı sözlükte sakla
    encoders[col] = encoder

In [10]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,new_cabin
0,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2,0
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0,1
2,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2,0
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2,1
4,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2,0


In [11]:
cat_cols, num_cols,cat_but_car=grab_col_names(df)

cat_cols = 7
num_cols =2
cat_but_car = 3
num_but_cat=7


In [12]:
df[cat_cols+num_cols]

Unnamed: 0,survived,pclass,sex,sibsp,parch,embarked,new_cabin,age,fare
0,0,3,1,1,0,2,0,22.0,7.2500
1,1,1,0,1,0,0,1,38.0,71.2833
2,1,3,0,0,0,2,0,26.0,7.9250
3,1,1,0,1,0,2,1,35.0,53.1000
4,0,3,1,0,0,2,0,35.0,8.0500
...,...,...,...,...,...,...,...,...,...
886,0,2,1,0,0,2,0,27.0,13.0000
887,1,1,0,0,0,2,1,19.0,30.0000
888,0,3,0,1,2,2,0,,23.4500
889,1,1,1,0,0,0,1,26.0,30.0000


In [13]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from xgboost import XGBRegressor
imp= IterativeImputer(
    estimator=XGBRegressor(n_estimators=200),
    max_iter=30,
    random_state=0
).set_output(transform="pandas")

df[cat_cols+num_cols]=imp.fit_transform(df[cat_cols+num_cols])

In [15]:
df.isnull().sum()

survived       0
pclass         0
name           0
sex            0
age            0
sibsp          0
parch          0
ticket         0
fare           0
cabin        687
embarked       0
new_cabin      0
dtype: int64

In [16]:
for col in ["sex","embarked"]:
    df[col] = encoders[col].inverse_transform(df[col].round().astype(int))

In [17]:
df.head()

Unnamed: 0,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,new_cabin
0,0.0,3.0,"Braund, Mr. Owen Harris",male,22.0,1.0,0.0,A/5 21171,7.25,,S,0.0
1,1.0,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1.0,0.0,PC 17599,71.2833,C85,C,1.0
2,1.0,3.0,"Heikkinen, Miss. Laina",female,26.0,0.0,0.0,STON/O2. 3101282,7.925,,S,0.0
3,1.0,1.0,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1.0,0.0,113803,53.1,C123,S,1.0
4,0.0,3.0,"Allen, Mr. William Henry",male,35.0,0.0,0.0,373450,8.05,,S,0.0


In [18]:
df["new_cabin"] = df["cabin"].notnull().astype(int)
df["name_letter_count"] = df["name"].str.len()
df["name_word_count"] = df["name"].apply(lambda x: len(str(x).split()))
df["new_name_dr"] = df["name"].apply(lambda x: len([a for a in x.split() if a.startswith("Dr.")]))
df["new_title"] = df["name"].str.extract(" ([A-Za-z]+)\.",expand = False)
df["new_family_size"] = df["sibsp"] + df["parch"] +1
df["new_age_pclass"] = df["age"] * df["pclass"]
df.loc[(df["sibsp"]+df["parch"])>0,"is_alone"] = 0
df.loc[(df["sibsp"]+df["parch"]==0),"is_alone"] = 1
df.loc[(df["age"]<18),"new_age_cat"] = "young"
df.loc[(df["age"]>=18) & (df["age"]<50),"new_age_cat"] = "mature"
df.loc[(df["age"]>=50),"new_age_cat"] = "senior"

df.loc[(df["sex"]=="male") & (df["age"]<=21),"new_sex_cat"] ="youngmale"
df.loc[(df["sex"]=="male") & (df["age"]>21) & (df["age"]<=50),"new_sex_cat"] = "maturemale"
df.loc[(df["sex"]=="male") & (df["age"]>50),"new_sex_cat"] = "seniormale"

df.loc[(df["sex"]=="female") & (df["age"]<=21),"new_sex_cat"] ="youngfemale"
df.loc[(df["sex"]=="female") & (df["age"]>21) & (df["age"]<=50),"new_sex_cat"] = "maturefemale"
df.loc[(df["sex"]=="female") & (df["age"]>50),"new_sex_cat"] = "seniorfemale"
df.shape

(891, 21)

In [19]:
def grab_col_names(df,cat_th=10,car_th=20):
    num_cols = [col for col in df.columns if np.issubdtype(df[col].dtype, np.number)]
    cat_cols = [col for col in df.columns if col not in num_cols]
    num_but_cat =  [col for col in num_cols if df[col].nunique()<cat_th]
    cat_but_car = [col for col in cat_cols if df[col].nunique()>car_th]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]
    num_cols = [col for col in num_cols if col not in num_but_cat]
    print(f"cat_cols = {len(cat_cols)}")
    print(f"num_cols ={len(num_cols)}")
    print(f"cat_but_car = {len(cat_but_car)}")
    print(f"num_but_cat={len(num_but_cat)}")
        
    return cat_cols, num_cols,cat_but_car

cat_cols, num_cols,cat_but_car=grab_col_names(df)

cat_cols = 14
num_cols =4
cat_but_car = 3
num_but_cat=9


In [20]:
def outliers(df,variable):
    q1 = df[variable].quantile(0.05)
    q3 = df[variable].quantile(0.95)
    iqr = q3-q1
    lower_lim = q1 - 1.5*iqr
    upper_lim = q3 + 1.5*iqr
    return lower_lim, upper_lim

def check_outliers(df, variable, index=False):
    lower_lim, upper_lim = outliers(df, variable)
    
    if index:
        return df.loc[(df[variable] < lower_lim) | (df[variable] > upper_lim)].index.tolist()
    
    else:
        if df.loc[(df[variable] < lower_lim) | (df[variable] > upper_lim)].any(axis=None):
            return True
        else:
            return False

In [21]:
for col in num_cols:
    print(col,check_outliers(df,col))

age False
fare True
name_letter_count False
new_age_pclass False


In [22]:
def replace_outliers(df,variable):
    lower_lim, upper_lim = outliers(df,variable)
    df[variable].clip(lower=lower_lim, upper=upper_lim,inplace=True)
replace_outliers(df,"fare")

In [23]:
for col in num_cols:
    print(col,check_outliers(df,col))

age False
fare False
name_letter_count False
new_age_pclass False


In [24]:
remove_cols=list(df[cat_but_car].columns)
remove_cols

['name', 'ticket', 'cabin']

In [25]:
def missing_values_table(df,na_name=False):
    na_cols = [col for col in df.columns if df[col].isnull().sum()>0]
    n_miss = df[na_cols].isnull().sum().sort_values(ascending=False)
    ratio = (df[na_cols].isnull().sum() / len(df)*100).sort_values(ascending=False)
    missing_df = pd.concat([n_miss,np.round(ratio,2)],axis=1,keys=["n_miss","ratio"])
    
    return na_cols,missing_df

na_cols,missing_df = missing_values_table(df)
missing_df

Unnamed: 0,n_miss,ratio
cabin,687,77.1


In [26]:
df.drop(remove_cols,axis=1,inplace=True)

In [27]:
df[["new_sex_cat","age"]]

Unnamed: 0,new_sex_cat,age
0,maturemale,22.000000
1,maturefemale,38.000000
2,maturefemale,26.000000
3,maturefemale,35.000000
4,maturemale,35.000000
...,...,...
886,maturemale,27.000000
887,youngfemale,19.000000
888,youngfemale,11.462794
889,maturemale,26.000000


In [28]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,new_cabin,name_letter_count,name_word_count,new_name_dr,new_title,new_family_size,new_age_pclass,is_alone,new_age_cat,new_sex_cat
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,0,23,4,0,Mr,2.0,66.0,0.0,mature,maturemale
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,1,51,7,0,Mrs,2.0,38.0,0.0,mature,maturefemale
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,0,22,3,0,Miss,1.0,78.0,1.0,mature,maturefemale
3,1.0,1.0,female,35.0,1.0,0.0,53.1,S,1,44,7,0,Mrs,2.0,35.0,0.0,mature,maturefemale
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S,0,24,4,0,Mr,1.0,105.0,1.0,mature,maturemale


In [29]:
binary_cols = [col for col in df.columns if not np.issubdtype(df[col],np.number) and df[col].nunique()==2]
binary_cols

['sex']

In [30]:
def label_encoder(df,binary_col):
    le = LabelEncoder()
    df[binary_col] = le.fit_transform(df[binary_col])
    
for col in binary_cols:
    label_encoder(df,col)

In [32]:
df[cat_cols]

Unnamed: 0,sex,embarked,new_title,new_age_cat,new_sex_cat,survived,pclass,sibsp,parch,new_cabin,name_word_count,new_name_dr,new_family_size,is_alone
0,1,S,Mr,mature,maturemale,0.0,3.0,1.0,0.0,0,4,0,2.0,0.0
1,0,C,Mrs,mature,maturefemale,1.0,1.0,1.0,0.0,1,7,0,2.0,0.0
2,0,S,Miss,mature,maturefemale,1.0,3.0,0.0,0.0,0,3,0,1.0,1.0
3,0,S,Mrs,mature,maturefemale,1.0,1.0,1.0,0.0,1,7,0,2.0,0.0
4,1,S,Mr,mature,maturemale,0.0,3.0,0.0,0.0,0,4,0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,1,S,Rev,mature,maturemale,0.0,2.0,0.0,0.0,0,3,0,1.0,1.0
887,0,S,Miss,mature,youngfemale,1.0,1.0,0.0,0.0,1,4,0,1.0,1.0
888,0,S,Miss,young,youngfemale,0.0,3.0,1.0,2.0,0,5,0,4.0,0.0
889,1,C,Mr,mature,maturemale,1.0,1.0,0.0,0.0,1,4,0,1.0,1.0


In [35]:
df.shape

(891, 18)

In [33]:
df[num_cols]

Unnamed: 0,age,fare,name_letter_count,new_age_pclass
0,22.000000,7.2500,23,66.000000
1,38.000000,71.2833,51,38.000000
2,26.000000,7.9250,22,78.000000
3,35.000000,53.1000,44,35.000000
4,35.000000,8.0500,24,105.000000
...,...,...,...,...
886,27.000000,13.0000,21,54.000000
887,19.000000,30.0000,28,19.000000
888,11.462794,23.4500,40,34.388383
889,26.000000,30.0000,21,26.000000


In [None]:
def rare_analysis(df,target,cat_cols):
    for col in cat_cols:
        print(col,":",len(df[col].value_counts()))
        df_ratio = (df[col].value_counts() /len(df)).sort_values(ascending=False)
        df_count = df[col].value_counts().loc[df_ratio.index]
        df_target_mean = df.groupby(col)[target].mean().loc[df_ratio.index]
        print(pd.DataFrame({"Ratio": df_ratio,
              "Count": df_count,
              "Target_Mean": df_target_mean}))
        print("-------------------------------------------------------")
        
rare_analysis(df,"survived",cat_cols)

In [38]:
def merge_c(df, cols, target_col, threshold):
    temp_df = df.copy()
    for col in cols:
        unique_categories = temp_df[col].value_counts().index.tolist()

        # Eğer yalnızca bir tane kategori varsa, birleştirme işlemine gerek yoktur
        if len(unique_categories) > 1:
            for category in unique_categories:
                category_freq = temp_df[temp_df[col] == category].shape[0] / len(temp_df)

                # Eğer kategori frekansı eşik değerinin altındaysa, birleştirme işlemi gerçekleştirilir
                if category_freq < threshold:
                    # Kendi kategorisini ve hedef ortalamalarını al
                    own_target_mean = temp_df[temp_df[col] == category][target_col].mean()
                    
                    # Kendi dışındaki kategorilerin hedef ortalamalarını al
                    other_categories = temp_df.loc[temp_df[col] != category, [col, target_col]]
                    other_target_means = other_categories.groupby(col)[target_col].mean()
                    
                    # Kendi hedef ortalamasına en yakın olan kategoriyi bul
                    closest_category = (other_target_means - own_target_mean).abs().idxmin()
                    
                    # Kendi kategorisini, en yakın olan kategori ile değiştir
                    temp_df[col] = temp_df[col].replace(category, closest_category)
            
    return temp_df

df2=merge_c(df,cat_cols,"survived",0.05)

In [None]:
rare_analysis(df2,"survived",cat_cols)

In [41]:
from feature_engine.selection import DropDuplicateFeatures, DropConstantFeatures
sel = DropConstantFeatures(tol=0.95,variables=None,missing_values="raise")
sel.fit(df2)
sel.features_to_drop_

['new_name_dr']

In [42]:
df2 = sel.transform(df2)

In [45]:
[col for col in df2.columns if not np.issubdtype(df[col],np.number) or df[col].nunique()<10]

['survived',
 'pclass',
 'sex',
 'sibsp',
 'parch',
 'embarked',
 'new_cabin',
 'name_word_count',
 'new_title',
 'new_family_size',
 'is_alone',
 'new_age_cat',
 'new_sex_cat']

In [48]:
ohe_cols=[col for col in df2.columns if not np.issubdtype(df[col],np.number)]
ohe_cols

['embarked', 'new_title', 'new_age_cat', 'new_sex_cat']

In [47]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,new_cabin,name_letter_count,name_word_count,new_name_dr,new_title,new_family_size,new_age_pclass,is_alone,new_age_cat,new_sex_cat
0,0.0,3.0,1,22.0,1.0,0.0,7.25,S,0,23,4,0,Mr,2.0,66.0,0.0,mature,maturemale
1,1.0,1.0,0,38.0,1.0,0.0,71.2833,C,1,51,7,0,Mrs,2.0,38.0,0.0,mature,maturefemale
2,1.0,3.0,0,26.0,0.0,0.0,7.925,S,0,22,3,0,Miss,1.0,78.0,1.0,mature,maturefemale
3,1.0,1.0,0,35.0,1.0,0.0,53.1,S,1,44,7,0,Mrs,2.0,35.0,0.0,mature,maturefemale
4,0.0,3.0,1,35.0,0.0,0.0,8.05,S,0,24,4,0,Mr,1.0,105.0,1.0,mature,maturemale


In [49]:
from feature_engine.encoding import OneHotEncoder
ohe = OneHotEncoder(variables=ohe_cols,drop_last=True)
df2 = ohe.fit_transform(df2)

In [52]:
df2.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,new_cabin,name_letter_count,name_word_count,...,embarked_C,new_title_Mr,new_title_Mrs,new_title_Miss,new_age_cat_mature,new_age_cat_senior,new_sex_cat_maturemale,new_sex_cat_maturefemale,new_sex_cat_seniormale,new_sex_cat_youngmale
0,0.0,3.0,1,22.0,1.0,0.0,7.25,0,23,4,...,0,1,0,0,1,0,1,0,0,0
1,1.0,1.0,0,38.0,1.0,0.0,71.2833,1,51,6,...,1,0,1,0,1,0,0,1,0,0
2,1.0,3.0,0,26.0,0.0,0.0,7.925,0,22,3,...,0,0,0,1,1,0,0,1,0,0
3,1.0,1.0,0,35.0,1.0,0.0,53.1,1,44,6,...,0,0,1,0,1,0,0,1,0,0
4,0.0,3.0,1,35.0,0.0,0.0,8.05,0,24,4,...,0,1,0,0,1,0,1,0,0,0


In [55]:
from sklearn.preprocessing import RobustScaler
rb = RobustScaler().set_output(transform ="pandas")
df3 = rb.fit_transform(df2)

In [56]:
df3.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,new_cabin,name_letter_count,name_word_count,...,embarked_C,new_title_Mr,new_title_Mrs,new_title_Miss,new_age_cat_mature,new_age_cat_senior,new_sex_cat_maturemale,new_sex_cat_maturefemale,new_sex_cat_seniormale,new_sex_cat_youngmale
0,0.0,0.0,0.0,-0.338337,1.0,0.0,-0.312011,0.0,-0.2,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,-2.0,-1.0,0.563896,1.0,0.0,2.461242,1.0,2.6,2.0,...,1.0,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,-1.0,-0.112779,0.0,0.0,-0.282777,0.0,-0.3,-1.0,...,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,-2.0,-1.0,0.394727,1.0,0.0,1.673732,1.0,1.9,2.0,...,0.0,-1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.394727,0.0,0.0,-0.277363,0.0,-0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [139]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
X = df3.drop("survived",axis=1)
y = df3["survived"]
rf = RandomForestClassifier(n_estimators=300,max_depth=6)
xg=XGBClassifier(n_estimators=300,max_depth=6)
cross_val_score(rf,X,y,cv=10,n_jobs=-1,scoring="roc_auc").mean()

0.8824362674362675

In [140]:
cross_val_score(xg,X,y,cv=10,n_jobs=-1,scoring="roc_auc").mean()

0.8788117024587614

In [141]:
cross_val_score(rf,X,y,cv=10,n_jobs=-1,scoring="accuracy").mean()

0.8395255930087391

In [142]:
cross_val_score(xg,X,y,cv=10,n_jobs=-1,scoring="accuracy").mean()

0.8305992509363296

In [116]:
rf.fit(X,y)

In [117]:
pd.Series(rf.feature_importances_,index=X.columns).sort_values(ascending=False)

new_title_Mr                0.165016
sex                         0.114775
new_age_pclass              0.098898
new_sex_cat_maturefemale    0.074149
fare                        0.073003
pclass                      0.072316
name_letter_count           0.059474
age                         0.055245
new_title_Miss              0.045460
new_sex_cat_maturemale      0.040447
new_cabin                   0.039839
new_title_Mrs               0.032663
new_family_size             0.031644
name_word_count             0.023078
embarked_S                  0.013058
parch                       0.010582
is_alone                    0.009604
embarked_C                  0.009033
sibsp                       0.008319
new_age_cat_mature          0.008011
new_sex_cat_youngmale       0.007286
new_sex_cat_seniormale      0.005082
new_age_cat_senior          0.003017
dtype: float64

In [118]:
df

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,new_cabin,name_letter_count,name_word_count,new_name_dr,new_title,new_family_size,new_age_pclass,is_alone,new_age_cat,new_sex_cat
0,0.0,3.0,1,22.000000,1.0,0.0,7.2500,S,0,23,4,0,Mr,2.0,66.000000,0.0,mature,maturemale
1,1.0,1.0,0,38.000000,1.0,0.0,71.2833,C,1,51,7,0,Mrs,2.0,38.000000,0.0,mature,maturefemale
2,1.0,3.0,0,26.000000,0.0,0.0,7.9250,S,0,22,3,0,Miss,1.0,78.000000,1.0,mature,maturefemale
3,1.0,1.0,0,35.000000,1.0,0.0,53.1000,S,1,44,7,0,Mrs,2.0,35.000000,0.0,mature,maturefemale
4,0.0,3.0,1,35.000000,0.0,0.0,8.0500,S,0,24,4,0,Mr,1.0,105.000000,1.0,mature,maturemale
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,1,27.000000,0.0,0.0,13.0000,S,0,21,3,0,Rev,1.0,54.000000,1.0,mature,maturemale
887,1.0,1.0,0,19.000000,0.0,0.0,30.0000,S,1,28,4,0,Miss,1.0,19.000000,1.0,mature,youngfemale
888,0.0,3.0,0,11.462794,1.0,2.0,23.4500,S,0,40,5,0,Miss,4.0,34.388383,0.0,young,youngfemale
889,1.0,1.0,1,26.000000,0.0,0.0,30.0000,C,1,21,4,0,Mr,1.0,26.000000,1.0,mature,maturemale


In [119]:
dff=df.iloc[:,:9]
ohe = OneHotEncoder(variables="embarked",drop_last=True)
dff = ohe.fit_transform(dff)

In [120]:
dff

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,new_cabin,embarked_S,embarked_C
0,0.0,3.0,1,22.000000,1.0,0.0,7.2500,0,1,0
1,1.0,1.0,0,38.000000,1.0,0.0,71.2833,1,0,1
2,1.0,3.0,0,26.000000,0.0,0.0,7.9250,0,1,0
3,1.0,1.0,0,35.000000,1.0,0.0,53.1000,1,1,0
4,0.0,3.0,1,35.000000,0.0,0.0,8.0500,0,1,0
...,...,...,...,...,...,...,...,...,...,...
886,0.0,2.0,1,27.000000,0.0,0.0,13.0000,0,1,0
887,1.0,1.0,0,19.000000,0.0,0.0,30.0000,1,1,0
888,0.0,3.0,0,11.462794,1.0,2.0,23.4500,0,1,0
889,1.0,1.0,1,26.000000,0.0,0.0,30.0000,1,0,1


In [131]:
X = dff.drop("survived",axis=1)
y = dff["survived"]
cross_val_score(rf,X,y,cv=10,n_jobs=-1,scoring="roc_auc").mean()

0.8781229946524064

In [132]:
cross_val_score(rf,X,y,cv=10,n_jobs=-1,scoring="accuracy").mean()

0.827191011235955

In [133]:
cross_val_score(xg,X,y,cv=10,n_jobs=-1,scoring="roc_auc").mean()

0.8778072319837026

In [134]:
cross_val_score(xg,X,y,cv=10,n_jobs=-1,scoring="accuracy").mean()

0.8305493133583021

In [125]:
rf.fit(X,y)

In [126]:
pd.Series(rf.feature_importances_,index=X.columns).sort_values(ascending=False)

sex           0.401752
fare          0.162860
age           0.157545
pclass        0.110532
new_cabin     0.062529
sibsp         0.043789
parch         0.031372
embarked_S    0.016848
embarked_C    0.012773
dtype: float64