In [1]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import sys

sys.path.append('..')
from Libraries.Utils.Global_Utils import *
from Libraries.ClusterMaster import *
from Libraries.DataFrameTypes import *
from Libraries.DataAnalysis import *
from Libraries.DataCleaner import *
from xgboost import XGBClassifier
import ipython_blocking 

In [2]:
%matplotlib notebook
%matplotlib inline

# Set-Up Phase

## Import dataset

In [3]:
df = pd.read_csv("Datasets/titanic_train.csv")
display(df.shape)
df.head()

(891, 12)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Remove Un-Wanted Columns

In [4]:
df.drop(columns=['Name',"Ticket", "PassengerId"],
        inplace=True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


## Basic Feature manipulation

### Change cabin column to have the level on the ship

In [5]:
df["Cabin"] = df["Cabin"].str.replace(r'\d+', '').str[0]

## Change Feature Data types

### Look at data types

In [6]:
df.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object

### Make given data type changes

In [7]:
df["Survived"] = df["Survived"].astype(bool)

### Final look at data types

In [8]:
df.dtypes

Survived       bool
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object

## Set up DataFrameTypes

In [9]:
df_features = DataFrameTypes(df,
                             target_col="Sex")

Categorical Features: {'Cabin', 'Sex', 'Embarked'}

Bool Features: {'Survived'}

Possible One hot encoded feature names: {'Cabin': ['Cabin_E', 'Cabin_G', 'Cabin_D', 'Cabin_F', 'Cabin_T', 'Cabin_B', 'Cabin_A', 'Cabin_C'], 'Sex': ['Sex_male', 'Sex_female'], 'Embarked': ['Embarked_Q', 'Embarked_S', 'Embarked_C']}

------------------------------------------------------------------------------------------
Numerical Features: {'Pclass', 'SibSp', 'Parch', 'Fare', 'Age'}

Integer Features: {'Pclass', 'SibSp', 'Parch'}

Float Features: {'Fare', 'Age'}

Target Feature: Sex



### Skim through value_counts

In [10]:
for col in df.columns:
    if col not in df_features.get_float_features() or len(np.unique(df[col].dropna().values)) <= 25:
        display(df[col].value_counts())
        print("***" * 4 + "\n\n")

False    549
True     342
Name: Survived, dtype: int64

************




3    491
1    216
2    184
Name: Pclass, dtype: int64

************




male      577
female    314
Name: Sex, dtype: int64

************




0    608
1    209
2     28
4     18
3     16
8      7
5      5
Name: SibSp, dtype: int64

************




0    678
1    118
2     80
5      5
3      5
4      4
6      1
Name: Parch, dtype: int64

************




C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64

************




S    644
C    168
Q     77
Name: Embarked, dtype: int64

************




### Perform quick analysis

In [11]:
# analysis_obj = DataAnalysis(df,
#                             df_features,
#                             project_name="General Analysis")

### ERIC: WEEKEND PROJECT FINISH DATACLEANER!
6-7 HOURS

In [12]:
file = open("Production Data/General Analysis/Feature_Analysis/Graphics/Distance_Plot_Age.png", "rb")
image = file.read()
widgets.Image(
    value=image,
    format='png',
    width=700,
    height=560,
)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x02\xd1\x00\x00\x01\xf0\x08\x06\x00\x00\x00`\xd13\t\…

In [13]:
data_cleaner = DataCleaner(df,
                           df_features,
                           project_name="General Analysis",
                           missing_data_graphing=False)

In [14]:
submit_button,data_cleaning_df = data_cleaner.data_cleaning_widget(df,
                                                                   df_features)

interactive(children=(Select(description='Features', options=('Age', 'Cabin', 'Embarked'), value='Age'), Selec…

Button(description='Run', style=ButtonStyle())

In [15]:
if True:
    %block submit_button

In [16]:
df.drop(columns=["Cabin"],
        inplace=True)

In [17]:
df_features = DataframeTypeHolder(df,
                                  target_col="Survived")

NameError: name 'DataframeTypeHolder' is not defined

In [None]:
df_features.get_all_features()

In [None]:
%matplotlib notebook
%matplotlib inline

In [None]:
df, le_map = encode_df(df,
                       df_features)
df.head()

In [None]:
DataAnalysis(df)

In [None]:
df[df_features.get_target()].value_counts()

In [None]:
print_encoder_map(le_map)

In [None]:
X = np.array(df.drop(df_features.get_target(),
                     axis=1,
                     inplace=False))
y = np.array(df[df_features.get_target()])

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                                    test_size=0.30,
                                                    random_state=42,
                                                    stratify=y)
X = None
y = None

In [None]:
display(X_train.shape)
display(y_train.shape)
display(y_test.shape)
display(X_test.shape)

In [None]:
params = {
    'objective':['binary:logistic'],
    'learning_rate':[.001,.01,.1,.15],
    'min_child_weight': [5,6,7,8,9,10],
    'gamma': [.15,.2,.1,.05,.01,.2,.25,.3],
    'subsample': [0.1,0.2,.05,.15,.02,.01],
    'colsample_bytree': [0.6,0.2,0.1,.05],
    'max_depth': [2,3,4,5,6,7],
    'n_estimators': [50,100,200,300,500,600,700],
}
best_xgb,xgb_best_parms = optimize_model_grid(
    model=XGBClassifier(),
    X_train=X_train,
    y_train=y_train,
    cv=5,
    param_grid=params,
    n_jobs=9,
)

In [None]:

# Tuned Parameters: {'colsample_bytree': 0.6, 'gamma': 0.15, 'learning_rate': 0.001, 'max_depth': 2, 'min_child_weight': 5, 'n_estimators': 50, 'objective': 'binary:logistic', 'subsample': 0.1}

In [None]:
df_features.get_bool_features()

In [None]:
df = pd.read_csv("Datasets/testing.csv")
display(df.shape)
df.head()

In [None]:
df["broker_id"].value_counts()