# Import libs

In [1]:
import pandas as pd
import numpy as np 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import sys

sys.path.append('..')
from eFlow.ClusterMaster import *
from eFlow.DataFrameTypes import *
from eFlow.Analysis.DataAnalysis import *
from eFlow.PipelineSegments.DataCleaner import *
from xgboost import XGBClassifier
import ipython_blocking
from pivottablejs import pivot_ui
import scikitplot as skplt

In [None]:
# Select the matplotlib backend for the notebook. Both magics run in order.
# NOTE(review): the second magic presumably overrides the first, leaving the
# inline backend active — confirm which one is actually wanted.
%matplotlib notebook
%matplotlib inline

### Be sure to run the following

## Declare Workflow Variables

#### (This should be the only place you should have to declare anything)

In [None]:
# Workflow configuration shared by the cells below.
dataset_path = "Datasets/titanic_train.csv"  # CSV file loaded into `df` with pd.read_csv
target_column = "Survived"  # label column handed to DataFrameTypes and used to build `y`
parent_project_name = "Pre processing"  # prefix of every project_name passed to the eFlow objects
prediction_method = "Classification"  # NOTE(review): not referenced by any visible cell
notebook_mode = True  # NOTE(review): not referenced by any visible cell

# Import dataset

In [None]:
# Load the CSV named by `dataset_path`, show the frame's shape, and preview
# its first rows (the trailing expression is rendered by the notebook).
df = pd.read_csv(dataset_path)
display(df.shape)
df.head()

## Interaction tool for dataframes

### Explore the data

In [None]:
# Open the pivottablejs interactive pivot table for `df`.
# NOTE(review): the output file name is spelled "Piviot"; left untouched
# because renaming it changes which file is written.
pivot_ui(df,
         outfile_path='Piviot_Table_JS.html')

In [None]:
# w = widgets.SelectMultiple(
#     options=['Apples', 'Oranges', 'Pears'],
#     value=['Oranges'],
#     #rows=10,
#     description='Fruits',
#     disabled=False
# )
# del w

In [None]:
# Print the Unicode rightwards arrow (U+2192); a plain literal is already a
# ``str`` in Python 3.
a = "\u2192"
print(a)

## Remove Unwanted Columns

### Do not remove NaNs yet; let the data cleaner do its job

In [None]:
# Drop the three listed columns from `df` in place (no new frame is
# created), then preview the remaining columns.
df.drop(columns=["Name",
                 "Ticket",
                 "PassengerId"],
        inplace=True)
df.head()

In [None]:
from dateutil import parser
# Parse one sample date string with dateutil.
# NOTE(review): `dt` is not used by any visible cell.
dt = parser.parse("Aug 28 1999 12:00AM")

In [None]:
# Add a column holding the same date string in every row; assigning a scalar
# broadcasts it to the full length of the frame.
df["Date_test"] = "2019-01-02"
# Put a missing value in the row labelled 0. A single .loc call is used
# because chained indexing (df[col][0] = ...) can assign to a temporary copy
# and leave `df` unchanged.
df.loc[0, "Date_test"] = np.nan
# df["Date_test"] = [parser.parse(val)for val in df["Date_test"].value_counts().keys()]

## Basic Feature manipulation

### Change cabin column to have the level on the ship

In [None]:
# Reduce each cabin code to a single character: remove every run of digits,
# then keep the first remaining character. `regex=True` is spelled out because
# the default for `str.replace` differs between pandas versions, and without
# it the pattern may be matched as a literal string.
df["Cabin"] = df["Cabin"].str.replace(r'\d+', '', regex=True).str[0]

## Change Feature Data types

### Look at data types

In [None]:
# Show the dtype of every column before any manual type changes.
df.dtypes

### Make given data type changes

In [None]:
# df["Pclass"] = df["Pclass"].replace(1, np.nan)

### Final look at data types

In [None]:
# Show the column dtypes again after the (currently commented-out) changes.
df.dtypes

## Set up DataFrameTypes

In [None]:
# Build the eFlow DataFrameTypes object for `df`; later cells query it through
# get_float_features() and get_numerical_features().
# NOTE(review): ignore_nulls=True presumably makes type detection skip NaNs —
# confirm against the eFlow implementation.
df_features = DataFrameTypes(df,
                             target_column=target_column,
                             ignore_nulls=True)

## Skim through Value Counts

In [None]:
# Show the value counts of every column that is not a float feature and has
# at most 12 distinct non-null values.
# The float-feature lookup does not depend on the loop variable, so fetch it once.
float_features = df_features.get_float_features()
for col in df.columns:
    if col in float_features:
        continue
    # nunique() ignores NaN by default, matching a count taken after dropna().
    if df[col].nunique() <= 12:
        display(df[col].value_counts())
        print("***" * 4 + "\n\n")

### Perform quick analysis

In [None]:
# Construct the eFlow DataAnalysis object for the uncleaned frame. Its project
# name is "<parent_project_name>/General Analysis (Before Cleaning)" and the
# missing-data visuals are switched off.
# NOTE(review): presumably the analysis runs inside the constructor — confirm.
analysis_obj = DataAnalysis(df,
                            df_features,
                            project_name=parent_project_name + "/" + "General Analysis (Before Cleaning)",
                            missing_data_visuals=False)

## Data Cleaning

In [None]:
# Construct the eFlow DataCleaner for `df` under the project name
# "<parent_project_name>/Data Cleaning", with missing-data visuals enabled.
data_cleaner = DataCleaner(df,
                           project_name=parent_project_name + "/" + "Data Cleaning",
                           missing_data_visuals=True)

In [None]:
# Start the cleaner's widget for `df`, passing the feature-type object.
# NOTE(review): presumably interactive and in need of a live kernel — confirm.
data_cleaner.data_cleaning_widget(df,
                                  df_features)

In [None]:
# Display what the cleaner reports as its last saved JSON file path.
data_cleaner.get_last_saved_json_file_path()

In [None]:
# Run the cleaner on `df` using the JSON file at the last saved path.
# NOTE(review): presumably replays the choices made in the widget and edits
# `df` in place — confirm against eFlow.
data_cleaner.data_cleaning_with_json_file(df,
                                          data_cleaner.get_last_saved_json_file_path())

In [None]:
from scipy import stats
# Renumber the rows in place. Without `inplace=True` (or re-assignment)
# reset_index returns a new frame and `df` itself is left untouched.
df.reset_index(drop=True, inplace=True)
# Z-scores of the non-null ages; `z_score_return` lines up position by
# position with `ages`.
ages = df["Age"].dropna()
z_score_return = stats.zscore(ages)
# Display only the ages whose z-score lies in [-2, 2].
ages[(z_score_return >= -2) & (z_score_return <= 2)]

In [None]:
from impyute.imputation.cs import mice

# NOTE(review): `a` is not read by any visible cell.
a = df["Age"].tolist()
# start the MICE training
# NOTE(review): this passes every column of `df`, including "Date_test", which
# was filled with strings earlier. A later cell restricts the input to the
# numerical features instead — confirm this call is still wanted.
imputed_training=mice(df.values)

In [None]:
import datawig

# Split the rows of `df` into a training and a testing partition.
df_train, df_test = datawig.utils.random_split(df)

# Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Fare', 'Cabin','Embarked'], # column(s) containing information about the column we want to impute
    output_column='Age', # the column we'd like to impute values for
    output_path='imputer_model' # stores model data and metrics
    )

# Fit the imputer on the training partition only. Fitting on the whole of
# `df` would leave `df_train` unused and let the model see the rows held out
# in `df_test`.
imputer.fit(train_df=df_train, num_epochs=200)

In [None]:
# Run MICE on only the columns that df_features reports as numerical, then
# display the first row of the result.
imputed_training=mice(df[df_features.get_numerical_features()].values)
imputed_training[0]

In [None]:
# !pip install datawig
# !pip install opencv-python
# !pip install Pillow
# !pip install tesserocr

In [None]:
import random
# Work on a copy of `df` without four columns and without any row that still
# has a missing value.
test = df.drop(columns=["Sex", "Date_test", "Embarked", "Cabin"]).dropna()

# Overwrite the label with a random integer class in 0..40 (both inclusive)
# for every row.
test[target_column] = [random.randint(0, 40) for _ in range(0,test.shape[0])]
print(len(test[target_column]))

y = test[target_column].values
# The feature matrix must not contain the label column, otherwise the model
# is trained on the very value it is asked to predict.
X = test.drop(columns=[target_column]).values

print(X.shape)
print(y.shape)

In [None]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5187
)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from eFlow.ToolBox.Modeling import *
# Find best parameters for model
# Candidate values: max_depth 2..7, min_samples_leaf 1, 6, ..., 31, and both
# split criteria.
param_grid = {
    "max_depth": list(range(2, 8)),
    "min_samples_leaf": list(range(1, 35, 5)),
    "criterion": ["gini", "entropy"],
#     "n_splits": [10]
}

# optimize_model_grid hands back two values, unpacked here as the model and
# its best parameters.
# NOTE(review): presumably provided by the eFlow.ToolBox.Modeling star import
# and returning an already-fitted estimator — confirm.
model, best_params = optimize_model_grid(
    model=DecisionTreeClassifier(),
    X_train=X_train, y_train=y_train,
    param_grid=param_grid
)
model

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the selected model's predictions on the held-out test rows.
accuracy_score(y_test, model.predict(X_test))

In [None]:
from eFlow._Hidden.Objects.enum import enum
from eFlow.Utils.SysUtils import *
from eFlow.Utils.Constants import *

class SupervisedAnalysis:
    """Draw evaluation plots for an already-fitted supervised model.

    All of the work is triggered from the constructor. For classification it
    draws a confusion matrix for the training data and one for the testing
    data and, when the model offers ``predict_proba``, a ROC plot (plus a
    KS-statistic plot when exactly two classes are present). Each confusion
    matrix is handed to ``create_plt_png`` together with the project output
    folder and the model name.
    """

    def __init__(self,
                 model,
                 X_train,
                 X_test,
                 y_train,
                 y_test,
                 model_name,
                 prediction_type="Classification",
                 project_name="Supervised Analysis",
                 overwrite_full_path=None,
                 notebook_mode=True,
                 normalize_confusion_matrix=True):
        """Set up the output location and produce the plots.

        Args:
            model: Estimator exposing ``predict`` and, optionally,
                ``predict_proba``.
            X_train: Feature matrix the training confusion matrix is
                computed from.
            X_test: Feature matrix used for the testing confusion matrix and
                the probability plots.
            y_train: True labels belonging to ``X_train``.
            y_test: True labels belonging to ``X_test``.
            model_name: Name passed on to ``create_plt_png`` for every saved
                figure.
            prediction_type: ``"Classification"`` produces the plots;
                ``"Regression"`` currently does nothing; any other value
                prints an error message. The historical misspelling
                ``"Classifcation"`` is still accepted so existing callers
                keep working.
            project_name: Folder placed below
                ``SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME`` inside the
                current working directory.
            overwrite_full_path: When truthy, used verbatim as the output
                folder instead of the path derived from ``project_name``.
            notebook_mode: When true, each confusion matrix is shown with
                ``plt.show()`` before the figure is closed.
            normalize_confusion_matrix: Forwarded as ``normalize`` to the
                confusion-matrix plots.
        """
        # The train/test data is only used for the plots below; it is not
        # stored on the instance.
        self.__model = model
        self.__model_name = copy.deepcopy(model_name)
        self.__notebook_mode = notebook_mode

        # Setup project structure
        if not overwrite_full_path:
            parent_structure = "/" + SYS_CONSTANTS.PARENT_OUTPUT_FOLDER_NAME \
                               + "/" + project_name + "/"
            self.__PROJECT = enum(PATH_TO_OUTPUT_FOLDER=
                                  os.getcwd() + parent_structure)
        else:
            self.__PROJECT = enum(PATH_TO_OUTPUT_FOLDER=overwrite_full_path)

        # The check used to match only the misspelling, so the documented
        # default "Classification" ended up in the error branch.
        if prediction_type in ("Classification", "Classifcation"):

            self.confusion_matrix(X_train,
                                  y_train,
                                  title="Confusion Matrix: Training Data",
                                  normalize=normalize_confusion_matrix)

            self.confusion_matrix(X_test,
                                  y_test,
                                  title="Confusion Matrix: Testing Data",
                                  normalize=normalize_confusion_matrix)

            # Two distinct labels across both splits means a binary problem.
            self.__binary_classification = \
                len(set(y_train) | set(y_test)) == 2

            # Probability-based plots are best effort: a model without
            # ``predict_proba`` is skipped. Only that call is guarded so an
            # AttributeError raised while plotting is not hidden.
            try:
                y_probas = model.predict_proba(X_test)
            except AttributeError:
                y_probas = None

            if y_probas is not None:
                skplt.metrics.plot_roc(y_test, y_probas, figsize=(10, 8))

                if self.__binary_classification:
                    skplt.metrics.plot_ks_statistic(y_test, y_probas,
                                                    figsize=(10, 8))

        elif prediction_type == "Regression":
            pass
        else:
            print("ERROR: unsupported prediction_type {!r}; expected "
                  "'Classification' or 'Regression'.".format(prediction_type))

    def confusion_matrix(self,
                         X,
                         y,
                         figsize=(10,8),
                         title=None,
                         filename=None,
                         normalize=True):
        """Plot, save and optionally show the confusion matrix for ``X``/``y``.

        Args:
            X: Feature matrix passed to the model's ``predict``.
            y: True labels belonging to ``X``.
            figsize: Figure size forwarded to scikit-plot.
            title: Plot title; when falsy scikit-plot's own default is used.
            filename: Name given to ``create_plt_png``. When it is ``None``
                and a title is set, the name is derived from the title;
                otherwise ``"Confusion_Matrix"`` is used.
            normalize: Forwarded to scikit-plot's ``normalize`` option.
        """
        plot_options = {"figsize": figsize, "normalize": normalize}
        if title:
            plot_options["title"] = title

        # scikit-plot expects (y_true, y_pred). Passing them the other way
        # round transposes the matrix and normalises over the wrong axis.
        skplt.metrics.plot_confusion_matrix(y,
                                            self.__model.predict(X),
                                            **plot_options)

        if filename:
            png_name = filename
        elif filename is None and title:
            png_name = convert_to_file_name(
                title.replace(":", "").replace(" ", "_"))
        else:
            png_name = "Confusion_Matrix"

        create_plt_png(self.__PROJECT.PATH_TO_OUTPUT_FOLDER,
                       self.__model_name,
                       png_name)

        if self.__notebook_mode:
            plt.show()
        plt.close()

# Run the analysis for the tuned model. The model name is the text in front
# of the first "(" of the estimator's repr.
# NOTE: prediction_type keeps the spelling "Classifcation" because that is a
# literal the class's classification branch checks for.
SupervisedAnalysis(model=model,
                   model_name=repr(model).split("(")[0],
                   X_train=X_train,
                   X_test=X_test,
                   y_train=y_train,
                   y_test=y_test,
                   project_name=parent_project_name + "/" + "SupervisedAnalysis",
                   prediction_type="Classifcation")

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from collections import defaultdict

In [None]:
# Request the classification report for the test predictions as a dict
# (output_dict=True) and wrap it in a DataFrame for display.
report = pd.DataFrame(classification_report(y_test,
                                            model.predict(X_test),output_dict=True))
report 

In [None]:
# Show the type of the object returned by df.values.
type(df.values)

In [None]:
# Display the frame's underlying values.
df.values

In [None]:
# Distinct values found in the first column of df.values.
np.unique(df.values[:,0])