# Notebook example

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so imported modules
# (including the project code under src/) are reloaded before each cell executes
# and edits to them are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import warnings
# Install a catch-all 'ignore' filter: no category or module is given, so every
# warning raised after this cell is suppressed, deprecation warnings included.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd

from src.data_source.spreadsheet import Spreadsheet
from src.preprocessing import Preprocessing
from src.model import TrainerSklearn

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Read the raw train.csv through the project's Spreadsheet data source.
# The result is used as a pandas DataFrame below (.columns, .head(), .drop()).
df = Spreadsheet().get_data('../data/raw/train.csv')

In [None]:
df.columns

In [None]:
# Single Preprocessing instance shared by the modelling section: it transforms df,
# is handed to TrainerSklearn().train(preprocessing=p), and is read as a global by predict_new().
p = Preprocessing()

In [None]:
# Apply p.clean_data first and feed its result to p.categ_encoding.
# NOTE: df is rebound to the processed frame, so re-running this cell would
# process already-processed data; reload df before executing it again.
df = p.categ_encoding(p.clean_data(df))

In [None]:
df.head()

In [None]:
# Target is the "Survived" column; every other column of df goes into X.
y = df["Survived"]
X = df.drop(columns="Survived")

In [None]:
# Manual hold-out split; X_test is reused by the prediction and interpretation cells.
# Keep test_size and random_state identical to the values passed to TrainerSklearn().train().
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Train a RandomForestClassifier through the project trainer. test_size (0.3) and
# random_state (123) match the values used in the manual train_test_split cell.
rf = TrainerSklearn().train(
    X,
    y,
    classification=True,
    algorithm=RandomForestClassifier,
    preprocessing=p,
    data_split=('train_test', {'test_size': 0.3}),
    random_state=123,
)

In [None]:
rf.get_metrics()

In [None]:
rf.get_columns()

In [None]:
rf.predict_proba(X_test, binary=True)

In [None]:
# Predicting new data
def predict_new(X, model, probs=True, preprocessing=None):
    """Preprocess new rows and score them with a trained model.

    Args:
        X: pandas DataFrame holding the new observations. It is copied before
            any processing, so the caller's frame is never modified.
        model: trained model wrapper exposing ``get_columns()``,
            ``predict_proba(X)`` and ``predict(X)``.
        probs: when True (default) return ``model.predict_proba(X)``;
            otherwise return ``model.predict(X)``.
        preprocessing: object exposing ``clean_data(X)`` and
            ``categ_encoding(X)``. Defaults to the notebook-level ``p``.

    Returns:
        Whatever ``model.predict_proba`` / ``model.predict`` returns for the
        prepared frame.
    """
    prep = p if preprocessing is None else preprocessing

    # Copy up front: the preprocessing steps and the column fill below may work
    # in place, and the caller's frame must stay untouched.
    X = prep.clean_data(X.copy())
    X = prep.categ_encoding(X)

    # Columns reported by the model but absent from the new data are added and
    # filled with 0; columns already present are left as they are.
    missing = [col for col in model.get_columns() if col not in X.columns]
    for col in missing:
        X[col] = 0

    return model.predict_proba(X) if probs else model.predict(X)

In [None]:
# One-row frame of new input (index label 0); predict_new() runs the preprocessing on it.
new_data = pd.DataFrame(
    {'Pclass': [3], 'Sex': ['male'], 'Age': [4]},
    index=[0],
)

new_data

In [None]:
predict_new(new_data, rf)

**Get local explainer for each instance:**

In [None]:
# Get the local explainer output for the rows of X_test. The second argument is
# the number of columns in X_test (presumably how many features to report —
# confirm against local_interpret).
res = rf.local_interpret(X_test, X_test.shape[1])

In [None]:
res

**Data Quality:**

In [None]:
from src.preprocessing import DataDrift
import great_expectations as ge

In [None]:
# Reload the raw CSV: df was rebound to the cleaned/encoded frame in the
# modelling section, and the data-quality checks below start from the raw file.
df = Spreadsheet().get_data('../data/raw/train.csv')

In [None]:
# Split the reloaded frame as a whole (the 'Survived' column is still in it).
# NOTE: this rebinds X_train / X_test, replacing the splits made in the modelling section.
X_train, X_test = train_test_split(df, random_state=123, test_size=0.3)
tuple(part.shape for part in (X_train, X_test))

In [None]:
# Configure the project's DataDrift checker with 'Sex' and 'Pclass' passed as
# discrete_cat_cols, then run it on the training split with 'Survived' as target.
# The returned object exposes save_expectation_suite() (used in the next cell),
# so it is presumably a Great Expectations dataset — TODO confirm.
dq = DataDrift(discrete_cat_cols=['Sex', 'Pclass'])
df_ge = dq.check(X_train, target='Survived')

In [None]:
# Persist the expectation suite to disk; the validation cell below passes this
# same path to validate().
df_ge.save_expectation_suite('../data/output/expectations.json')

In [None]:
# Validate the hold-out split against the saved expectation suite.
# The target column is removed by rebinding X_test to a new frame instead of
# dropping in place, and errors='ignore' keeps the cell re-runnable once
# 'Survived' is already gone (a plain drop would raise KeyError on a second run).
X_test = X_test.drop(columns=['Survived'], errors='ignore')
df_ge = ge.dataset.PandasDataset(X_test)
ge_val = df_ge.validate(expectation_suite='../data/output/expectations.json', only_return_failures=False)

In [None]:
ge_val

**Hypothesis testing**

In [None]:
import pandas as pd
from src.analysis import HTestAutoPilot, HypothesisTester

In [None]:
# Load the raw CSV again for this section (same file as in the earlier sections).
df = Spreadsheet().get_data('../data/raw/train.csv')

In [None]:
# Age values with missing entries removed, one array per outcome
# (Survived == 1 and Survived == 0).
survived_age = df.loc[df.Survived == 1, 'Age'].dropna().values
not_survived_age = df.loc[df.Survived == 0, 'Age'].dropna().values

In [None]:
# When you want to compare two distributions but are not sure whether the data
# satisfies the conditions of a parametric test, use the HTestAutoPilot class:
# it helps with that decision.
HTestAutoPilot.independent_difference(survived_age, not_survived_age, label1='Survived', label2='Not Survived')

In [None]:
# When you already know which test to use, call it directly on the
# HypothesisTester class (here t_test, with show_graph=False).
HypothesisTester.t_test(survived_age, not_survived_age, show_graph=False)

**Dimensionality Reduction**

In [None]:
import pandas as pd
import numpy as np
from src.analysis import DimensionalityReducer

df = pd.DataFrame({'Col_1': [-1, -2, -3, 1, 2, 3], 'Col_2': [-1, -1, -2, 1, 1, 2], 'Col_3': [-5, -4, -3, 2, 1, 1]})

import pandas as pd
import numpy as np
from src.analysis import DimensionalityReducer

In [None]:
import pandas as pd
import numpy as np
from src.analysis import DimensionalityReducer

In [None]:
df = pd.DataFrame({'Col_1': [-1, -2, -3, 1, 2, 3], 'Col_2': [-1, -1, -2, 1, 1, 2], 'Col_3': [-5, -4, -3, 2, 1, 1]})

In [None]:
df

In [None]:
# Build the project's DimensionalityReducer with reducer='pca', restricted to
# Col_1 and Col_2, and k=1 (presumably the number of output components —
# TODO confirm against DimensionalityReducer).
dm = DimensionalityReducer(reducer='pca', columns=['Col_1', 'Col_2'], k=1)

In [None]:
dm.fit_transform(df, y='Col_3')