# Dataframes and scikit-learn

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """
    Adjust the working directory and the import path for this notebook.

    On the first call (detected by the absence of a module-level ``cwd``) the
    working directory is moved three levels up from the current one, and the
    path that was used is kept in the global ``cwd``. Later calls leave the
    working directory untouched. ``PATH_YIELD_ENGINE`` is then placed at the
    front of ``sys.path`` unless it is already listed there. Both steps are
    reported on stdout.
    """
    import os
    import sys

    global cwd
    first_call = 'cwd' not in globals()
    if first_call:
        # keeping the path in a global stops a re-run of the cell from climbing further up
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE not in sys.path:
        sys.path.insert(0, PATH_YIELD_ENGINE)
    # NOTE: this reports sys.path[0] even when nothing had to be inserted
    print(f"added `{sys.path[0]}` to python paths")


set_paths()

In [None]:
import pandas as pd
# todo remove RandomForestRegressor when possible
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype

from gamma import Sample
from gamma.viz.dendrogram import DendrogramDrawer, FeatMapStyle, LineStyle
from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, MissingIndicatorDF, OneHotEncoderDF, ColumnTransformerDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.sklearndf.classification import RandomForestClassifierDF
from gamma.model.inspection import ModelInspector
from gamma.model.prediction import PredictorFitCV
from gamma.model.selection import ModelPipelineDF, ModelGrid, ModelRanker, summary_report
from gamma.model.validation import CircularCrossValidator
from gamma.yieldengine.simulation import UnivariateSimulator
from gamma.yieldengine.partition import ContinuousRangePartitioning
from gamma.yieldengine.viz import SimulationDrawer, SimulationPlotStyle

In [None]:
# Name of the column that is used as the prediction target.
TARGET = "SalePrice"

# Read the training data and discard the Id, YrSold and MoSold columns.
df = pd.read_csv('data/ames-housing-dataset/train.csv').drop(
    columns=['Id', 'YrSold', 'MoSold']
)

In [None]:
# Pull the target column (named by TARGET) out of the data frame.
y = df[TARGET]

In [None]:
# Remove the target column so that `df` only holds feature columns from here on.
df = df.drop(columns=TARGET)

## Transformers and Pipeline

Some features are categorical, such as **GarageType**, and some of them contain missing values:

In [None]:
# List the distinct values found in the GarageType column.
df.GarageType.unique()

In [None]:
# Count the missing entries in the GarageType column.
df.GarageType.isna().sum()

To one-hot encode with scikit-learn, one should first impute the missing values and then one-hot encode the categorical variables:

In [None]:
# Split the feature columns by dtype; the two groups are fed to separate
# branches of the column transformers defined later in this notebook.
categorical_features = df.select_dtypes([object]).columns
# `Series.items()` is used instead of `Series.iteritems()`: the latter was
# deprecated in pandas 1.5 and removed in pandas 2.0, while `items()` yields the
# same (column name, dtype) pairs on old and new pandas versions alike.
numerical_features = [col for col, dtype in df.dtypes.items() if is_numeric_dtype(dtype)]

### With sklearn

In [None]:
# Categorical branch built from plain scikit-learn estimators:
# missing values are first replaced by the constant string 'nan',
# then the columns are one-hot encoded into a dense (non-sparse) output.
ohe_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='nan')),
        ('ohe', OneHotEncoder(sparse=False)),
    ]
)

In [None]:
# Imputer using the "median" strategy; it is applied to the numerical columns below.
imputer = SimpleImputer(strategy="median")

In [None]:
# Route each column group to its own pre-processing:
# categorical columns go through the impute + one-hot pipeline,
# numerical columns through the median imputer.
ohe_transformer = ColumnTransformer(
    transformers=[
        ('categorical', ohe_pipeline, categorical_features),
        ('numerical', imputer, numerical_features),
    ]
)

In [None]:
# Fit the scikit-learn column transformer on the feature frame and transform it in the same call.
array_transformed = ohe_transformer.fit_transform(df)

In [None]:
# Display the transformed output.
array_transformed

In [None]:
# Display the dimensions of the transformed output.
array_transformed.shape

### With sklearndf

In [None]:
# Same categorical branch as the scikit-learn version, built from the
# gamma.sklearndf wrappers: constant 'nan' imputation followed by one-hot
# encoding. Here the encoder is additionally told to ignore unknown categories.
ohe_pipeline_df = PipelineDF(
    steps=[
        ('imputer', SimpleImputerDF(strategy='constant', fill_value='nan')),
        ('ohe', OneHotEncoderDF(sparse=False, handle_unknown='ignore')),
    ]
)

In [None]:
# gamma.sklearndf imputer using the "median" strategy; applied to the numerical columns below.
imputer_df = SimpleImputerDF(strategy="median")

In [None]:
# Data-frame based column transformer: categorical columns are imputed and
# one-hot encoded, numerical columns are median-imputed.
ohe_transformer_df = ColumnTransformerDF(
    transformers=[
        ('categorical', ohe_pipeline_df, categorical_features),
        ('numerical', imputer_df, numerical_features),
    ]
)

In [None]:
# Fit the data-frame based column transformer on the feature frame and transform it in the same call.
transformed_df = ohe_transformer_df.fit_transform(df)

In [None]:
# Preview the first rows of the transformed output.
transformed_df.head()

In [None]:
# Preview the transformer's `columns_original` attribute
# (presumably maps each output column back to its source column; confirm against gamma.sklearndf).
ohe_transformer_df.columns_original.head()

## Regressor

TODO

In [None]:
# Random forest regressor from gamma.sklearndf, created with n_estimators=50.
random_forest_regressor_df = RandomForestRegressorDF(n_estimators=50)

In [None]:
# Keep only the numeric feature columns and replace their missing values with 0.
numeric_only = df.select_dtypes(include='number')
df_numerical = numeric_only.fillna(0)

# Split features and target into a training and a test part.
# No random_state is given, so the split is not fixed across runs.
(
    df_numerical_train,
    df_numerical_test,
    y_train,
    y_test,
) = train_test_split(df_numerical, y)

In [None]:
# Fit the regressor on the training split of the numerical features.
random_forest_regressor_df.fit(df_numerical_train, y_train)

In [None]:
# Score the fitted regressor on the test split.
random_forest_regressor_df.score(df_numerical_test, y_test)

In [None]:
# Show the regressor's current parameters.
random_forest_regressor_df.get_params()

In [None]:
# Set max_depth to 10 on the existing regressor.
# NOTE(review): this runs after fit(); presumably, as in scikit-learn, it does not refit the model (confirm).
random_forest_regressor_df.set_params(max_depth=10)

In [None]:
# Inspect `delegate_estimator` (presumably the wrapped scikit-learn estimator; confirm against gamma.sklearndf).
random_forest_regressor_df.delegate_estimator

In [None]:
# Inspect `columns_in` (presumably the feature columns seen during fit; confirm against gamma.sklearndf).
random_forest_regressor_df.columns_in

In [None]:
# Inspect the `is_fitted` attribute of the regressor.
random_forest_regressor_df.is_fitted

## Classifier

TODO

In [None]:
# Binarise the target for the classification example:
# 0 for values below the median of y, 1 otherwise.
# The median is computed once up front; calling `y.median()` inside the lambda
# recomputed it for every single row.
y_median = y.median()
y_bin = y.apply(lambda value: 0 if value < y_median else 1)

In [None]:
# Random forest classifier from gamma.sklearndf, created with n_estimators=50.
random_forest_classifier_df = RandomForestClassifierDF(n_estimators=50)

In [None]:
# Fit the classifier on all numerical features (no train/test split here) against the binary target.
random_forest_classifier_df.fit(df_numerical, y_bin)

In [None]:
# Score the classifier.
# NOTE: this uses the same data the classifier was fitted on, so it is not a held-out estimate.
random_forest_classifier_df.score(df_numerical, y_bin)

In [None]:
# Show the classifier's current parameters.
random_forest_classifier_df.get_params()

In [None]:
# Set max_depth to 10 on the existing classifier.
# NOTE(review): this runs after fit(); presumably, as in scikit-learn, it does not refit the model (confirm).
random_forest_classifier_df.set_params(max_depth=10)

In [None]:
# Inspect `delegate_estimator` (presumably the wrapped scikit-learn estimator; confirm against gamma.sklearndf).
random_forest_classifier_df.delegate_estimator

In [None]:
# Inspect `columns_in` (presumably the feature columns seen during fit; confirm against gamma.sklearndf).
random_forest_classifier_df.columns_in

In [None]:
# Inspect the `is_fitted` attribute of the classifier.
random_forest_classifier_df.is_fitted

## Pipeline

TODO

In [None]:
# Fresh categorical branch for the full pipeline example: constant 'nan'
# imputation followed by one-hot encoding that ignores unknown categories.
ohe_pipeline_df = PipelineDF(
    steps=[
        ('imputer', SimpleImputerDF(strategy='constant', fill_value='nan')),
        ('ohe', OneHotEncoderDF(sparse=False, handle_unknown='ignore')),
    ]
)

In [None]:
# Fresh median imputer for the numerical columns of the full pipeline example.
imputer_df = SimpleImputerDF(strategy="median")

In [None]:
# Pre-processing step of the full pipeline: categorical columns are imputed and
# one-hot encoded, numerical columns are median-imputed.
ohe_transformer_df = ColumnTransformerDF(
    transformers=[
        ('categorical', ohe_pipeline_df, categorical_features),
        ('numerical', imputer_df, numerical_features),
    ]
)

In [None]:
# End-to-end pipeline: the column transformer prepares the features,
# then a random forest regressor (n_estimators=10) is trained on its output.
full_pipeline_df = PipelineDF(
    steps=[
        ('preprocessing', ohe_transformer_df),
        ('rf_model', RandomForestRegressorDF(n_estimators=10)),
    ]
)

In [None]:
# Split the full feature frame and the target into a training and a test part.
# This rebinds y_train / y_test from the earlier numerical-only split.
# No random_state is given, so the split is not fixed across runs.
df_train, df_test, y_train, y_test = train_test_split(df, y)

In [None]:
# Fit only the pre-processing transformer on the training split.
ohe_transformer_df.fit(df_train)

In [None]:
# Apply the fitted transformer to the test split and preview the first rows of the result.
ohe_transformer_df.transform(df_test).head()

In [None]:
# Fit the full pipeline (pre-processing followed by the random forest) on the training split.
# The 'preprocessing' step is the same `ohe_transformer_df` object that was fitted on its own earlier in the notebook.
full_pipeline_df.fit(df_train, y_train)

In [None]:
# Predict on the test split and preview the first predictions.
full_pipeline_df.predict(df_test).head()

In [None]:
# Score the fitted pipeline on the test split.
full_pipeline_df.score(df_test, y_test)

## Extra

TODO
Boruta, LGBM...