# Dataframes and scikit-learn

In [None]:
# this cell's metadata contains
# "nbsphinx": "hidden" so it is hidden by nbsphinx
PATH_YIELD_ENGINE = 'src'


def set_paths() -> None:
    """Move to the project root once and put the source directory on ``sys.path``.

    On the first call (no module-level ``cwd`` exists yet) the working directory
    is changed three levels up and the target is remembered in the global
    ``cwd``; subsequent calls leave the working directory alone, so re-running
    the cell does not keep climbing the directory tree.

    ``PATH_YIELD_ENGINE`` is inserted at the front of ``sys.path`` only if it is
    not already listed, and the printed message reflects what actually happened.
    """
    import os
    import sys

    global cwd

    if 'cwd' not in globals():
        cwd = os.path.join(os.getcwd(), os.pardir, os.pardir, os.pardir)
        os.chdir(cwd)
    print(f"working dir is '{os.getcwd()}'")

    if PATH_YIELD_ENGINE in sys.path:
        # nothing was added, so do not claim that sys.path[0] was
        print(f"`{PATH_YIELD_ENGINE}` is already in python paths")
    else:
        sys.path.insert(0, PATH_YIELD_ENGINE)
        print(f"added `{PATH_YIELD_ENGINE}` to python paths")


set_paths()

In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from pandas.api.types import is_numeric_dtype

from gamma.sklearndf.pipeline import PipelineDF
from gamma.sklearndf.transformation import SimpleImputerDF, OneHotEncoderDF, ColumnTransformerDF
from gamma.sklearndf.transformation.extra import BorutaDF
from gamma.sklearndf.regression import RandomForestRegressorDF, LGBMRegressorDF
from gamma.sklearndf.classification import RandomForestClassifierDF

We load our data:

In [None]:
# Read the Ames housing training set, discard the identifier and sale-date
# columns, and separate the target from the features.
TARGET = "SalePrice"
df = pd.read_csv('data/ames-housing-dataset/train.csv').drop(
    ['Id', 'YrSold', 'MoSold'], axis=1
)
y = df[TARGET]
df = df.drop(columns=TARGET)

The data contains categorical features and missing values:

In [None]:
# distinct values of the 'GarageType' column, as a plain Python list
df['GarageType'].unique().tolist()

In [None]:
# number of missing entries in the 'GarageType' column
df['GarageType'].isna().sum()

Let us build a preprocessing pipeline which:

- for categorical variables, fills missing values with the string 'nan' and then one-hot encodes them
- for numerical variables, fills missing values with the median

In [None]:
# names of the object-dtype columns, treated as categorical features
categorical_features = df.select_dtypes([object]).columns
# names of the numeric columns; `Series.items()` is used because
# `Series.iteritems()` was removed in pandas 2.0
numerical_features = [col for col, dtype in df.dtypes.items() if is_numeric_dtype(dtype)]

## An sklearn pipeline
We first build the preprocessing pipeline with scikit-learn:

In [None]:
# categorical branch: replace missing entries with the literal string 'nan',
# then one-hot encode into a dense (non-sparse) result
ohe = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="nan")),
        ("ohe", OneHotEncoder(sparse=False)),
    ]
)

In [None]:
# numerical branch: impute missing values using the "median" strategy
imputer = SimpleImputer(strategy="median")

In [None]:
# send the categorical columns through `ohe` and the numerical columns
# through `imputer`
preprocessing = ColumnTransformer(
    transformers=[
        ("categorical", ohe, categorical_features),
        ("numerical", imputer, numerical_features),
    ]
)

In [None]:
# fit the scikit-learn preprocessing on df and transform df in a single call
transformed_array = preprocessing.fit_transform(df)

In [None]:
# display the result of the scikit-learn preprocessing
# NOTE(review): presumably a plain array without column names - confirm
transformed_array

## An sklearndf pipeline

In [None]:
# sklearndf counterpart of the categorical branch: constant 'nan' imputation
# followed by dense one-hot encoding with handle_unknown='ignore'
ohe_df = PipelineDF(
    steps=[
        ("imputer", SimpleImputerDF(strategy="constant", fill_value="nan")),
        ("ohe", OneHotEncoderDF(sparse=False, handle_unknown="ignore")),
    ]
)

In [None]:
# sklearndf counterpart of the numerical branch: "median" imputation strategy
imputer_df = SimpleImputerDF(strategy="median")

In [None]:
# send the categorical columns through `ohe_df` and the numerical columns
# through `imputer_df`
preprocessing_df = ColumnTransformerDF(
    transformers=[
        ("categorical", ohe_df, categorical_features),
        ("numerical", imputer_df, numerical_features),
    ]
)

In [None]:
# fit the sklearndf preprocessing on df and transform df in a single call
transformed_df = preprocessing_df.fit_transform(df)

In [None]:
# preview the first rows of the sklearndf output
transformed_df.head()

In [None]:
# preview `columns_original`
# NOTE(review): presumably maps each output column to the input column it
# was derived from - confirm against the sklearndf documentation
preprocessing_df.columns_original.head()

Hence it is easy to select columns coming from a given feature:

In [None]:
# boolean mask selecting the entries of `columns_original` equal to 'GarageType'
mask = preprocessing_df.columns_original == 'GarageType'
# keep only the masked columns of the transformed frame and preview them
transformed_df.loc[:, mask].head()

## Regressor

As with transformers, scikit-learn regressors have a sklearndf sibling obtained by appending **DF** to the class name, and the API remains the same:

In [None]:
# Define the train and test sets. The random forest is only given numerical
# values here, so we keep the numeric columns and fill missing values with 0
# for now.
df_numerical = df.select_dtypes(include='number').fillna(0)
(
    df_numerical_train,
    df_numerical_test,
    y_train,
    y_test,
) = train_test_split(df_numerical, y, random_state=0)

# fit on the training part, then report the score on the held-out part
random_forest_regressor_df = RandomForestRegressorDF(
    n_estimators=50,
    random_state=0,
)
random_forest_regressor_df.fit(df_numerical_train, y_train)
random_forest_regressor_df.score(df_numerical_test, y_test)

In [None]:
# inspect the estimator's parameters
random_forest_regressor_df.get_params()

In [None]:
# set the `max_depth` parameter to 10
random_forest_regressor_df.set_params(max_depth=10)

In [None]:
# NOTE(review): presumably the wrapped native scikit-learn estimator - confirm
random_forest_regressor_df.delegate_estimator

In [None]:
# NOTE(review): presumably the input columns seen when fitting - confirm
random_forest_regressor_df.columns_in

In [None]:
# NOTE(review): presumably True once `fit` has been called - confirm
random_forest_regressor_df.is_fitted

## Classifier

Classifiers follow the same logic:

In [None]:
# Binary classification target: 1 for houses priced above 200 000, else 0.
# The comparison is vectorised; the result is cast to int64 so the labels are
# integers rather than booleans.
y_classification = (y > 200000).astype("int64")

# split features and classification target with a fixed seed
(
    df_numerical_train,
    df_numerical_test,
    y_classification_train,
    y_classification_test,
) = train_test_split(df_numerical, y_classification, random_state=0)

In [None]:
# A fixed random_state (as in the regressor cell) makes the displayed score
# reproducible from run to run.
random_forest_classifier_df = RandomForestClassifierDF(n_estimators=50, random_state=0)
random_forest_classifier_df.fit(df_numerical_train, y_classification_train)
random_forest_classifier_df.score(df_numerical_test, y_classification_test)

## Pipeline

In [None]:
# categorical branch: constant 'nan' imputation followed by one-hot encoding
ohe_df = PipelineDF(
    steps=[
        ("imputer", SimpleImputerDF(strategy="constant", fill_value="nan")),
        ("ohe", OneHotEncoderDF(sparse=False, handle_unknown="ignore")),
    ]
)

# numerical branch: "median" imputation strategy
imputer_df = SimpleImputerDF(strategy="median")

# apply each branch to its own group of columns
preprocessing_df = ColumnTransformerDF(
    transformers=[
        ("categorical", ohe_df, categorical_features),
        ("numerical", imputer_df, numerical_features),
    ]
)

# full pipeline: preprocessing followed by a random forest; the fixed
# random_state makes the fitted model and its score reproducible
pipeline_df = PipelineDF(
    steps=[
        ("preprocessing", preprocessing_df),
        ("rf_model", RandomForestRegressorDF(n_estimators=10, random_state=0)),
    ]
)

In [None]:
# split the full feature frame (categorical columns included) with a fixed seed
(
    df_train,
    df_test,
    y_train,
    y_test,
) = train_test_split(df, y, random_state=0)

# fit the whole pipeline on the training part and score it on the test part
pipeline_df.fit(df_train, y_train)
pipeline_df.score(df_test, y_test)

## Extra

In [None]:
# Split the numerical features and the regression target together in this
# cell, so that features and target are guaranteed to cover the same rows
# instead of relying on splits made in other cells.
df_numerical_train, df_numerical_test, y_train, y_test = train_test_split(
    df_numerical, y, random_state=0
)

lgbm_df = LGBMRegressorDF(n_estimators=50, max_depth=10)
lgbm_df.fit(df_numerical_train, y_train)
lgbm_df.score(df_numerical_test, y_test)