# 0.0 Imputation

### fill_na
    # Numeric column: fill gaps with the column mean.
    age_mean = df['Age'].mean()
    df['Age'] = df['Age'].fillna(age_mean)
    # Categorical column: fill gaps with the most frequent value
    # (mode() returns a Series, [0] picks its first entry).
    top_color = df['Color'].mode()[0]
    df['Color'] = df['Color'].fillna(top_color)

### machine_learning
    from sklearn.impute import KNNImputer

    # Toy 4x3 matrix with two missing cells to demonstrate the imputer.
    X = [
        [1, 2, np.nan],
        [3, 4, 3],
        [np.nan, 6, 5],
        [8, 8, 7],
    ]
    feature_names = ['feat_1', 'feat_2', 'feat_3']
    df = pd.DataFrame(X, columns=feature_names)

    # Each missing cell is filled from the 3 nearest rows.
    imputer = KNNImputer(n_neighbors=3)
    imputer.fit_transform(df)

# 0.1 Feature Selection


### Forward (Sequential Feature Selection)
    from sklearn.feature_selection import SequentialFeatureSelector

    # Sequential selection wrapped around a 3-nearest-neighbours classifier;
    # the float n_features_to_select asks for a fraction of the features.
    knn = KNeighborsClassifier(n_neighbors=3)
    sfs = SequentialFeatureSelector(estimator=knn, n_features_to_select=.8)
    sfs.fit(X_train, y_train)
    # Boolean support mask -> names of the selected columns.
    selected_mask = sfs.get_support()
    X_train.columns[selected_mask]


### Backward (Recursive Feature Elimination)
    from sklearn.feature_selection import RFE, RFECV
    from sklearn.linear_model import LogisticRegression

    # Recursive feature elimination around a logistic regression:
    # remove one feature per round (step=1) until seven remain.
    estimator = LogisticRegression()
    selector = RFE(estimator=estimator, step=1, n_features_to_select=7)
    selector.fit(X_train, y_train)

    # Boolean support mask -> names of the surviving columns.
    kept_mask = selector.get_support()
    X_train.columns[kept_mask]

# 0.2 Sampling


### Over sampling
    from imblearn.over_sampling import SMOTE

    # A float sampling_strategy is the target minority/majority ratio after
    # resampling (binary targets only); k_neighbors is the number of
    # neighbours used to synthesise new minority samples.
    oversampler = SMOTE(sampling_strategy=0.5, k_neighbors=6)
    # Resample the training split only: features and labels must come from
    # the same split, hence y_train (not y) alongside X_train.
    X_over, y_over = oversampler.fit_resample(X_train, y_train)


# 1 Log Reg

## Params
    # Search space: regularisation parameter C crossed with the penalty type.
    c_values = [0.1, 0.001, 0.0001, 1, 10, 100]
    penalties = ['l1', 'l2']
    param_grid = {'C': c_values, 'penalty': penalties}

## Scorers
    # Wrap each metric function in a scorer, keyed by the metric's name.
    scorers = {
        label: make_scorer(metric)
        for label, metric in (
            ('precision_score', precision_score),
            ('recall_score', recall_score),
            ('accuracy_score', accuracy_score),
        )
    }

## Hyper Param Search
    # Exhaustive search over param_grid. No `scoring` argument is given here,
    # so the `scorers` dict defined above is not used by this search.
    clf_grid = GridSearchCV(
        estimator=LogisticRegression(solver="liblinear"),
        param_grid=param_grid,
        refit=True,
        verbose=3,
    )


# 2 SVM

## Params
    # Search space for an RBF-kernel SVM: regularisation C crossed with gamma.
    c_values = [0.1, 0.001, 0.0001, 10, 100, 1000]
    gamma_values = [1, 0.1, 0.01, 0.001, 0.0001, 10, 100]
    param_grid = {'C': c_values, 'gamma': gamma_values, 'kernel': ['rbf']}

## Scorers
    # Wrap each metric function in a scorer, keyed by the metric's name.
    scorers = {
        label: make_scorer(metric)
        for label, metric in (
            ('precision_score', precision_score),
            ('recall_score', recall_score),
            ('accuracy_score', accuracy_score),
        )
    }

## Hyper Param Search
    # 10-fold grid search recording every metric in `scorers`; the final
    # model is refit using the best 'precision_score' candidate.
    clf_grid = GridSearchCV(
        estimator=SVC(),
        param_grid=param_grid,
        scoring=scorers,
        refit='precision_score',
        cv=10,
        verbose=3,
    )

## Time Series
    # Randomised search for an SVR, cross-validated with TimeSeriesSplit
    # (5 splits) instead of a plain k-fold.
    RandomizedSearchCV(
        estimator=SVR(),
        param_distributions={
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
            'kernel': ['rbf'],
        },
        cv=TimeSeriesSplit(n_splits=5, max_train_size=None, test_size=None, gap=0),
    )

# 3 Trees and Forests


## Params
    # Tree search space: integer depths 5..19, and fractional (0.1..0.5)
    # values for the split / leaf size constraints.
    depth_options = range(5, 20)
    split_fractions = np.arange(0.1, .6, .1)
    leaf_fractions = np.arange(0.1, .6, .1)
    param_grid = {
        'max_depth': depth_options,
        'min_samples_split': split_fractions,
        'min_samples_leaf': leaf_fractions,
    }

## Scorers
    # Wrap each metric function in a scorer, keyed by the metric's name.
    scorers = {
        label: make_scorer(metric)
        for label, metric in (
            ('precision_score', precision_score),
            ('recall_score', recall_score),
            ('accuracy_score', accuracy_score),
        )
    }

## Hyper Param Search

    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn import tree

    # param_grid holds decision-tree hyper-parameters (max_depth,
    # min_samples_split, min_samples_leaf) and the scorers are classification
    # metrics, so the searched estimator must be a tree classifier; SVC does
    # not accept these parameters.
    # 10-fold search, refit on the best 'precision_score' candidate.
    clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid, scoring=scorers, cv=10, refit='precision_score', verbose = 3)

## Plotting Trees

    # Fit a shallow tree on the first 100 rows only - for visualization
    # purposes, not as a real model.
    X_sample = X_train.head(100)
    y_sample = y_train.head(100)
    regressor = DecisionTreeRegressor(max_depth=5, random_state=0)
    regressor.fit(X_sample, y_sample)

    # Score on the same 100 rows the tree was fitted on.
    r_squared = regressor.score(X_sample, y_sample)
    print(f"R-squared {r_squared}")

    # Large, high-dpi canvas so the node labels stay legible.
    fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(15, 15), dpi=500)
    tree.plot_tree(regressor, feature_names=X_train.columns, filled=True);

## Time Series
    from sklearn.model_selection import RandomizedSearchCV

    # Same tree search space as above: depths 5..19 and fractional
    # split / leaf constraints.
    depth_options = range(5, 20)
    split_fractions = np.arange(0.1, .6, .1)
    leaf_fractions = np.arange(0.1, .6, .1)
    param_grid = {
        'max_depth': depth_options,
        'min_samples_split': split_fractions,
        'min_samples_leaf': leaf_fractions,
    }

    # Cross-validate with TimeSeriesSplit (5 splits) and sample 100 candidates.
    tss = TimeSeriesSplit(n_splits=5, max_train_size=None, test_size=None, gap=0)
    regressor = DecisionTreeRegressor()
    reg = RandomizedSearchCV(
        estimator=regressor,
        param_distributions=param_grid,
        n_iter=100,
        cv=tss,
    )
    reg.fit(X_train, y_train)

# 4 Ensembles


## Stacking
    from sklearn.ensemble import VotingClassifier
    from sklearn.ensemble import VotingRegressor

    # Assumes lr_model and knn_model were created (and trained) earlier.
    # NOTE(review): VotingClassifier.fit fits clones of the estimators it is
    # given, so pre-training is presumably not required - confirm in the docs.
    # NOTE(review): this combines models by voting; scikit-learn's stacking
    # estimator is StackingClassifier.
    
    # Each ensemble member is a (name, estimator) pair.
    eclf = VotingClassifier([('lr', lr_model), ('knn', knn_model)])
    eclf.fit(X_train, y_train)

## Grid Searched Stacking
    # Nested parameters are addressed as "<estimator name>__<parameter>",
    # using the names given to the ensemble members below.
    logreg_space = {
        'lr__C': [0.1, 1, 10],
        "lr__penalty": ["l1", "l2"],
        "lr__solver": ["liblinear"],
    }
    knn_space = {"knn__n_neighbors": range(5, 20)}
    param_grid = {**logreg_space, **knn_space}

    # Unfitted members; the grid search fits the whole ensemble per candidate.
    members = [
        ('lr', LogisticRegression()),
        ('knn', KNeighborsClassifier()),
    ]
    eclf = VotingClassifier(estimators=members)
    eclf_grid = GridSearchCV(estimator=eclf, param_grid=param_grid)
    eclf_grid.fit(X_train, y_train)

## Random Forest Bagging
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor

    # Forest search space: tree counts 100..140, depths 5..45, and
    # fractional (0.1..0.5) split / leaf / feature constraints.
    tree_counts = range(100, 150, 10)
    depth_options = range(5, 50, 5)
    split_fractions = np.arange(.1, .6, .1)
    leaf_fractions = np.arange(.1, .6, .1)
    feature_fractions = np.arange(0.1, 0.6, .1)
    param_grid = {
        "n_estimators": tree_counts,
        "max_depth": depth_options,
        "min_samples_split": split_fractions,
        "min_samples_leaf": leaf_fractions,
        "max_features": feature_fractions,
    }

    # Sample 25 candidates from the space rather than trying all of them.
    clf = RandomForestClassifier()
    clf_grid = RandomizedSearchCV(
        estimator=clf,
        param_distributions=param_grid,
        n_iter=25,
    )
    clf_grid.fit(X_train, y_train)

### Plotting Feature Importance
    # Pair each training column with the importance reported by the best
    # estimator of the search, most important first.
    importances = clf_grid.best_estimator_.feature_importances_
    feat_df = pd.DataFrame({
        'feature': X_train.columns,
        'importance': importances,
    })
    feat_df = feat_df.sort_values('importance', ascending=False)

    # Horizontal bars: feature names on the y axis, importance on the x axis.
    sns.barplot(x=feat_df['importance'], y=feat_df['feature'])

## XGB Boosting
    # DMatrix containers for the train / test splits. The scikit-learn style
    # search below is fitted on X_train / y_train directly.
    dtrain = xgb.DMatrix(data=X_train, label=y_train)
    dtest = xgb.DMatrix(data=X_test, label=y_test)

    # Candidate values sampled by the randomised search.
    tree_counts = range(100, 500, 50)
    depth_options = range(5, 50, 5)
    gamma_values = np.arange(0.5, 2, .2)
    params = {
        'n_estimators': tree_counts,
        'max_depth': depth_options,
        'learning_rate': [0.1, 0.001, 0.00001, 1],
        'gamma': gamma_values,
        'reg_alpha': [0, 0.5, 1],
        'reg_lambda': [1, 1.5, 2, 3, 4.5],
    }

    # Binary classifier evaluated with log-loss; try 10 sampled candidates.
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric="logloss",
        use_label_encoder=False,
    )
    clf_grid = RandomizedSearchCV(estimator=clf, param_distributions=params, n_iter=10)
    clf_grid.fit(X_train, y_train)

# 5 Neural Networks

    from sklearn.neural_network import MLPClassifier


# Pipelines
    from sklearn.pipeline import Pipeline 
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OneHotEncoder

### Imputation
    # Numeric columns: fill missing values with the column mean.
    numeric_steps = [
        ('impute', SimpleImputer(strategy="mean")),
    ]
    numeric_imputer = Pipeline(numeric_steps)

    # Categorical columns: fill with the most frequent value, then one-hot
    # encode into a dense array, ignoring categories unseen during fit.
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output` - confirm against the installed version.
    categorical_steps = [
        ('impute', SimpleImputer(strategy="most_frequent")),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse=False)),
    ]
    categorical_imputer = Pipeline(categorical_steps)

#### Sample Imputation
    # Learn the imputation statistics first, then apply them to the same
    # columns (two explicit steps: fit, then transform).
    numeric_block = X_train[cols]
    numeric_imputer.fit(numeric_block)
    numeric_imputer.transform(numeric_block)

### Combining Different Imputations
    from sklearn.compose import ColumnTransformer

    # Split the column names by dtype: numeric vs. object (string-like).
    numeric_frame = X_train.select_dtypes(include='number')
    object_frame = X_train.select_dtypes(include='object')
    numerical_features = numeric_frame.columns.tolist()
    categorical_features = object_frame.columns.tolist()

    # Show which columns landed in which group.
    for feature_group in (numerical_features, categorical_features):
        print(feature_group)

    # Route each group of columns through its own imputation pipeline.
    num_cat_imputer = ColumnTransformer([
        ('numeric', numeric_imputer, numerical_features),
        ('categorical', categorical_imputer, categorical_features),
    ])

### Pipeline Making
    # Chain preprocessing and the model so both are fitted in one call.
    log_reg = LogisticRegression()
    pipeline_steps = [
        ('preprocess', num_cat_imputer),
        ('model', log_reg),
    ]
    logreg_pipeline = Pipeline(pipeline_steps)

    logreg_pipeline.fit(X_train, y_train)


### Multiple Models in Pipeline
    from sklearn.ensemble import RandomForestClassifier

    # One sub-grid per model family. The 'classifier' key swaps the estimator
    # in the pipeline step of that name; 'classifier__*' keys tune it.
    logreg_space = {
        'classifier': [LogisticRegression()],
        'classifier__C': [0.1, 0.001, 1],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear'],
    }
    forest_space = {
        'classifier': [RandomForestClassifier()],
        'classifier__n_estimators': range(100, 200, 25),
        'classifier__max_depth': range(10, 50, 10),
        'classifier__min_samples_split': np.arange(0.1, .6, .15),
        'classifier__min_samples_leaf': np.arange(0.1, .6, .15),
        'classifier__max_features': np.arange(0.1, .6, .15),
    }
    param_dict = [logreg_space, forest_space]

    # The LogisticRegression here is only a placeholder for the
    # 'classifier' step; the search replaces it per candidate.
    pipeline = Pipeline([
        ('preprocess', num_cat_imputer),
        ('classifier', LogisticRegression()),
    ])

    # 10-fold grid search across both sub-grids.
    search = GridSearchCV(estimator=pipeline, param_grid=param_dict, cv=10, verbose=0)
    search.fit(X_train, y_train)

# Search Feature Importance Plotting

    # Bar chart of the first row of the best estimator's coefficients, one
    # bar per training column, each annotated with its numeric value.
    # Requires a fitted search whose best estimator exposes `coef_`
    # (e.g. a logistic-regression search).
    cols = X_train.columns.to_list()
    fig, ax = plt.subplots(figsize=(15,10))
    # Draw on the axes created above so the annotations target the same plot.
    sns.barplot(x=clf_grid.best_estimator_.coef_[0], y=cols, ax=ax)

    for p in ax.patches:
        width = p.get_width()    # bar length, i.e. the coefficient value
        ax.text(width - .5,       # x: 0.5 data units left of the bar's end
                p.get_y() + p.get_height() / 2, # y: vertical centre of the bar
                '{:1.2f}'.format(width), # label text, 2 decimals
                ha = 'left',   # horizontal alignment
                va = 'center', # vertical alignment
                fontsize=13)


# Shap
### Pipeline
    import shap

    # Pull the two steps out of the best pipeline found by the search:
    # step 0 is the preprocessor, step 1 is the fitted model.
    # NOTE(review): TreeExplainer presumably needs a tree-based model here -
    # confirm the search selected one.
    best_pipeline = search.best_estimator_
    preprocessor = best_pipeline.steps[0][1]
    model = best_pipeline.steps[1][1]

    # The explainer gets the transformed training data as its second argument.
    explainer = shap.TreeExplainer(
        model,
        preprocessor.transform(X_train),
        feature_perturbation='interventional',
        model_output='probability',
    )
    shap_values = explainer.shap_values(preprocessor.transform(X_test))

    # Index 0 on the last axis is plotted.
    # NOTE(review): presumably that axis is the class - verify the
    # shap_values layout for the installed shap version.
    shap.summary_plot(shap_values[:,:,0], preprocessor.transform(X_test))