In [1]:
def missingness_summary(df, print_log=False, sort='none'):
    """Compute the fraction of missing values in every column of ``df``.

    :param df: pandas.DataFrame to inspect.
    :param print_log: when this compares equal to ``True`` the summary is
        also printed.
    :param sort: ordering of the *printed* summary only: ``'none'``,
        ``'ascending'`` or ``'descending'``. Any other value prints an error
        message instead of the summary.
    :return: the per-column missing fraction produced by ``df.apply``, always
        in the original column order (sorting never affects the return value).
    """
    def _missing_fraction(column):
        # Number of NaN entries divided by the column length.
        return column.isna().sum() / column.shape[0]

    summary = df.apply(_missing_fraction)

    # Printing is optional; the explicit comparison with True is kept so that
    # truthy values such as a non-empty string do not switch printing on.
    if not (print_log == True):
        return summary

    if sort == 'none':
        report = summary
    elif sort == 'ascending':
        report = summary.sort_values()
    elif sort == 'descending':
        report = summary.sort_values(ascending=False)
    else:
        report = 'Invalid value for sort parameter.'
    print(report)

    return summary

In [1]:
def print_vif(x):
    """Utility for checking the multicollinearity assumption.

    A constant column is added to ``x`` with ``statsmodels.api.add_constant``
    and the variance inflation factor (VIF) of every resulting column, the
    constant included, is printed.

    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned; the VIFs are printed as a pandas Series
    """
    # Local imports keep this helper self-contained when it is pasted into
    # another notebook. They must come after the docstring: a string that
    # follows other statements is not treated as the function's docstring.
    import warnings

    import pandas as pd
    import statsmodels.api as sm
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    # One VIF per column of the augmented design matrix.
    design = x.values
    vifs = [variance_inflation_factor(design, i) for i in range(x.shape[1])]

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

In [None]:
# not a function, but this is handy
#
# One-hot encodes the categorical columns, standardizes the numeric columns and
# passes every other column through unchanged, then rebuilds named DataFrames.
#
# NOTE: this cell overwrites X_train / X_test with their transformed versions,
# so it can only be run once per kernel session; re-running it needs the
# original frames to be rebuilt first.
cat_cols = ["season", "weather", "weekday", "hour"]
# Level dropped from each categorical column, in the same order as cat_cols.
drop_cats = [1, 1, "Saturday", 0]

num_cols = ["temp", "humidity", "windspeed"]

# Not assigned to a transformer: these reach the output via remainder="passthrough".
bin_cols = ["holiday"]

ct = ColumnTransformer(
    [
        # Dense output is requested on the ColumnTransformer (sparse_threshold=0)
        # instead of through OneHotEncoder's `sparse` argument, which was
        # renamed to `sparse_output` and later removed in newer scikit-learn.
        ("one_hot_encode", OneHotEncoder(drop=drop_cats), cat_cols),
        ("scale", StandardScaler(), num_cols),
    ],
    remainder="passthrough",
    sparse_threshold=0,
)

ct.fit(X_train)

X_train_trans = ct.transform(X_train)
X_test_trans = ct.transform(X_test)

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# the installed scikit-learn provides.
encoder = ct.transformers_[0][1]
if hasattr(encoder, "get_feature_names_out"):
    cat_names = encoder.get_feature_names_out(cat_cols)
else:
    cat_names = encoder.get_feature_names(cat_cols)
cat_names = list(cat_names)
# Output column order follows the transformer order, with the passthrough
# remainder last. This assumes bin_cols are the only remaining columns.
new_col_names = cat_names + num_cols + bin_cols

# Keep the original row index so the frames stay aligned with y_train / y_test.
X_train = pd.DataFrame(X_train_trans, columns=new_col_names, index=X_train.index)
X_test = pd.DataFrame(X_test_trans, columns=new_col_names, index=X_test.index)

In [1]:
def eval_preds(y_true, y_pred):
    """Print RMSE, MAE and MAPE for a set of predictions and plot fit vs. actual.

    :param y_true: observed target values, as a pandas.Series or numpy array.
    :param y_pred: predicted values of the same length. If both arguments are
        pandas objects the subtraction aligns them on their index.
    :return: nothing is returned; the three metrics are printed and a scatter
        plot of predictions against actual values is shown, with a dashed
        ``y = x`` reference line.
    """
    error = y_true - y_pred

    rmse = np.sqrt((error ** 2).mean())
    # np.abs instead of the pandas-only ``.abs()`` method, so plain numpy
    # arrays are accepted as well as Series (results are identical for Series).
    mae = np.abs(error).mean()
    # Reported as a fraction, not a percentage. Zeros in y_true produce
    # inf/nan here, as they did before.
    mape = np.abs(error / y_true).mean()

    print(f"rmse {rmse:.2f}")
    print(f"mae {mae:.2f}")
    print(f"mape {mape:.2f}")

    # Endpoints of the "perfect prediction" diagonal.
    line_pts = [y_true.min(), y_true.max()]
    plt.scatter(y_true, y_pred)
    plt.plot(line_pts, line_pts, c="red", ls="--", alpha=0.5)
    plt.xlabel("Actual")
    plt.ylabel("Fit")
    plt.show()

In [None]:
# Plot the ROC curve of `model` on the test split and report its AUC in the title.
# Score per test row: column 1 of predict_proba
# (presumably the positive-class probability of a binary classifier — confirm against model.classes_).
y_prob = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, y_prob)
# The third value returned by roc_curve is discarded; only the two rates are plotted.
fpr, tpr, _ = roc_curve(y_test, y_prob)

plt.plot(fpr, tpr)
# Gray diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], c="gray")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title(f"ROC Curve (AUC is {auc:.4f})")
plt.show()

In [5]:
def select_important_features(train_columns, model, modeltype_maybe=None, threshold=0.0):
    """Choose important features based on an importance threshold.

    Works for any fitted object that exposes ``feature_importances_`` (e.g. a
    RandomForestClassifier), and for wrappers around one: an object with a
    ``best_estimator_`` attribute (e.g. a fitted GridSearchCV) or with a
    ``steps`` list (e.g. a Pipeline, whose final step is used).

    :param train_columns: feature names, in the order of the columns the
        estimator was trained on (e.g. ``X_train.columns``).
    :param model: the fitted estimator or wrapper described above.
    :param modeltype_maybe: unused; kept so existing calls keep working. The
        estimator type is detected from the object itself.
    :param threshold: minimum importance (inclusive) a feature needs to be
        kept. The default of 0.0 keeps every feature.
    :return: pandas.DataFrame with columns ``feature`` and ``importance``,
        sorted by descending importance, containing only the kept features.
    :raises AttributeError: if no ``feature_importances_`` can be found.
    :raises ValueError: (from pandas) if ``train_columns`` and the importances
        differ in length.
    """
    estimator = model
    # Peel off search / pipeline wrappers until something exposes importances.
    while not hasattr(estimator, "feature_importances_"):
        if hasattr(estimator, "best_estimator_"):
            estimator = estimator.best_estimator_
        elif hasattr(estimator, "steps"):
            estimator = estimator.steps[-1][1]
        else:
            raise AttributeError(
                f"{type(model).__name__} does not expose feature_importances_"
            )

    importances = pd.DataFrame(
        {
            "feature": list(train_columns),
            "importance": estimator.feature_importances_,
        }
    )
    kept = importances[importances["importance"] >= threshold]
    return kept.sort_values(by="importance", ascending=False).reset_index(drop=True)

In [None]:
def plot_mesh(X, Y, h, model=None, xlabel='Loudness', ylabel='Duration'):
    """Plot a classifier's predictions over a 2-D mesh, with the data on top.

    :param X: feature data with (at least) two columns; only the first two
        are used. Data frames are converted to arrays.
    :param Y: labels used to color the scattered points.
    :param h: mesh step size. Smaller values give a finer (and more
        expensive) grid. Must be positive.
    :param model: fitted object with a ``predict`` method taking two-column
        input. Defaults to the notebook-level ``neighbors`` classifier.
    :param xlabel: label for the first feature's axis.
    :param ylabel: label for the second feature's axis.
    :return: nothing is returned; the figure is shown.
    :raises ValueError: if ``h`` is not positive.
    """
    if h <= 0:
        raise ValueError("h must be a positive mesh step size")
    if model is None:
        # Fall back to the global classifier this helper was written around.
        model = neighbors

    # Our data. Converting from data frames to arrays for the mesh.
    X = np.array(X)
    Y = np.array(Y)

    # Plot the decision boundary. We assign a color to each point in the mesh.
    # The grid extends half a unit beyond the data on every side.
    x_min = X[:, 0].min() - .5
    x_max = X[:, 0].max() + .5
    y_min = X[:, 1].min() - .5
    y_max = X[:, 1].max() + .5
    xx, yy = np.meshgrid(
        np.arange(x_min, x_max, h),
        np.arange(y_min, y_max, h)
    )
    # One prediction per mesh point: flatten the grid into (n_points, 2).
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    plt.figure(1, figsize=(6, 4))
    plt.set_cmap(plt.cm.Paired)
    plt.pcolormesh(xx, yy, Z)

    # Add the training points to the plot.
    plt.scatter(X[:, 0], X[:, 1], c=Y)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title('Mesh visualization')

    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    plt.show()

In [None]:
# get column names back after selectkbest
#
# `selector.get_support()` marks which of X's columns were kept; the kept
# names label the columns of the selected array X_best.

feature_names = list(X.columns.values)

mask = selector.get_support() #list of booleans

# The list of your K best features. A comprehension is used so that no loop
# variable named `bool` is left behind shadowing the builtin in later cells.
new_features = [name for name, keep in zip(feature_names, mask) if keep]

best_data = pd.DataFrame(X_best, columns=new_features)

In [1]:
# Horizontal bar chart of the model's feature importances, rescaled so the largest is 100.
# NOTE(review): `clf` and `X` must already exist in the kernel; the saved output
# of this cell is a NameError for `clf`.
feature_importance = clf.feature_importances_

# Make importances relative to max importance.
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Indices that order the importances from smallest to largest.
sorted_idx = np.argsort(feature_importance)
# Bar positions at 0.5, 1.5, 2.5, ... (one per feature).
pos = np.arange(sorted_idx.shape[0]) + .5
# NOTE(review): subplot(1, 2, 2) selects the second slot of a 1x2 grid and nothing
# in this cell draws in the first slot — confirm whether a single axes was intended.
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
# Label each bar with its column name, in the same sorted order as the bars.
plt.yticks(pos, X.columns[sorted_idx])
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()

NameError: name 'clf' is not defined

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import LeaveOneOutEncoder

# Replace with whatever model import(s) you're using
from xgboost import XGBClassifier


# Template: preprocessing + model pipeline tuned with a grid search.
# Fill out your column datatypes here
num_cols = []

bin_cols = []

cat_cols = []
# Passed to OneHotEncoder(drop=...): the category to drop for each entry of cat_cols.
drop_cats = []


preprocessing = ColumnTransformer([
    # Should only use one of these
    # Comment out or delete one of the below 2 lines
    # (as written, both encoders are applied to the same cat_cols)
    ('OneHotEncoder', OneHotEncoder(drop=drop_cats), cat_cols),    
    ('leaveoneoutencoder', LeaveOneOutEncoder(), cat_cols),

    # Scale numeric columns (not needed for all models but can't hurt)
    ('scaler', StandardScaler(), num_cols)
    
    # bin_cols are left untouched: remainder='passthrough' forwards every
    # column that is not listed in a transformer above
], remainder='passthrough')


pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # Choose your model and put it here
    ('model', XGBClassifier())
])


grid = {
    # Use the 'model__' prefix followed by the hyperparameter name
    # ('model' is the step name given in the Pipeline above)
    'model__n_estimators':[100, 150]    
}

# No cv/scoring arguments are passed, so GridSearchCV's defaults apply.
pipeline_cv = GridSearchCV(pipeline, grid)
pipeline_cv.fit(X_train, y_train)

# Score on the training split first, then on the held-out test split.
print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from category_encoders import LeaveOneOutEncoder
# Replace with whatever model import(s) you're using
from xgboost import XGBClassifier
# NOTE(review): this cell repeats the previous template cell line for line
# (only the blank lines differ); keep just one of the two.
# Template: preprocessing + model pipeline tuned with a grid search.
# Fill out your column datatypes here
num_cols = []
bin_cols = []
cat_cols = []
# Passed to OneHotEncoder(drop=...): the category to drop for each entry of cat_cols.
drop_cats = []
preprocessing = ColumnTransformer([
    # Should only use one of these
    # Comment out or delete one of the below 2 lines
    # (as written, both encoders are applied to the same cat_cols)
    ('OneHotEncoder', OneHotEncoder(drop=drop_cats), cat_cols),
    ('leaveoneoutencoder', LeaveOneOutEncoder(), cat_cols),
    # Scale numeric columns (not needed for all models but can't hurt)
    ('scaler', StandardScaler(), num_cols)
    # bin_cols are left untouched: remainder='passthrough' forwards every
    # column that is not listed in a transformer above
], remainder='passthrough')
pipeline = Pipeline([
    ('preprocessing', preprocessing),
    # Choose your model and put it here
    ('model', XGBClassifier())
])
grid = {
    # Use the 'model__' prefix followed by the hyperparameter name
    # ('model' is the step name given in the Pipeline above)
    'model__n_estimators':[100, 150]
}
# No cv/scoring arguments are passed, so GridSearchCV's defaults apply.
pipeline_cv = GridSearchCV(pipeline, grid)
pipeline_cv.fit(X_train, y_train)
# Score on the training split first, then on the held-out test split.
print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))