In [1]:
import pandas as pd

# Read the social-network ads dataset from the sibling Data directory.
ads = pd.read_csv(filepath_or_buffer="../Data/Social_Network_Ads.csv")

# Preview the first five rows as the cell output.
ads.head(n=5)

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [2]:
from sklearn.model_selection import train_test_split

# Predictors: Gender, Age and EstimatedSalary, dummy-encoded with the first
# level of each encoded column dropped. Target: the Purchased column.
X = pd.get_dummies(
    data=ads[["Gender", "Age", "EstimatedSalary"]],
    drop_first=True,
)
y = ads.loc[:, "Purchased"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2023,
)

In [3]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test data never influences the scaling.
std = StandardScaler()
std.fit(X_train)

X_train_std = std.transform(X_train)
X_test_std = std.transform(X_test)

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Build each base learner and train it on the standardized training data.
# `fit` returns the estimator itself, so construction and fitting are chained.
dt = DecisionTreeClassifier(min_samples_split=2, max_depth=2).fit(X_train_std, y_train)
lr = LogisticRegression(solver="saga", penalty="l1", C=3).fit(X_train_std, y_train)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_std, y_train)

# Display the fitted KNN estimator as the cell output.
knn

In [6]:
from sklearn.ensemble import VotingClassifier

# Fresh base learners to hand to the ensemble.
lr = LogisticRegression(solver="saga", penalty="l1", C=3)
dt = DecisionTreeClassifier(min_samples_split=2, max_depth=2)
knn = KNeighborsClassifier(n_neighbors=5)

# Hard voting: the ensemble is configured with voting="hard" over the three learners.
model = VotingClassifier(
    voting="hard",
    estimators=[("lr", lr), ("dt", dt), ("knn", knn)],
).fit(X_train_std, y_train)

# Accuracy on the standardized test split (cell output).
model.score(X_test_std, y_test)

0.8625

In [7]:
# A hard-voting ensemble exposes no class probabilities, so this call raises
# AttributeError. Catch and display the error so the demonstration does not
# stop a top-to-bottom run of the notebook.
try:
    model.predict_proba(X_test_std)
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: predict_proba is not available when voting='hard'

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Same three base learners as before, re-created unfitted for the new ensemble.
dt = DecisionTreeClassifier(min_samples_split=2, max_depth=2)
lr = LogisticRegression(solver="saga", penalty="l1", C=3)
knn = KNeighborsClassifier(n_neighbors=5)

# Soft voting: the ensemble is configured with voting="soft" this time.
model = VotingClassifier(
    voting="soft",
    estimators=[("lr", lr), ("dt", dt), ("knn", knn)],
).fit(X_train_std, y_train)

# Accuracy on the standardized test split (cell output).
model.score(X_test_std, y_test)

0.875

In [9]:
# Fit the standalone `knn` estimator on the standardized training data and
# rebind the name to whatever `fit` returns. Assigning the result (rather than
# ending the cell with a bare call) keeps the cell from printing an output.
knn = knn.fit(X_train_std, y_train)


In [10]:
# Predicted class probabilities for the first five test rows; this works here
# because `model` was rebuilt with voting='soft'.
model.predict_proba(X_test_std)[:5]

array([[0.1021139 , 0.8978861 ],
       [0.1958674 , 0.8041326 ],
       [0.90687734, 0.09312266],
       [0.98380444, 0.01619556],
       [0.03279746, 0.96720254]])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline forest: library-default hyperparameters with a fixed seed, trained
# on the unscaled training features. `fit` returns the estimator itself.
rf = RandomForestClassifier(random_state=2023).fit(X_train, y_train)

# Compare accuracy on the data the forest was trained on with the held-out split.
print(f"Training Accuracy: {rf.score(X_train, y_train)}")
print(f"Test Accuracy: {rf.score(X_test, y_test)}")

In [None]:
import seaborn as sns

# Horizontal bars: one per feature column, bar length = the forest's importance
# score. The trailing semicolon suppresses the Axes repr in the cell output.
sns.barplot(y=X.columns, x=rf.feature_importances_, color="blue");

In [None]:
# Raw importance scores of the fitted forest, shown as the cell output.
# The plotting cell pairs these values with X.columns.
rf.feature_importances_

In [None]:
import numpy as np

# Candidate tree counts 100, 200, ..., 1000 (the stop value 1100 is excluded).
np.arange(100, 1100, 100)

In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Template forest for the search; n_jobs=-1 lets each forest train in parallel.
rf = RandomForestClassifier(random_state=2023, n_jobs=-1)

# Hyperparameters that are valid whatever the bootstrap setting is.
shared_params = {
    'n_estimators': np.arange(start=100, stop=1100, step=100),
    'max_features': [None, "sqrt"],
    'max_depth': np.arange(start=1, stop=11, step=1),
    'min_samples_leaf': [2, 5, 10, 20, 100],
}

# scikit-learn raises a ValueError when max_samples is set while bootstrap=False,
# so a single flat grid would spend many fits on combinations that can only fail.
# Two sub-grids keep exactly the valid combinations of the original grid.
params = [
    {**shared_params, 'bootstrap': [True], 'max_samples': [None, .3, .5, .9]},
    {**shared_params, 'bootstrap': [False], 'max_samples': [None]},
]

# Exhaustive search with GridSearchCV's default cross-validation and scoring.
grid = GridSearchCV(rf, params)

grid.fit(X_train, y_train)

# Best hyperparameter combination found (cell output).
grid.best_params_

In [None]:
# Forest with hand-entered hyperparameters (bootstrap disabled, shallow trees,
# at least 10 samples per leaf), trained on the unscaled training features.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    min_samples_leaf=10,
    max_features="sqrt",
    bootstrap=False,
    max_samples=None,
    random_state=2023,
).fit(X_train, y_train)

# Compare accuracy on the training data with the held-out split.
print(f"Training Accuracy: {rf.score(X_train, y_train)}")
print(f"Test Accuracy: {rf.score(X_test, y_test)}")

In [None]:
# Raw importance scores of the most recently fitted forest (cell output).
rf.feature_importances_

In [None]:
# Horizontal bar chart of the forest's importance scores, one bar per column of X.
# There is no trailing semicolon, so the returned Axes repr is shown with the figure.
sns.barplot(x=rf.feature_importances_, y=X.columns, color="blue")

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Template forest handed to the search.
rf = RandomForestClassifier(random_state=2023)

# Hyperparameters that are valid whatever the bootstrap setting is.
shared_params = {
    'n_estimators': np.arange(start=100, stop=1100, step=100),
    'max_features': [None, "sqrt"],
    'max_depth': np.arange(start=1, stop=11, step=1),
    'min_samples_leaf': [2, 5, 10, 20, 100],
}

# scikit-learn raises a ValueError when max_samples is set while bootstrap=False,
# so candidates are drawn only from the valid combinations.
params = [
    {**shared_params, 'bootstrap': [True], 'max_samples': [None, .3, .5, .9]},
    {**shared_params, 'bootstrap': [False], 'max_samples': [None]},
]

# Evaluate 100 randomly drawn candidates; seeding the search makes the drawn
# candidates (and therefore the reported best parameters) reproducible.
grid = RandomizedSearchCV(
    rf,
    params,
    n_iter=100,
    scoring="accuracy",
    random_state=2023,
)

grid.fit(X_train, y_train)

# Best hyperparameter combination among the sampled candidates (cell output).
grid.best_params_

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Template forest handed to the search; only the search object is fitted below.
rf = RandomForestClassifier(random_state=2023)

# Hyperparameters that are valid whatever the bootstrap setting is.
shared_params = {
    'n_estimators': np.arange(start=100, stop=1100, step=100),
    'max_features': [None, "sqrt"],
    'max_depth': np.arange(start=1, stop=11, step=1),
    'min_samples_leaf': [2, 5, 10, 20, 100],
}

# max_samples may only be set when bootstrap=True (scikit-learn raises a
# ValueError otherwise), so the invalid pairings are left out of the search space.
params = [
    {**shared_params, 'bootstrap': [True], 'max_samples': [None, .3, .5, .9]},
    {**shared_params, 'bootstrap': [False], 'max_samples': [None]},
]

# 100 randomly drawn candidates scored by accuracy; the fixed random_state makes
# the draw repeatable across runs.
grid = RandomizedSearchCV(
    rf,
    params,
    n_iter=100,
    scoring="accuracy",
    random_state=2023,
)

grid.fit(X_train, y_train)

# Best hyperparameter combination among the sampled candidates (cell output).
grid.best_params_

In [None]:
# One-time setup, left commented out: uncomment to install shap into the environment.
#!conda install shap --y

In [None]:
import shap

# `rf` was last rebound to the unfitted template passed to the randomized
# search, so explaining it would fail on a top-to-bottom run. Use the fitted
# best model that the search refit on the training data instead.
best_rf = grid.best_estimator_

# One fixed 50-row sample of the test set serves both as the explainer's
# background data and as the rows to explain.
shap_sample = X_test.sample(50, random_state=12345)

explainer = shap.Explainer(best_rf, shap_sample)

shap_values = explainer(shap_sample)

# Keep index 1 of the last axis; presumably that is the Purchased == 1 class -
# TODO confirm against the explainer's output layout.
shap.plots.beeswarm(shap_values[:, :, 1])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Baseline gradient-boosting model. The settings are passed as keyword
# arguments; dict-style `'name': value` entries are a syntax error inside a call.
gb = GradientBoostingClassifier(
    random_state=2023,
    n_estimators=100,
    learning_rate=.01,
    max_features="sqrt",
    subsample=1.0,
    max_depth=3,
    min_samples_leaf=1,
)

gb.fit(X_train, y_train)

# Compare accuracy on the training data with the held-out split.
print(f"Training Accuracy: {gb.score(X_train, y_train)}")
print(f"Test Accuracy: {gb.score(X_test, y_test)}")

In [None]:
# Unfitted gradient-boosting model with explicit settings. They are passed as
# keyword arguments; dict-style `'name': value` entries do not parse in a call.
gb = GradientBoostingClassifier(
    random_state=2023,
    n_estimators=100,
    learning_rate=.01,
    max_features="sqrt",
    subsample=1.0,
    max_depth=3,
    min_samples_leaf=1,
)

In [None]:
import numpy as np

from sklearn.model_selection import RandomizedSearchCV

# Template booster handed to the search.
gb = GradientBoostingClassifier(random_state=2023)

# Coarse search space covering ensemble size, step size, and tree shape.
params = {
    'n_estimators': np.arange(start=100, stop=1100, step=100),
    'learning_rate': [.001, .01, .1, .2],
    'max_features': [None, "sqrt"],
    'subsample': [.8, .9, 1.0],
    'max_depth': np.arange(start=3, stop=12, step=2),
    'min_samples_leaf': [2, 5, 10]
}

# Evaluate 100 randomly drawn candidates by accuracy. Seeding the search itself
# (not just the estimator) makes the drawn candidates, and so the reported best
# parameters, reproducible from run to run.
grid = RandomizedSearchCV(
    gb,
    params,
    n_iter=100,
    scoring="accuracy",
    random_state=2023,
)

grid.fit(X_train, y_train)

# Best hyperparameter combination among the sampled candidates (cell output).
grid.best_params_

In [None]:
import numpy as np

from sklearn.model_selection import GridSearchCV

# Template booster handed to the exhaustive search.
gb = GradientBoostingClassifier(random_state=2023)

# Narrow grid: tree counts 350..440 in steps of 10 plus a few values for the
# step size, feature subsetting, row subsampling, and tree shape.
# NOTE(review): .25 sits far from the other learning rates - confirm it is
# intended and not a typo for a smaller value.
params = dict(
    n_estimators=np.arange(350, 450, 10),
    learning_rate=[.005, .01, .015, .25],
    max_features=[None, "sqrt"],
    subsample=[.9, .95, 1.0],
    max_depth=[2, 3, 4],
    min_samples_leaf=[1, 2, 3],
)

# Every combination in the grid is scored by accuracy.
grid = GridSearchCV(gb, params, scoring="accuracy")
grid.fit(X_train, y_train)

# Best hyperparameter combination found (cell output).
grid.best_params_

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Rebuild the booster from the search's winning hyperparameters and train it
# on the training split. `fit` returns the estimator itself.
gb = GradientBoostingClassifier(random_state=2023, **grid.best_params_).fit(X_train, y_train)

# Compare accuracy on the training data with the held-out split.
print(f"Training Accuracy: {gb.score(X_train, y_train)}")
print(f"Test Accuracy: {gb.score(X_test, y_test)}")

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Booster with its hyperparameters written out explicitly, trained on the
# training split. `fit` returns the estimator itself.
gb = GradientBoostingClassifier(
    random_state=2023,
    n_estimators=350,
    learning_rate=.01,
    subsample=.95,
    max_depth=3,
    min_samples_leaf=2,
    max_features=None,
).fit(X_train, y_train)

# Compare accuracy on the training data with the held-out split.
print(f"Training Accuracy: {gb.score(X_train, y_train)}")
print(f"Test Accuracy: {gb.score(X_test, y_test)}")

In [None]:
# Cast the Gender_Male dummy column to integers. Building a new frame with
# `assign` (instead of writing a column into the split-derived frame) avoids
# pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
X_test = X_test.assign(Gender_Male=X_test["Gender_Male"].astype("int"))

In [None]:
import shap

# Build an explainer for the fitted booster, handing it the test features
# as its second argument.
explainer = shap.Explainer(gb, X_test)

# Explain a seeded random sample of 80 test rows.
shap_values = explainer(X_test.sample(n=80, random_state=2023))

# Wrap the raw SHAP array in a DataFrame so each column carries its feature name.
shap_df = pd.DataFrame(data=shap_values.values, columns=X_test.columns)

# Preview the first five rows (cell output).
shap_df.head(n=5)

In [None]:
# Waterfall plot for a single explained row: position 1 of `shap_values`.
shap.plots.waterfall(shap_values[1])

In [None]:
# The SHAP values were computed on a random sample of X_test, whose rows are
# not in X_test's order. Take the feature values from the explanation object
# itself so every row of features lines up with its SHAP values.
shap.plots.force(
    base_value=shap_values.base_values,
    shap_values=shap_values.values,
    features=pd.DataFrame(shap_values.data, columns=X_test.columns),
)

In [None]:
# Scatter plot for feature column 1 (salary), with the points colored by
# feature column 0 (age).
shap.plots.scatter(shap_values[:, 1], color=shap_values[:, 0])

In [None]:
# Beeswarm summary of the SHAP values across all explained rows and features.
shap.plots.beeswarm(shap_values)