In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset=pd.read_csv("Social_Network_Ads.csv")

In [3]:
dataset=pd.get_dummies(dataset,drop_first=True)

In [4]:
dataset=dataset.drop("User ID",axis=1)

In [5]:
dataset

Unnamed: 0,Age,EstimatedSalary,Purchased,Gender_Male
0,19,19000,0,1
1,35,20000,0,1
2,26,43000,0,0
3,27,57000,0,0
4,19,76000,0,1
...,...,...,...,...
395,46,41000,1,0
396,51,23000,1,1
397,50,20000,1,0
398,36,33000,0,1


In [6]:
dataset["Purchased"].value_counts()

0    257
1    143
Name: Purchased, dtype: int64

In [7]:
indep=dataset[["Age","EstimatedSalary","Gender_Male"]]
dep=dataset[["Purchased"]]

In [8]:
indep.shape

(400, 3)

In [9]:
dep

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0
...,...
395,1
396,1
397,1
398,0


In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(indep, dep, test_size = 1/3, random_state = 0)

In [11]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
from sklearn.model_selection import GridSearchCV
param_grid = {'criterion':['gini','entropy'],'max_features': ['auto','sqrt','log2'],'n_estimators':[10,100]}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit = True, verbose = 3,n_jobs=-1,scoring='f1_weighted')
grid.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    1.4s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
           

In [21]:
re=grid.cv_results_
grid_predictions = grid.predict(X_test)

In [22]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, grid_predictions)

In [23]:
print("The Confusion Matrix:",cm)

The Confusion Matrix: [[79  6]
 [ 4 45]]


In [24]:
from sklearn.metrics import classification_report
clf_report = classification_report(y_test, grid_predictions)

In [25]:
print("The Report:",clf_report)

The Report:               precision    recall  f1-score   support

           0       0.95      0.93      0.94        85
           1       0.88      0.92      0.90        49

    accuracy                           0.93       134
   macro avg       0.92      0.92      0.92       134
weighted avg       0.93      0.93      0.93       134



In [26]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test,grid.predict_proba(X_test)[:,1])

0.9696278511404561

In [27]:
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.034656,0.004986,0.005332,0.001886268,gini,auto,10,"{'criterion': 'gini', 'max_features': 'auto', ...",0.843254,0.865289,0.861245,0.856496,0.009611,11
1,0.239929,0.003265,0.025326,0.001885706,gini,auto,100,"{'criterion': 'gini', 'max_features': 'auto', ...",0.843254,0.876797,0.909091,0.876132,0.026931,6
2,0.025327,0.001887,0.010663,0.00679527,gini,sqrt,10,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.829715,0.88822,0.907497,0.874802,0.033182,7
3,0.221268,0.013195,0.023993,0.003265458,gini,sqrt,100,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.866667,0.876797,0.897334,0.880164,0.012764,5
4,0.027992,0.003264,0.005332,0.001885425,gini,log2,10,"{'criterion': 'gini', 'max_features': 'log2', ...",0.855026,0.887741,0.881672,0.874664,0.014258,8
5,0.211938,0.003265,0.025326,0.001884919,gini,log2,100,"{'criterion': 'gini', 'max_features': 'log2', ...",0.866667,0.865865,0.920734,0.884288,0.025628,3
6,0.02666,0.001885,0.003999,7.786718e-07,entropy,auto,10,"{'criterion': 'entropy', 'max_features': 'auto...",0.853824,0.852792,0.897334,0.867877,0.020716,9
7,0.222601,0.001886,0.02266,0.001884807,entropy,auto,100,"{'criterion': 'entropy', 'max_features': 'auto...",0.878187,0.876797,0.909091,0.887951,0.014875,2
8,0.027991,0.003262,0.004683,0.0009674107,entropy,sqrt,10,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.831336,0.854397,0.872144,0.852465,0.016745,12
9,0.218602,0.00377,0.02266,0.00188492,entropy,sqrt,100,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.878187,0.854934,0.932275,0.888388,0.03229,1
