In [1]:
import pandas as pd

dataset = pd.read_csv("Social_Network_Ads.csv")
dataset

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [2]:
dataset = dataset.drop("User ID", axis = 1)
dataset = pd.get_dummies(dataset, drop_first = True, dtype = int)
dataset.columns

Index(['Age', 'EstimatedSalary', 'Purchased', 'Gender_Male'], dtype='object')

In [3]:
independent = dataset.drop(columns = ['Purchased'])
dependent = dataset[['Purchased']]
dependent.value_counts()

Purchased
0            257
1            143
Name: count, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(independent, dependent, test_size = 0.30, random_state = 0) 
x_train, x_test, y_train, y_test

(     Age  EstimatedSalary  Gender_Male
 92    26            15000            1
 223   60           102000            1
 234   38           112000            0
 232   40           107000            1
 377   42            53000            0
 ..   ...              ...          ...
 323   48            30000            0
 192   29            43000            1
 117   36            52000            1
 47    27            54000            0
 172   26           118000            0
 
 [280 rows x 3 columns],
      Age  EstimatedSalary  Gender_Male
 132   30            87000            1
 309   38            50000            0
 341   35            75000            1
 196   30            79000            0
 246   35            50000            0
 ..   ...              ...          ...
 216   49            65000            1
 259   45           131000            0
 49    31            89000            0
 238   46            82000            0
 343   47            51000            0
 
 [120 rows 

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [6]:
def random_forest_model(criterion_param, max_features_param, random_state_param): 
    try:
        classifier = RandomForestClassifier(criterion = criterion_param, max_features = max_features_param, random_state = random_state_param)
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        clf_report = accuracy_score(y_test, y_pred)
        print(str(cm) + ": Criterion = " + criterion_param + " max_features = " + str(max_features_param) + " random_state = " + str(random_state_param))
        print("Accuracy: {:.4f}%".format(clf_report))
    except Exception as e:
        print(e)

In [7]:
criterion_list = ['gini','entropy','log_loss']
max_features_list = [None,'sqrt','log2']
random_state_list = [None, 0, 42]

for criterion in criterion_list:
    for max_features in max_features_list:
        for random_state in random_state_list:
            random_forest_model(criterion, max_features, random_state)

[[75  4]
 [ 4 37]]: Criterion = gini max_features = None random_state = None
Accuracy: 0.9333%
[[74  5]
 [ 4 37]]: Criterion = gini max_features = None random_state = 0
Accuracy: 0.9250%
[[74  5]
 [ 4 37]]: Criterion = gini max_features = None random_state = 42
Accuracy: 0.9250%
[[74  5]
 [ 5 36]]: Criterion = gini max_features = sqrt random_state = None
Accuracy: 0.9167%
[[73  6]
 [ 5 36]]: Criterion = gini max_features = sqrt random_state = 0
Accuracy: 0.9083%
[[72  7]
 [ 5 36]]: Criterion = gini max_features = sqrt random_state = 42
Accuracy: 0.9000%
[[73  6]
 [ 5 36]]: Criterion = gini max_features = log2 random_state = None
Accuracy: 0.9083%
[[73  6]
 [ 5 36]]: Criterion = gini max_features = log2 random_state = 0
Accuracy: 0.9083%
[[72  7]
 [ 5 36]]: Criterion = gini max_features = log2 random_state = 42
Accuracy: 0.9000%
[[74  5]
 [ 4 37]]: Criterion = entropy max_features = None random_state = None
Accuracy: 0.9250%
[[74  5]
 [ 5 36]]: Criterion = entropy max_features = None ra

In [8]:
classifier = RandomForestClassifier(criterion = 'entropy', max_features = None, random_state = None)
classifier = classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

cm = confusion_matrix(y_test, y_pred)
print(cm)

clf_report = classification_report(y_test, y_pred)
print(clf_report)

[[74  5]
 [ 4 37]]
              precision    recall  f1-score   support

           0       0.95      0.94      0.94        79
           1       0.88      0.90      0.89        41

    accuracy                           0.93       120
   macro avg       0.91      0.92      0.92       120
weighted avg       0.93      0.93      0.93       120



In [9]:
classifier.predict([[40,30000,1]])

array([0], dtype=int64)

In [11]:
import pickle as pck
filename = "random_forest_model.sav"
pck.dump(classifier, open(filename, 'wb'))