# Random Forest Classifier

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the Social Network Ads dataset from the working directory and echo
# the frame so the notebook renders a preview of it.
ads_csv_path = "Social_Network_Ads.csv"
df = pd.read_csv(ads_csv_path)
df

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0
...,...,...,...,...,...
395,15691863,Female,46,41000,1
396,15706071,Male,51,23000,1
397,15654296,Female,50,20000,1
398,15755018,Male,36,33000,0


In [4]:
# Count missing values per column (isna is the same method as isnull).
df.isna().sum()

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64

In [5]:
# Remove the 'User ID' column before modelling (presumably a row identifier
# rather than a feature -- confirm against the data dictionary).
# errors='ignore' keeps this cell idempotent: because `df` is overwritten in
# place, re-running the cell would otherwise raise KeyError once the column
# has already been dropped.
df = df.drop(columns=['User ID'], errors='ignore')

In [6]:
# Preview the first five rows after dropping the identifier column.
df.head(n=5)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace the Gender strings with integer codes. The encoder is kept in `lb`
# so the fitted string <-> code mapping stays available afterwards.
lb = LabelEncoder()
gender_codes = lb.fit_transform(df['Gender'])
df['Gender'] = gender_codes
df.head(n=3)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,1,19,19000,0
1,1,35,20000,0
2,0,26,43000,0


In [8]:
# Split the frame into the feature matrix (everything except 'Purchased')
# and the target vector ('Purchased').
x, y = df.drop(columns='Purchased'), df['Purchased']

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: an untuned 1000-tree forest evaluated on the held-out test split.
# random_state is fixed so the reported accuracy is reproducible on
# Restart & Run All, and so this baseline is seeded the same way as the
# tuned forests elsewhere in the notebook (which all use random_state=42).
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy*100:.2f}%")

Accuracy: 90.00%


In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values for each random-forest hyperparameter; the search below
# samples combinations from these lists.
param_dist = dict(
    n_estimators=[200, 500, 1000],
    max_depth=[None, 10, 20, 30, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],
)

# Seeded base estimator; the search clones it for every candidate.
rf = RandomForestClassifier(random_state=42)

# 20 sampled candidates x 5-fold cross-validation, using all CPU cores.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=20,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(x_train, y_train)

print("Best Parameters:", random_search.best_params_)

# Score the refitted best candidate on the held-out test split.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(x_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Tuned Accuracy: {accuracy*100:.2f}%")

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'n_estimators': 1000, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 30, 'bootstrap': True}
Tuned Accuracy: 92.50%


# Click Dataset

In [13]:
# Load the ad-click dataset (this rebinds `df`, replacing the previous
# dataset) and preview its first three rows.
click_csv_path = "click.csv"
df = pd.read_csv(click_csv_path)
df.head(n=3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Timestamp,Clicked on Ad
0,62.26,32.0,69481.85,172.83,Decentralized real-time circuit,Lisafort,Male,Svalbard & Jan Mayen Islands,2016-06-09 21:43:05,0
1,41.73,31.0,61840.26,207.17,Optional full-range projection,West Angelabury,Male,Singapore,2016-01-16 17:56:05,0
2,44.4,30.0,57877.15,172.83,Total 5thgeneration standardization,Reyesfurt,Female,Guadeloupe,2016-06-29 10:50:45,0


In [14]:
# Count missing values per column (isna is the same method as isnull).
df.isna().sum()

Daily Time Spent on Site    0
Age                         0
Area Income                 0
Daily Internet Usage        0
Ad Topic Line               0
City                        0
Gender                      0
Country                     0
Timestamp                   0
Clicked on Ad               0
dtype: int64

In [15]:
# Remove the raw 'Timestamp' column; no later cell in this notebook uses it.
# errors='ignore' keeps this cell idempotent: because `df` is overwritten in
# place, re-running the cell would otherwise raise KeyError once the column
# has already been dropped.
df = df.drop(columns=['Timestamp'], errors='ignore')
df.head(3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Clicked on Ad
0,62.26,32.0,69481.85,172.83,Decentralized real-time circuit,Lisafort,Male,Svalbard & Jan Mayen Islands,0
1,41.73,31.0,61840.26,207.17,Optional full-range projection,West Angelabury,Male,Singapore,0
2,44.4,30.0,57877.15,172.83,Total 5thgeneration standardization,Reyesfurt,Female,Guadeloupe,0


In [16]:
# Column groups routed to the numeric and categorical preprocessing branches.
numeric = [
    'Daily Time Spent on Site',
    'Age',
    'Area Income',
    'Daily Internet Usage',
]
categoric = [
    'Ad Topic Line',
    'City',
    'Gender',
    'Country',
]


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder

# Steps/transformers are written as (name, estimator[, columns]) tuples, the
# form documented by scikit-learn, instead of nested lists. Step names are
# unchanged so any `<step>__<param>` references keep working.

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('Impute', SimpleImputer(strategy='mean')),
    ('Standarize', StandardScaler()),
])
# Backward-compatible alias for the original (misspelled) variable name.
numeric_transfromer = numeric_transformer

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are encoded as all zeros rather than
# raising an error.
# NOTE(review): 'Ad Topic Line' and 'City' look like near-unique text values
# in the preview rows -- one-hot encoding them may add many sparse columns
# with little signal. Confirm their cardinality before relying on them.
categoric_transformer = Pipeline(steps=[
    ('Impute', SimpleImputer(strategy='most_frequent')),
    ('Encoding', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group (`numeric` / `categoric` lists defined earlier in
# the notebook) to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('Numeric', numeric_transformer, numeric),
    ('Categoric', categoric_transformer, categoric),
])

In [18]:
# Echo the ColumnTransformer so the notebook renders its structure.
preprocessor

In [19]:
# Split the frame into the feature matrix (everything except 'Clicked on Ad')
# and the target vector, then preview the unchanged source frame.
x, y = df.drop(columns='Clicked on Ad'), df['Clicked on Ad']
df.head(n=3)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Gender,Country,Clicked on Ad
0,62.26,32.0,69481.85,172.83,Decentralized real-time circuit,Lisafort,Male,Svalbard & Jan Mayen Islands,0
1,41.73,31.0,61840.26,207.17,Optional full-range projection,West Angelabury,Male,Singapore,0
2,44.4,30.0,57877.15,172.83,Total 5thgeneration standardization,Reyesfurt,Female,Guadeloupe,0


In [20]:
# Hold out 20% of the rows for testing (seeded for reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing on the training split only, then apply the same
# fitted transform to the test split.
x_train_new = preprocessor.fit_transform(x_train)
x_test_new = preprocessor.transform(x_test)

In [None]:
# --- Baseline -------------------------------------------------------------
# Untuned 1000-tree forest trained and scored on the already-preprocessed
# train/test matrices.
rf = RandomForestClassifier(n_estimators=1000, random_state=42)
rf.fit(x_train_new, y_train)

y_pred = rf.predict(x_test_new)
accuracy = accuracy_score(y_test, y_pred)
print(f"Base Accuracy: {accuracy*100:.2f}%")


# --- Hyperparameter search --------------------------------------------------
# Candidate values for each random-forest hyperparameter.
param_dist = {
    'n_estimators': [200, 500, 1000],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None],
    'bootstrap': [True, False]
}

rf = RandomForestClassifier(random_state=42)

# Search over a Pipeline fed with the RAW x_train instead of x_train_new.
# x_train_new was produced by a preprocessor fitted on the whole training
# split, so cross-validating on it scores every validation fold with an
# imputer/scaler/encoder that has already seen that fold's rows (leakage ->
# optimistic CV scores). Inside a Pipeline the preprocessing is refitted on
# the training part of each fold only. The search clones the pipeline, so the
# `preprocessor` object fitted earlier in the notebook is left untouched.
search_pipeline = Pipeline(steps=[
    ('Preprocess', preprocessor),
    ('Model', rf),
])
# Pipeline parameters are addressed as '<step name>__<parameter>'.
search_space = {f"Model__{name}": values for name, values in param_dist.items()}

random_search = RandomizedSearchCV(
    estimator=search_pipeline,
    param_distributions=search_space,
    n_iter=20,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42
)


random_search.fit(x_train, y_train)

# Strip the 'Model__' prefix so the printout shows plain forest parameters.
best_params = {
    name.split("__", 1)[1]: value
    for name, value in random_search.best_params_.items()
}
print("Best Parameters:", best_params)


# The refitted best pipeline handles preprocessing itself, so it predicts from
# the raw test features. `best_rf` remains the fitted forest on its own.
best_pipeline = random_search.best_estimator_
best_rf = best_pipeline.named_steps['Model']
y_pred = best_pipeline.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Tuned Accuracy: {accuracy*100:.2f}%")