<a href="https://colab.research.google.com/github/GArdennes/GArdennes/blob/main/Random_Forests.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Due to the drawback of Decision trees we consider an upgrade to decision trees called Random Forests. Decision trees easily overfit and with reach result have high variance in the results unless a seed is used. Although decision trees are superior to logical regression models in that they do not carry assumptions into processing the data. Logical regression models assume the data can be split into two exact categories which is not the case for all data. Random forests contain multiple trees.

In [None]:
#bootstrap sampling to create multiple datasets of same size and reduce overfitting
#bagging (applying an ensemble classifier like Random forest) <- aggregating(applying classifier model) <- bootstrapping(resampling) <- orginal data
#at the aggregating of the data we go through decorrelation of the decision tree classifer, 
#this means adding restrictions to the trees to introduce variances thus making the decision trees more random.

In [None]:
#using Random Forest
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset into a DataFrame and append the label column.
cancer_data = load_breast_cancer()
df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names'])
df['target'] = cancer_data['target']

# Feature matrix and label vector as NumPy arrays.
X = df[cancer_data.feature_names].values
y = df['target'].values

# The split seed affects the measured accuracy; k-fold cross-validation was
# suggested as a way to get more reliable values.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)

# Seed the forest as well: bootstrap sampling and per-split feature selection
# are random, so an unseeded forest reports a different accuracy on every run.
rf = RandomForestClassifier(random_state=111)
rf.fit(X_train, y_train)
print("random forest accuracy:", rf.score(X_test, y_test))

----------------------------------------------------BREAKAWAYOP-------------------------------------------------------------------------------------

------------------------------------------------------BREAKAWAYCL-------------------------------------------------------------------------------------

In [None]:
#using Decision Trees
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset into a DataFrame and append the label column.
cancer_data = load_breast_cancer()
df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names'])
df['target'] = cancer_data['target']

# Feature matrix and label vector as NumPy arrays.
X = df[cancer_data.feature_names].values
y = df['target'].values

# Same split seed as the random forest cell so the two scores are comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)

# Seed the tree: scikit-learn permutes the candidate features at every split,
# so an unseeded tree can produce a different score on each run.
dt = DecisionTreeClassifier(random_state=111)
dt.fit(X_train, y_train)
print("decision tree accuracy:", dt.score(X_test, y_test))

In [None]:
#n_estimators (the number of trees)
#max_features (the number of features to consider at each split)
#rf = RandomForestClassifier(max_features=5) default
#rf = RandomForestClassifier(n_estimators=15) default
#Grid search to implement k-fold cross validation
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Load the breast cancer dataset into a DataFrame and append the label column.
cancer_data = load_breast_cancer()
df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names'])
df['target'] = cancer_data['target']

# Feature matrix and label vector as NumPy arrays.
X = df[cancer_data.feature_names].values
y = df['target'].values

# Candidate forest sizes to compare with cross-validation.
param_grid = {
    'n_estimators': [10, 25, 50, 75, 100],
}

# The split seed affects the measured score; the grid search below uses k-fold
# cross-validation on the training part to get more reliable values.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)

# Seed the forest so the cross-validated comparison between grid points (and
# therefore best_params_) is reproducible from run to run.
rf = RandomForestClassifier(random_state=111)
gs = GridSearchCV(rf, param_grid, scoring='f1', cv=5)  # cv=5 for 5-fold cross validation
gs.fit(X_train, y_train)
print("best params:", gs.best_params_)
# gs.score() uses the scorer passed as `scoring` (F1 here), not accuracy, so it
# is reported under its real name; accuracy comes from the refit best estimator.
print("random forest f1 score:", gs.score(X_test, y_test))
print("random forest accuracy:", gs.best_estimator_.score(X_test, y_test))

In [None]:
#feature_importance helps us determine in what measure the features contribute to the target
#the higher the number the greater the contribution
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Load the breast cancer dataset into a DataFrame and append the label column.
cancer_data = load_breast_cancer()
df = pd.DataFrame(cancer_data['data'], columns=cancer_data['feature_names'])
df['target'] = cancer_data['target']

# Feature matrix and label vector as NumPy arrays.
X = df[cancer_data.feature_names].values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=101)

# Seed the forest: with only 10 trees the importance ranking shifts noticeably
# between unseeded runs, which makes the printed top-10 unreliable.
rf = RandomForestClassifier(n_estimators=10, random_state=111)
rf.fit(X_train, y_train)

# Pair each importance with its feature name and list the largest first.
ft_imp = pd.Series(rf.feature_importances_, index=cancer_data.feature_names).sort_values(ascending=False)
print(ft_imp.head(10))

In [None]:
#feature selection enables us build relevant subsets of the data thus neglecting data noise
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Build the labelled DataFrame in one expression: features first, label last.
cancer_data = load_breast_cancer()
df = pd.DataFrame(
    data=cancer_data.data,
    columns=cancer_data.feature_names,
).assign(target=cancer_data.target)

# Full feature matrix and labels as NumPy arrays.
X = df.loc[:, cancer_data.feature_names].to_numpy()
y = df['target'].to_numpy()

# Keep only the columns whose name contains 'worst'.
worst_cols = list(filter(lambda name: 'worst' in name, df.columns))
X_worst = df.loc[:, worst_cols]

# Split on the reduced feature set, then fit and score a seeded 10-tree forest.
X_train, X_test, y_train, y_test = train_test_split(X_worst, y, random_state=101)
rf = RandomForestClassifier(random_state=111, n_estimators=10).fit(X_train, y_train)
print("random forest accuracy:",rf.score(X_test, y_test))

Comparing Logistic Regression to Random Forests on a generated dataset

In [None]:
from sklearn.datasets import make_circles
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Synthetic two-class dataset generated by make_circles (seeded).
X, y = make_circles(noise=0.2, factor=0.5, random_state=1)

# 5-fold cross-validation with a seeded shuffle; both models see the same folds.
kf = KFold(n_splits=5, shuffle=True, random_state=1)
lr_scores = []
rf_scores = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    lr = LogisticRegression(solver='lbfgs')
    lr.fit(X_train, y_train)
    lr_scores.append(lr.score(X_test, y_test))

    # Seed the forest so the mean RF accuracy is reproducible; the data and the
    # folds are already seeded, so this was the only remaining randomness.
    rf = RandomForestClassifier(n_estimators=100, random_state=1)
    rf.fit(X_train, y_train)
    rf_scores.append(rf.score(X_test, y_test))

# Report the mean accuracy over the five folds for each model.
print("LR accuracy:", np.mean(lr_scores))
print("RF accuracy:", np.mean(rf_scores))