## Use Case 1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style(style='darkgrid')
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
heart_disease = pd.read_csv('datasets/heart-disease.csv')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# extracting the dependent and independent variables
# Using another way to extract dependent and independent variables
'''X = heart_disease.iloc[:, :-1].values # independent Variables (Every other column except Profit Column)
y = heart_disease.iloc[:, 4].values # dependent Variables (Profit)'''
x = heart_disease.drop('target', axis=1)
y = heart_disease['target']

In [4]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier() #create an instance of Random Forest
# We'll keep the default parameters

In [5]:
# Fit the model to the training data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


In [6]:
clf.fit(x_train, y_train)

In [7]:
# make prediction
y_pred = clf.predict(x_test)
y_pred

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1], dtype=int64)

In [8]:
y_test

256    0
174    0
202    0
122    1
21     1
      ..
214    0
92     1
48     1
90     1
39     1
Name: target, Length: 61, dtype: int64

In [9]:
# Evaluate the model
clf.score(x_train, y_train)

1.0

In [10]:
clf.score(x_test, y_test)

0.8524590163934426

In [11]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print('Classification report ::\n',classification_report(y_test, y_pred))
print('Confusion Matrix ::\n',confusion_matrix(y_test, y_pred),'\n')
print('Accuracy Score ::\n',accuracy_score(y_test, y_pred))

Classification report ::
               precision    recall  f1-score   support

           0       0.93      0.79      0.86        34
           1       0.78      0.93      0.85        27

    accuracy                           0.85        61
   macro avg       0.86      0.86      0.85        61
weighted avg       0.86      0.85      0.85        61

Confusion Matrix ::
 [[27  7]
 [ 2 25]] 

Accuracy Score ::
 0.8524590163934426


In [12]:
# Lets try to improve the model
# Try different amount of n_estimatore
np.random.seed(42)
for i in range(10, 100, 20):
    print(f'Trying this model with {i} estimators...')
    clf = RandomForestClassifier(n_estimators=i)
    clf.fit(x_train, y_train)
    print(f'Model Accuracy Score on this test set is :: {clf.score(x_test, y_test) * 100:.2f}%\n')

Trying this model with 10 estimators...
Model Accuracy Score on this test set is :: 75.41%

Trying this model with 30 estimators...
Model Accuracy Score on this test set is :: 85.25%

Trying this model with 50 estimators...
Model Accuracy Score on this test set is :: 85.25%

Trying this model with 70 estimators...
Model Accuracy Score on this test set is :: 81.97%

Trying this model with 90 estimators...
Model Accuracy Score on this test set is :: 83.61%



## Use Case 2 - Use case using data with string values in them
Where you have to convert the string values to numbers

In [15]:
car_sales = pd.read_csv('Datasets/car-sales-extended.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [17]:
# length of the dataset
len(car_sales)

1000

In [19]:
# Datatypes of the dataset
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [20]:
# extracting the dependent and independent variables
# Using another way to extract dependent and independent variables
'''X = car_sales.iloc[:, :-1].values # independent Variables (Every other column except Profit Column)
y = car_sales.iloc[:, 4].values # dependent Variables (Profit)'''
xx = car_sales.drop('Price', axis=1)
yy = car_sales['Price']

In [32]:
# Encoding categorical data
# Scikit Learn cannot work with Strings so we convert those strings to numbers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
categorical_features = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, categorical_features)],remainder='passthrough')
transformed_xx = transformer.fit_transform(xx)
transformed_xx
'''ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = np.array(ct.fit_transform(X))'''

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [34]:
pd.DataFrame(transformed_xx)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [41]:
# Another way we can do this instead of using the OneCodeEncoder
dummies = pd.get_dummies(car_sales[['Make','Colour','Doors']])
dummies
# I keep getthing True or False instead of 0s and 1s

Unnamed: 0,Doors,Make_BMW,Make_Honda,Make_Nissan,Make_Toyota,Colour_Black,Colour_Blue,Colour_Green,Colour_Red,Colour_White
0,4,False,True,False,False,False,False,False,False,True
1,5,True,False,False,False,False,True,False,False,False
2,4,False,True,False,False,False,False,False,False,True
3,4,False,False,False,True,False,False,False,False,True
4,3,False,False,True,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...
995,4,False,False,False,True,True,False,False,False,False
996,3,False,False,True,False,False,False,False,False,True
997,4,False,False,True,False,False,True,False,False,False
998,4,False,True,False,False,False,False,False,False,True


In [42]:
# Fit the model to the training data
from sklearn.model_selection import train_test_split

xx_train, xx_test, yy_train, yy_test = train_test_split(transformed_xx, yy, test_size=0.2)


In [43]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor() #create an instance of Random Forest
# We'll keep the default parameters

In [44]:
model.fit(xx_train, yy_train)

In [45]:
model.score(xx_test, yy_test)

0.20126033382149655

## Notice
For Random forest model, we may use the following models
> Random Forest Classifier
> 
> Random Forest Regressor

Random Forest Classifier and Random Forest Regressor are both ensemble learning methods based on decision trees. The classifier is used for classification tasks, where it predicts the class of an input, while the regressor is used for regression tasks, where it predicts a continuous value. They work by constructing multiple decision trees during training and outputting the mode or mean prediction of the individual trees, respectively, during inference.