In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import os
import warnings
warnings.filterwarnings('ignore')

In [31]:
data = pd.read_csv('winequality-red.csv')
data.head(3)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5


In [32]:
# Classification --> RF algo.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix

In [33]:
data.shape

(1599, 12)

In [34]:
from scipy.stats import zscore # for outliers

In [35]:
z = np.abs(zscore(data))
df1 = data[(z<3).all(axis = 1)]
df1.shape

(1451, 12)

In [36]:
data = df1.copy()

In [37]:
data.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0,1451.0
mean,8.310062,0.52295,0.265382,2.387285,0.081425,15.104755,43.735355,0.99671,3.315934,0.642584,10.421089,5.659545
std,1.646458,0.168531,0.190934,0.862078,0.020966,9.309768,29.441284,0.001716,0.141096,0.129801,1.021588,0.781605
min,5.0,0.12,0.0,1.2,0.038,1.0,6.0,0.9915,2.88,0.33,8.5,4.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,21.0,0.9956,3.22,0.55,9.5,5.0
50%,7.9,0.52,0.25,2.2,0.079,13.0,36.0,0.9967,3.31,0.62,10.2,6.0
75%,9.2,0.63,0.42,2.6,0.089,21.0,58.0,0.9978,3.4,0.72,11.1,6.0
max,13.5,1.04,0.79,6.7,0.226,47.0,145.0,1.0022,3.75,1.16,13.6,8.0


In [38]:
X = data.drop('quality', axis = 1) # it is not binary, it's multiclass (for binary use LogisticRegression.)
Y = data['quality']

In [39]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_sc = sc.fit_transform(X)

In [40]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X_sc, Y, test_size = 0.10, random_state = 2)

In [41]:
# Let's first visulaize the tree on the data without doing any pre processing 

from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, Y_train)

In [42]:
rfc.score(X_train, Y_train)

1.0

In [43]:
rfc.score(X_test, Y_test)

0.7465753424657534

In [44]:
# Overfitting condition occur here.

In [45]:
y_pr = rfc.predict(X_test)

In [46]:
y_pr

array([6, 6, 5, 5, 5, 6, 6, 6, 7, 6, 6, 5, 5, 6, 6, 5, 6, 5, 5, 7, 6, 6,
       5, 7, 6, 5, 5, 5, 6, 5, 6, 6, 6, 6, 5, 6, 6, 7, 7, 6, 5, 6, 5, 5,
       5, 7, 5, 6, 5, 6, 6, 6, 5, 6, 5, 5, 6, 5, 6, 6, 5, 6, 6, 6, 5, 6,
       6, 5, 5, 6, 7, 6, 6, 5, 5, 5, 5, 6, 7, 5, 6, 5, 5, 5, 5, 6, 5, 5,
       6, 6, 7, 6, 7, 6, 5, 5, 5, 7, 6, 5, 6, 5, 6, 6, 5, 6, 6, 5, 5, 7,
       6, 5, 5, 6, 6, 5, 5, 6, 5, 7, 6, 5, 5, 7, 5, 5, 7, 6, 6, 5, 6, 5,
       6, 5, 5, 6, 6, 6, 6, 5, 5, 5, 5, 7, 6, 7], dtype=int64)

In [47]:
data['quality'].value_counts()

quality
5    617
6    586
7    185
4     47
8     16
Name: count, dtype: int64

In [48]:
# here more numbers show, less numbers ignore(4,8)

In [49]:
Y_test

1292    6
1511    5
175     5
688     5
1344    5
       ..
527     6
1164    5
425     7
1515    6
279     7
Name: quality, Length: 146, dtype: int64

In [50]:
# here we have to set parameters for better accuracy.

In [51]:
print(confusion_matrix(Y_test, y_pr))

[[ 0  5  2  0  0]
 [ 0 47 13  0  0]
 [ 0 12 48  2  0]
 [ 0  0  2 14  0]
 [ 0  0  0  1  0]]


In [53]:
print(classification_report(Y_test, y_pr))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         7
           5       0.73      0.78      0.76        60
           6       0.74      0.77      0.76        62
           7       0.82      0.88      0.85        16
           8       0.00      0.00      0.00         1

    accuracy                           0.75       146
   macro avg       0.46      0.49      0.47       146
weighted avg       0.71      0.75      0.73       146



In [54]:
# It is not correct our algo --> use balancing or HyperParameterCV

In [55]:
# HyperParameter Tuning

In [56]:
# RandomizedSearchCV for multi algos, but GridSearchCV use for one algo.

In [63]:
# RandomizedSearchCV
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# max_sample
# max_samples = [200,400,600]
n_estimators = [int(x) for x in np.linspace(10,500,50)]

max_samples = [int(x) for x in np.linspace(100,1000,7)]
max_depth = [int(x) for x in np.linspace(10,500,50)]

# Minimum number of Samples required to split a node
min_samples_split = [int(x) for x in np.linspace(1,20,5)]
min_samples_leaf = [int(x) for x in np.linspace(1,30,10)]

# Create the random grid.
random_grid = {'max_samples': max_samples, 'n_estimators': n_estimators, 'max_depth': max_depth, 'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf, 'criterion':['entropy', 'gini']}

param_grid = {'max_depth': max_depth, 'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf, 'criterion':['entropy', 'gini']}

print(random_grid)
print()
print(param_grid)

{'max_samples': [100, 250, 400, 550, 700, 850, 1000], 'n_estimators': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500], 'min_samples_split': [1, 5, 10, 15, 20], 'min_samples_leaf': [1, 4, 7, 10, 13, 17, 20, 23, 26, 30], 'criterion': ['entropy', 'gini']}

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 210, 220, 230, 240, 250, 260, 270, 280, 290, 300, 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, 470, 480, 490, 500], 'min_samples_split': [1, 5, 10, 15, 20]

In [66]:
rfc1 = RandomForestClassifier()
rfc_rcv = RandomizedSearchCV(estimator = rfc1, param_distributions = random_grid, n_iter = 35, cv = 5, verbose = 2, random_state = 0, n_jobs = -1)

# verbose = 2 =>> repeation 2 of ans.
# n_jobs = -1 =>> work parallely 
# n_iter = iteration

# fit the randomized model
rfc_rcv.fit(X_train, Y_train)

Fitting 5 folds for each of 35 candidates, totalling 175 fits


In [67]:
rfc_rcv.best_estimator_

In [73]:
# Let's first visualize the tree on the data without doiing any pre processing 
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_samples = 850, min_samples_split = 10, n_estimators = 340)

rfc.fit(X_train, Y_train)

In [74]:
from sklearn.ensemble import RandomForestClassifier
rfc1 = RandomForestClassifier(criterion = 'gini', max_depth = 10, max_samples = 850, min_samples_split = 10, n_estimators = 340)

rfc1.fit(X_train, Y_train)

In [75]:
y_pred = rfc.predict(X_test)

In [76]:
from sklearn.metrics import classification_report

In [77]:
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         7
           5       0.70      0.75      0.73        60
           6       0.67      0.76      0.71        62
           7       0.83      0.62      0.71        16
           8       0.00      0.00      0.00         1

    accuracy                           0.70       146
   macro avg       0.44      0.43      0.43       146
weighted avg       0.67      0.70      0.68       146



In [79]:
y_pred1 = rfc1.predict(X_test)

In [80]:
print(classification_report(Y_test, y_pred1))

              precision    recall  f1-score   support

           4       0.00      0.00      0.00         7
           5       0.68      0.73      0.70        60
           6       0.65      0.71      0.68        62
           7       0.77      0.62      0.69        16
           8       0.00      0.00      0.00         1

    accuracy                           0.67       146
   macro avg       0.42      0.41      0.41       146
weighted avg       0.64      0.67      0.65       146



In [81]:
# completed -----------

In [82]:
# apply this algo on other data --> (loan prediction, customer churn)

In [None]:
# Fraud Detection credit card ---> search data and work on this data.