## Wine Quality 数据集 随机森林在分类任务中的应用

In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [1]:
from sklearn.datasets import load_wine

In [2]:
wine= load_wine()

In [3]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

In [4]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [5]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:                   0.34  5.08    2.03  1.00

## Convert the data into Pandas Dataframe

In [6]:
import pandas as pd
df = pd.DataFrame(wine.data, columns=wine.feature_names)

In [7]:
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [8]:
df.tail(3)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.2,0.59,1.56,835.0
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.3,0.6,1.62,840.0
177,14.13,4.1,2.74,24.5,96.0,2.05,0.76,0.56,1.35,9.2,0.61,1.6,560.0


In [9]:
df.shape

(178, 13)

In [10]:
# add the target in our dataframe.

In [11]:
df['Target']= wine.target

In [12]:
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,Target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [13]:
df.shape

(178, 14)

In [14]:
df.columns

Index(['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue',
       'od280/od315_of_diluted_wines', 'proline', 'Target'],
      dtype='object')

In [15]:
df.dtypes

alcohol                         float64
malic_acid                      float64
ash                             float64
alcalinity_of_ash               float64
magnesium                       float64
total_phenols                   float64
flavanoids                      float64
nonflavanoid_phenols            float64
proanthocyanins                 float64
color_intensity                 float64
hue                             float64
od280/od315_of_diluted_wines    float64
proline                         float64
Target                            int32
dtype: object

In [16]:
df.isnull().sum()  ## to check if there are any null values

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
Target                          0
dtype: int64

In [17]:
df['Target'].value_counts()

Target
1    71
0    59
2    48
Name: count, dtype: int64

## Model Building

In [18]:
from sklearn.model_selection import train_test_split                         
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [19]:
# split out validation dataset
array = df.values
X = array[:, 0:13]
Y = array[:, 12]
validation_size=.33
seed= 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X,Y, test_size=validation_size, random_state= seed)

In [20]:
print(Y)

[1065. 1050. 1185. 1480.  735. 1450. 1290. 1295. 1045. 1045. 1510. 1280.
 1320. 1150. 1547. 1310. 1280. 1130. 1680.  845.  780.  770. 1035. 1015.
  845.  830. 1195. 1285.  915. 1035. 1285. 1515.  990. 1235. 1095.  920.
  880. 1105. 1020.  760.  795. 1035. 1095.  680.  885. 1080. 1065.  985.
 1060. 1260. 1150. 1265. 1190. 1375. 1060. 1120.  970. 1270. 1285.  520.
  680.  450.  630.  420.  355.  678.  502.  510.  750.  718.  870.  410.
  472.  985.  886.  428.  392.  500.  750.  463.  278.  714.  630.  515.
  520.  450.  495.  562.  680.  625.  480.  450.  495.  290.  345.  937.
  625.  428.  660.  406.  710.  562.  438.  415.  672.  315.  510.  488.
  312.  680.  562.  325.  607.  434.  385.  407.  495.  345.  372.  564.
  625.  465.  365.  380.  380.  378.  352.  466.  342.  580.  630.  530.
  560.  600.  650.  695.  720.  515.  580.  590.  600.  780.  520.  550.
  855.  830.  415.  625.  650.  550.  500.  480.  425.  675.  640.  725.
  480.  880.  660.  620.  520.  680.  570.  675.  6

In [21]:
print(X_train.shape)

(119, 13)


In [22]:
print(Y_train.shape)

(119,)


In [23]:
print(X_train)

[[1.438e+01 3.590e+00 2.280e+00 ... 1.040e+00 3.440e+00 1.065e+03]
 [1.345e+01 3.700e+00 2.600e+00 ... 8.500e-01 1.560e+00 6.950e+02]
 [1.237e+01 1.070e+00 2.100e+00 ... 1.040e+00 2.770e+00 6.600e+02]
 ...
 [1.237e+01 1.170e+00 1.920e+00 ... 1.120e+00 3.480e+00 5.100e+02]
 [1.305e+01 2.050e+00 3.220e+00 ... 1.130e+00 3.200e+00 8.300e+02]
 [1.327e+01 4.280e+00 2.260e+00 ... 5.900e-01 1.560e+00 8.350e+02]]


In [24]:
# check the algorithm

models=[]
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))


# evaluate each model
seed = 42  # 你可以选择一个固定的整数
results =[]
names=[]
for name , model in models:
    kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results= cross_val_score(model, X_train, Y_train, cv=kfold,scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    
    msg= '%s:, %f, (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR:, 0.025000, (0.038188)
LDA:, 0.008333, (0.025000)
KNN:, 0.092424, (0.069168)
CART:, 0.083333, (0.074536)
NB:, 0.277273, (0.129108)
SVM:, 0.041667, (0.041667)


In [25]:

models= []

models.append(('LDA ', LinearDiscriminantAnalysis()))                  
models.append(('KNN', KNeighborsClassifier()))                  
models.append(('CART', DecisionTreeClassifier()))                   
models.append(('NB', GaussianNB()))                 
models.append(('SVM ',   SVC()))                 
models.append(('LR', LogisticRegression()))

In [26]:
# evaluate each model

results =[]
names =[]
for name, model in models:
    kfold=KFold(n_splits=10, shuffle=True, random_state =seed)
    cv_results = cross_val_score(model, X_train, Y_train, cv=kfold,scoring='accuracy')
    
    results.append(cv_results)
    names.append(name)
    
    msg = '%s: %f, (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)


LDA : 0.008333, (0.025000)
KNN: 0.092424, (0.069168)
CART: 0.100758, (0.062200)
NB: 0.277273, (0.129108)
SVM : 0.041667, (0.041667)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LR: 0.025000, (0.038188)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [27]:
# make predictions on validation datasets

LDA = LinearDiscriminantAnalysis()
LDA.fit(X_train, Y_train)
predictions= LDA.predict(X_validation)
print(accuracy_score(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))

0.01694915254237288
              precision    recall  f1-score   support

       278.0       0.00      0.00      0.00         0
       325.0       0.00      0.00      0.00         1
       342.0       0.00      0.00      0.00         1
       345.0       0.00      0.00      0.00         0
       352.0       0.00      0.00      0.00         1
       355.0       0.00      0.00      0.00         0
       365.0       0.00      0.00      0.00         1
       378.0       0.00      0.00      0.00         1
       380.0       0.00      0.00      0.00         1
       385.0       0.00      0.00      0.00         0
       392.0       0.00      0.00      0.00         0
       406.0       0.00      0.00      0.00         1
       407.0       0.00      0.00      0.00         1
       415.0       0.00      0.00      0.00         0
       420.0       0.00      0.00      0.00         1
       428.0       0.00      0.00      0.00         1
       434.0       0.00      0.00      0.00         0
       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
