In [1]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
#from sklearn.linear_model import LinearRegression 
from sklearn import metrics 
#import matplotlib.pyplot as plt 
import numpy as np 
#import seaborn as sns
import mlflow
from mlflow import sklearn
from sklearn import preprocessing
from sklearn import tree

In [2]:
data = pd.read_csv('winequality-red.csv', sep=";")
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
data.min()

fixed acidity           4.60000
volatile acidity        0.12000
citric acid             0.00000
residual sugar          0.90000
chlorides               0.01200
free sulfur dioxide     1.00000
total sulfur dioxide    6.00000
density                 0.99007
pH                      2.74000
sulphates               0.33000
alcohol                 8.40000
quality                 3.00000
dtype: float64

In [4]:
data.max()

fixed acidity            15.90000
volatile acidity          1.58000
citric acid               1.00000
residual sugar           15.50000
chlorides                 0.61100
free sulfur dioxide      72.00000
total sulfur dioxide    289.00000
density                   1.00369
pH                        4.01000
sulphates                 2.00000
alcohol                  14.90000
quality                   8.00000
dtype: float64

In [5]:
# there are no categorical variables. each feature is a number. Regression problem. 
# Given the set of values for features, we have to predict the quality of wine. finding correlation of each feature with our target variable - quality
#correlations = df.corr()['quality'].drop('quality')
#print(correlations)

y = data.quality
X = data.drop('quality', axis=1)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2)
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
245,7.3,0.66,0.0,2.0,0.084,6.0,23.0,0.9983,3.61,0.96,9.9
1282,7.9,0.765,0.0,2.0,0.084,9.0,22.0,0.99619,3.33,0.68,10.9
871,6.9,0.56,0.03,1.5,0.086,36.0,46.0,0.99522,3.53,0.57,10.6
83,7.3,0.67,0.26,1.8,0.401,16.0,51.0,0.9969,3.16,1.14,9.4
656,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6


In [7]:
X_train_scaled = preprocessing.scale(X_train)
X_train_scaled

array([[-0.57848206,  0.73833211, -1.37585333, ...,  1.91882369,
         1.82231183, -0.48531267],
       [-0.2343525 ,  1.31988232, -1.37585333, ...,  0.11693272,
         0.13511867,  0.45135628],
       [-0.80790176,  0.18447477, -1.22108496, ...,  1.4039977 ,
        -0.52770721,  0.1703556 ],
       ...,
       [-0.69319191, -0.53553978,  0.7909039 , ...,  0.69611196,
         0.31588937,  0.0766887 ],
       [-1.15203132, -0.0924539 , -0.70519038, ...,  0.69611196,
         0.55691696,  0.54502318],
       [ 0.10977706,  0.3229391 , -0.44724309, ..., -0.71965951,
        -1.07001929, -0.57897957]])

In [8]:
clf=tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [9]:
confidence = clf.score(X_test, y_test)
print("\nThe confidence score:\n")
print(confidence)


The confidence score:

0.59375


In [10]:
y_pred = clf.predict(X_test)

In [11]:

#converting the numpy array to list
x=np.array(y_pred).tolist()

#printing first 5 predictions
print("\nThe prediction:\n")
for i in range(0,5):
    print( x[i])
    
#printing first five expectations
print("\nThe expectation:\n")
print( y_test.head())


The prediction:

6
5
6
5
6

The expectation:

1125    7
509     7
884     6
282     5
872     4
Name: quality, dtype: int64


In [12]:
mlflow.set_experiment('WineQuality')
with mlflow.start_run():
    
    mlflow.log_metric("rmse", np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    mlflow.log_metric("r2", metrics.mean_squared_error(y_test, y_pred))
    mlflow.log_metric("mae", metrics.mean_absolute_error(y_test, y_pred))
    
    conda = {
        'name': 'mlflow-env',
        'channels': ['defaults'],
        'dependencies': [
            'python=3.7.0',
            'scikit-learn=0.19.2'
        ]
    }

    mlflow.sklearn.log_model(clf, 'model', serialization_format=mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE, conda_env = conda)