In [1]:
import warnings
warnings.filterwarnings('ignore')
# modules we'll use
import pandas as pd
import numpy as np
import matplotlib as plt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, RobustScaler
import sklearn
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import explained_variance_score
import scipy.stats as stats
from sklearn.cluster import KMeans
from matplotlib import cm
from sklearn.feature_selection import SelectKBest, RFE, f_regression
import viz
import os
import missingno as msno
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
#Access Stored Data
np.random.seed(4)

In [2]:
def new_wine_data():
    '''
    This function queries data from two data.world datasets and reads
    them into pandas DataFrames. One is for red wine 
    (https://query.data.world/s/hn4uhqtyxklbrtbdbxmku47vueahfw?dws=00000)
    The other is for white wine.
    (https://query.data.world/s/r2mrliiyey6g2rn54wqmx3pvcylez7?dws=00000)
    The function then adds a column 'red_wine' to each df to designate 
    whether each wine is white or red. Then the two dfs are concatenated
    into a single dataframe which is returned
    
    Arguments: None
    
    Returns: DataFrame of properties queried
    '''
    
    # Read in red wine data from data.world.
    df_r = pd.read_csv(
        'https://query.data.world/s/572bfogx33kophnqyp3lwse7pguchi?dws=00000')
    
    # Read in white wine data from data.world.
    df_w = pd.read_csv(
        'https://query.data.world/s/r2mrliiyey6g2rn54wqmx3pvcylez7?dws=00000')
    
    # Add 'is_red' column to each df'
    df_r['is_red'] = 1
    df_w['is_red'] = 0
    
    # Concatenate two dfs
    df = pd.concat([df_r, df_w])
    
    return df


def get_wine_data():
    '''
    This function checks to see if there is a local version of 'wine.csv'.
    If it finds one, it reads it into a DataFrame and returns that df.
    If it does not find one, it runs 'new_wine_data()' to pull the data
    from the host and convert to a df. Then it writes that df to a local
    file 'wine.csv' and returns the df. Function relies
    on other functions in the wrangle.py module.
    '''
    if os.path.isfile('wine.csv'):
        
        # If csv file exists read in data from csv file.
        df = pd.read_csv('wine.csv', index_col=0)
        
    else:
        
        # Read fresh data from db into a DataFrame
        df = new_wine_data()
        
        # Cache data
        df.to_csv('wine.csv')
        
    return df

In [3]:
df = new_wine_data()

In [4]:
df.index.is_unique
df.index.duplicated()
df = df.loc[~df.index.duplicated(), :]

In [5]:
def outlier_function(df, cols, k):
    '''
    This function takes in a dataframe, column, and k
    to detect and handle outlier using IQR rule
    '''
    for col in df[cols]:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        upper_bound =  q3 + k * iqr
        lower_bound =  q1 - k * iqr     
        df = df[(df[col] < upper_bound) & (df[col] > lower_bound)]
        return df

In [6]:
df = outlier_function(df,k=1.5,cols=['fixed acidity',
                     'volatile acidity',
                     'citric acid',
                     'residual sugar',
                     'chlorides',
                     'free sulfur dioxide',
                     'total sulfur dioxide',
                     'density',
                     'pH',
                     'sulphates',
                     'alcohol',
                     'quality'])

In [7]:
df['wine_quality'] = pd.cut(df.quality, [0,5,7,10])

In [8]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,is_red,wine_quality
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1,"(0, 5]"
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,1,"(0, 5]"
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,1,"(0, 5]"
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,1,"(0, 5]"
5,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,1,"(0, 5]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,0,"(5, 7]"
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,0,"(0, 5]"
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,0,"(5, 7]"
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,0,"(5, 7]"


In [9]:
df = pd.get_dummies(df, columns = ['is_red'])

In [10]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,wine_quality,is_red_0,is_red_1
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,"(0, 5]",0,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5,"(0, 5]",0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5,"(0, 5]",0,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5,"(0, 5]",0,1
5,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,5,"(0, 5]",0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,6,"(5, 7]",1,0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,5,"(0, 5]",1,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,"(5, 7]",1,0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,"(5, 7]",1,0


In [11]:
df = pd.get_dummies(df, columns = ['wine_quality'])

In [16]:
df.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'is_red_0', 'is_red_1',
       'wine_quality_(0, 5]', 'wine_quality_(5, 7]', 'wine_quality_(7, 10]'],
      dtype='object')

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,is_red_0,is_red_1
0,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,1
1,7.8,0.88,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,0,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,0,1
4,7.4,0.70,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,0,1
5,7.4,0.66,0.00,1.8,0.075,13.0,40.0,0.99780,3.51,0.56,9.4,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.50,11.2,1,0
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.99490,3.15,0.46,9.6,1,0
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,1,0
4896,5.5,0.29,0.30,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,1,0


In [27]:
from sklearn.model_selection import train_test_split

def train_validate_test_split(df, target, seed=123):
    '''
    This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''
    train_validate, test = train_test_split(df, test_size=0.2, 
                                            random_state=seed 
                                            )
    train, validate = train_test_split(train_validate, test_size=0.3, 
                                       random_state=seed
                                       )
    return train, validate, test



In [35]:
# split into train, validate, test
train, validate, test = train_validate_test_split(df, target='quality', seed=123)

# create X & y version of train, where y is a series with just the target variable and X are all the features. 
X_train = train
X_validate = validate
X_test = test


y_train = train[['quality','wine_quality_(0, 5]','wine_quality_(5, 7]','wine_quality_(7, 10]']]
X_train = X_train.drop(columns=['quality','wine_quality_(0, 5]','wine_quality_(5, 7]','wine_quality_(7, 10]'])


y_validate = train[['quality','wine_quality_(0, 5]','wine_quality_(5, 7]','wine_quality_(7, 10]']]
X_validate = X_validate.drop(columns=['quality','wine_quality_(0, 5]','wine_quality_(5, 7]','wine_quality_(7, 10]'])


y_test = train[['quality','wine_quality_(0, 5]','wine_quality_(5, 7]','wine_quality_(7, 10]']]
X_test = X_test.drop(columns=['quality','wine_quality_(0, 5]','wine_quality_(5, 7]','wine_quality_(7, 10]'])



Unnamed: 0,quality,"wine_quality_(0, 5]","wine_quality_(5, 7]","wine_quality_(7, 10]"
4873,6,0,1,0
4740,5,1,0,0
3306,7,0,1,0
2454,5,1,0,0
3538,7,0,1,0
...,...,...,...,...
2079,4,1,0,0
3498,6,0,1,0
2498,5,1,0,0
1153,6,0,1,0


In [37]:
X_train.shape, X_validate.shape, X_test.shape

((2568, 13), (1101, 13), (918, 13))

In [38]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=3, 
                            random_state=123)


In [39]:
rf.fit(X_train, y_train)


In [45]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2568 entries, 4873 to 4009
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         2568 non-null   float64
 1   volatile acidity      2568 non-null   float64
 2   citric acid           2568 non-null   float64
 3   residual sugar        2568 non-null   float64
 4   chlorides             2568 non-null   float64
 5   free sulfur dioxide   2568 non-null   float64
 6   total sulfur dioxide  2568 non-null   float64
 7   density               2568 non-null   float64
 8   pH                    2568 non-null   float64
 9   sulphates             2568 non-null   float64
 10  alcohol               2568 non-null   float64
 11  is_red_0              2568 non-null   uint8  
 12  is_red_1              2568 non-null   uint8  
dtypes: float64(11), uint8(2)
memory usage: 245.8 KB


In [40]:
print(rf.feature_importances_)

[0.01403467 0.13519276 0.06400977 0.02634046 0.06543727 0.04064578
 0.0341578  0.19275082 0.00895687 0.03013732 0.37191205 0.00959321
 0.00683122]


In [47]:
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))


ValueError: multiclass-multioutput is not supported

In [42]:
print(confusion_matrix(y_train, y_pred))


ValueError: multiclass-multioutput is not supported

In [None]:
print(classification_report(y_train, y_pred))

In [None]:
print('Accuracy of random forest classifier on test set: {:.2f}'
     .format(rf.score(X_validate, y_validate)))