In [1]:
import pandas as pd
import numpy as np
import acquire as a
import new_lib as nl
import prepare as p
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
import warnings
warnings.filterwarnings("ignore")
from itertools import combinations
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Acquire the wine data and pass it through the prepare step
# (helpers come from acquire.py and prepare.py).
wine = p.prepare_wine(a.acquire_wine())
# Keep only the five features used below plus the 'quality' column.
wine = wine.loc[:, ['volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol', 'quality']]

In [3]:
# Split the data into train / validate / test pieces, passing 'quality' as the
# target column to the project helper.
(train, val, test,
 X_train, y_train,
 X_val, y_val,
 X_test, y_test) = nl.train_vailidate_test_split(wine, 'quality')

In [4]:
# Build scaled versions of the three feature splits for modeling and testing,
# handing the project helper a MinMaxScaler and the columns to scale.
train_scaled, val_scaled, test_scaled = nl.scale_splits(
    X_train,
    X_val,
    X_test,
    MinMaxScaler(),
    columns=['volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol'],
)

In [5]:
train_scaled
# Visual check of the scaled training features.
# NOTE(review): values are expected to fall in [0, 1] if nl.scale_splits applies
# the MinMaxScaler passed above -- confirm against new_lib.

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol
3899,0.146667,0.198795,0.051780,0.156846,0.666667
5268,0.193333,0.192771,0.029126,0.084055,0.681159
332,0.333333,0.168675,0.080906,0.436742,0.202899
274,0.380000,0.108434,0.203883,0.516898,0.202899
4326,0.060000,0.186747,0.025890,0.160312,0.362319
...,...,...,...,...,...
2916,0.046667,0.204819,0.022654,0.237435,0.405797
2129,0.013333,0.210843,0.135922,0.324090,0.246377
5437,0.220000,0.132530,0.132686,0.319757,0.173913
481,0.146667,0.337349,0.067961,0.397747,0.536232


In [6]:
# Work on a copy: the cells below add the target and cluster-label columns to
# df, and a plain alias would add them to train_scaled as well.
df = train_scaled.copy()

In [7]:
# Attach the target from the unscaled training split; pandas aligns the
# assignment on the row index.
df['quality'] = train['quality']

In [8]:
# Inspect the working frame: the scaled feature columns plus 'quality'.
df

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality
3899,0.146667,0.198795,0.051780,0.156846,0.666667,7
5268,0.193333,0.192771,0.029126,0.084055,0.681159,7
332,0.333333,0.168675,0.080906,0.436742,0.202899,6
274,0.380000,0.108434,0.203883,0.516898,0.202899,5
4326,0.060000,0.186747,0.025890,0.160312,0.362319,6
...,...,...,...,...,...,...
2916,0.046667,0.204819,0.022654,0.237435,0.405797,5
2129,0.013333,0.210843,0.135922,0.324090,0.246377,6
5437,0.220000,0.132530,0.132686,0.319757,0.173913,5
481,0.146667,0.337349,0.067961,0.397747,0.536232,8


In [9]:
# Cluster the training rows on volatile acidity and alcohol into 3 groups.
X = df[['volatile_acidity', 'alcohol']]
kmeans = KMeans(n_clusters=3, random_state = 77)
kmeans.fit(X)
# Store each row's cluster label in the 'v_a' column.
df['v_a'] = kmeans.predict(X)
df.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a
3899,0.146667,0.198795,0.05178,0.156846,0.666667,7,1
5268,0.193333,0.192771,0.029126,0.084055,0.681159,7,1
332,0.333333,0.168675,0.080906,0.436742,0.202899,6,2
274,0.38,0.108434,0.203883,0.516898,0.202899,5,2
4326,0.06,0.186747,0.02589,0.160312,0.362319,6,0


In [10]:
# Cluster the training rows on density and alcohol into 3 groups.
Y = df[['density', 'alcohol']]
kmeans = KMeans(n_clusters=3, random_state = 42)
kmeans.fit(Y)
# Store each row's cluster label in the 'd_a' column.
df['d_a'] = kmeans.predict(Y)
df.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a,d_a
3899,0.146667,0.198795,0.05178,0.156846,0.666667,7,1,2
5268,0.193333,0.192771,0.029126,0.084055,0.681159,7,1,2
332,0.333333,0.168675,0.080906,0.436742,0.202899,6,2,1
274,0.38,0.108434,0.203883,0.516898,0.202899,5,2,1
4326,0.06,0.186747,0.02589,0.160312,0.362319,6,0,0


In [11]:
# Cluster the training rows on sugar and density into 3 groups.
Z = df[['sugar', 'density']]
kmeans = KMeans(n_clusters=3, random_state = 41)
kmeans.fit(Z)
# Store each row's cluster label in the 's_d' column.
df['s_d'] = kmeans.predict(Z)
df.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a,d_a,s_d
3899,0.146667,0.198795,0.05178,0.156846,0.666667,7,1,2,1
5268,0.193333,0.192771,0.029126,0.084055,0.681159,7,1,2,1
332,0.333333,0.168675,0.080906,0.436742,0.202899,6,2,1,2
274,0.38,0.108434,0.203883,0.516898,0.202899,5,2,1,2
4326,0.06,0.186747,0.02589,0.160312,0.362319,6,0,0,1


In [12]:
# Cluster the training rows on volatile acidity and citric acid into 3 groups.
A = df[['volatile_acidity', 'citric_acid']]
kmeans = KMeans(n_clusters=3, random_state = 12)
kmeans.fit(A)
# Store each row's cluster label in the 'v_c' column.
df['v_c'] = kmeans.predict(A)
df.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a,d_a,s_d,v_c
3899,0.146667,0.198795,0.05178,0.156846,0.666667,7,1,2,1,2
5268,0.193333,0.192771,0.029126,0.084055,0.681159,7,1,2,1,2
332,0.333333,0.168675,0.080906,0.436742,0.202899,6,2,1,2,0
274,0.38,0.108434,0.203883,0.516898,0.202899,5,2,1,2,0
4326,0.06,0.186747,0.02589,0.160312,0.362319,6,0,0,1,2


In [13]:
a = pd.get_dummies(df.v_a)
a

Unnamed: 0,0,1,2
3899,0,1,0
5268,0,1,0
332,0,0,1
274,0,0,1
4326,1,0,0
...,...,...,...
2916,0,1,0
2129,1,0,0
5437,1,0,0
481,0,1,0


In [14]:
b = pd.get_dummies(df.d_a)
c = pd.get_dummies(df.s_d)
d = pd.get_dummies(df.v_c)

In [15]:
a = a.rename(columns = {0: 'v_a1', 1: 'v_a2', 2:'v_a3'})
b = b.rename(columns = {0: 'd_a1', 1: 'd_a2', 2:'d_a3'})
c = c.rename(columns = {0: 's_d1', 1: 's_d2', 2:'s_d3'})
d = d.rename(columns = {0: 'v_c1', 1: 'v_c2', 2:'v_c3'})

In [16]:
df = pd.concat([df,a,b,c,d], axis = 1)

In [17]:
# Drop the scaled features and the integer cluster labels; what remains is
# 'quality' plus the one-hot cluster columns.
df = df.drop(
    columns=[
        'volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol',
        'v_a', 'd_a', 's_d', 'v_c',
    ]
)

In [18]:
# Boolean flag: True where quality is 7 or higher.
df['qual'] = df['quality'].ge(7)
df

Unnamed: 0,quality,v_a1,v_a2,v_a3,d_a1,d_a2,d_a3,s_d1,s_d2,s_d3,v_c1,v_c2,v_c3,qual
3899,7,0,1,0,0,0,1,0,1,0,0,0,1,True
5268,7,0,1,0,0,0,1,0,1,0,0,0,1,True
332,6,0,0,1,0,1,0,0,0,1,1,0,0,False
274,5,0,0,1,0,1,0,0,0,1,1,0,0,False
4326,6,1,0,0,1,0,0,0,1,0,0,0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2916,5,0,1,0,1,0,0,0,1,0,0,0,1,False
2129,6,1,0,0,0,1,0,0,0,1,0,0,1,False
5437,5,1,0,0,0,1,0,0,0,1,0,0,1,False
481,8,0,1,0,1,0,0,0,0,1,0,1,0,True


In [19]:
# Convert the boolean flag to 1 (True) / 0 (False).
df['qual'] = np.where(df['qual'], 1, 0)

In [20]:
# The raw quality score is no longer needed once the 0/1 flag exists.
df = df.drop(columns=['quality'])

In [21]:
# Give the 0/1 target its final name.
df = df.rename({'qual': 'is_good'}, axis='columns')

In [22]:
# Inspect the modeling frame: twelve one-hot cluster columns plus 'is_good'.
df

Unnamed: 0,v_a1,v_a2,v_a3,d_a1,d_a2,d_a3,s_d1,s_d2,s_d3,v_c1,v_c2,v_c3,is_good
3899,0,1,0,0,0,1,0,1,0,0,0,1,1
5268,0,1,0,0,0,1,0,1,0,0,0,1,1
332,0,0,1,0,1,0,0,0,1,1,0,0,0
274,0,0,1,0,1,0,0,0,1,1,0,0,0
4326,1,0,0,1,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2916,0,1,0,1,0,0,0,1,0,0,0,1,0
2129,1,0,0,0,1,0,0,0,1,0,0,1,0
5437,1,0,0,0,1,0,0,0,1,0,0,1,0
481,0,1,0,1,0,0,0,0,1,0,1,0,1


In [23]:
# Constant column of zeros -- presumably the "always predict 0" baseline.
# NOTE(review): nothing visible in this notebook reads this column before it is
# dropped again ahead of model fitting.
df['baseline'] = 0

In [24]:
# Share of training rows with is_good == 0, i.e. the accuracy obtained by
# predicting 0 for every row.
(df.is_good == 0).mean()

0.7973604619191641

In [25]:
# Pull the target out as its own Series for model fitting.
df_y = df.loc[:, 'is_good']
df_y

3899    1
5268    1
332     0
274     0
4326    0
       ..
2916    0
2129    0
5437    0
481     1
3107    0
Name: is_good, Length: 3637, dtype: int64

In [26]:
# Leave only the one-hot cluster columns as model features.
df = df.drop(['is_good', 'baseline'], axis=1)

In [27]:
# Decision tree limited to depth 7, seeded for repeatable fits.
train_tree = DecisionTreeClassifier(random_state=77, max_depth=7)

In [28]:
# Fit on all twelve cluster features; fit() returns the estimator itself, so
# the cell still displays the fitted tree.
train_tree.fit(df, df_y)

In [29]:
# Class predictions and class probabilities for the training rows.
y_pred = train_tree.predict(df)
y_pred_proba = train_tree.predict_proba(df)

In [30]:
# Accuracy on the same training rows the tree was fit on (not a held-out split).
train_tree.score(df, df_y)

0.8017596920538905

In [31]:
def rfe(X, y, n, estimator=None):
    '''
    Rank the columns of X with recursive feature elimination (RFE).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns; the column names label the rows of the result.
    y : array-like
        Target values, one per row of X.
    n : int
        Number of features RFE keeps; the kept features all get ranking 1.
    estimator : scikit-learn estimator, optional
        Model that RFE refits while eliminating features. When omitted a
        LinearRegression is used, which is what this function always did.
        For a 0/1 target a classifier such as LogisticRegression can be
        passed instead.

    Returns
    -------
    pandas.DataFrame
        Columns 'ranking' and 'feature', sorted by ranking ascending.
        Features with equal ranking keep the column order of X.
    '''
    # Imported locally because the notebook's import cell does not provide them.
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LinearRegression

    if estimator is None:
        estimator = LinearRegression()
    selector = RFE(estimator, n_features_to_select=n)
    selector.fit(X, y)
    feature_ranks = pd.DataFrame({'ranking': selector.ranking_,
                                  'feature': X.columns.tolist()})
    # mergesort is stable, so tied rankings come out in a deterministic order.
    return feature_ranks.sort_values('ranking', kind='mergesort')
# Helper that uses RFE to rank candidate features.

In [32]:
# Feature selection: use RFE to rank the cluster features, keeping the top 6.
feature_ranks = rfe(df, df_y, 6)
feature_ranks

Unnamed: 0,ranking,feature
0,1,v_a1
1,1,v_a2
2,1,v_a3
9,1,v_c1
10,1,v_c2
11,1,v_c3
8,2,s_d3
6,3,s_d1
7,4,s_d2
4,5,d_a2


In [33]:
# df1 starts out as another name for df (no copy is made here).
df1 = df

In [34]:
# Restrict df1 to the six columns that received ranking 1 from RFE.
df1 = df1.loc[:, ['v_a1', 'v_a2', 'v_a3', 'v_c1', 'v_c2', 'v_c3']]

In [35]:
# Refit the same tree object on the six selected features; this replaces the
# earlier twelve-feature fit. fit() returns the estimator, so it is displayed.
train_tree.fit(df1, df_y)

In [36]:
# Class predictions and class probabilities on the six-feature training frame.
y_pred = train_tree.predict(df1)
y_pred_proba = train_tree.predict_proba(df1)

In [37]:
# Training accuracy with only the six selected features.
# NOTE(review): in the recorded run this equals the share of is_good == 0
# computed earlier, i.e. no gain over always predicting 0.
train_tree.score(df1, df_y)

0.7973604619191641

In [194]:
# Random forest: 200 depth-4 trees, class 0 weighted 1.4x relative to class 1,
# seeded for repeatable fits.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    min_samples_leaf=1,
    criterion='gini',
    bootstrap=True,
    class_weight={0:1.4, 1:1},
    random_state=123,
)

In [195]:
# Fit the forest on all twelve one-hot cluster features.
rf.fit(df, df_y)

In [196]:
# Class predictions and class probabilities for the training rows
# (these overwrite the decision-tree results stored under the same names).
y_pred = rf.predict(df)
y_pred_proba = rf.predict_proba(df)


In [197]:
# Accuracy on the same training rows the forest was fit on (not a held-out split).
rf.score(df, df_y)

0.8003849326367886