In [223]:
import pandas as pd
import numpy as np
import acquire as a
import new_lib as nl
import prepare as p
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
from scipy.stats import ttest_ind
from scipy.stats import ttest_1samp
import warnings
warnings.filterwarnings("ignore")
from itertools import combinations
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [224]:
# Acquire and prepare the data using the helpers from acquire.py and prepare.py.
wine = a.acquire_wine()
wine = p.prepare_wine(wine)
# Keep only the modelling columns. .copy() makes `wine` an independent frame so the
# cluster-label columns assigned to it later are not written onto a slice of the
# prepared frame (pandas chained-assignment hazard).
wine = wine[['volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol', 'quality']].copy()

In [227]:
# Min-max scale every column of `wine` (the target `quality` included) and wrap
# the resulting array back into a labelled DataFrame.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(
    scaler.fit_transform(wine),
    columns=['volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol', 'quality'],
)

In [228]:
df_scaled.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality
0,0.413333,0.0,0.019939,0.206092,0.202899,0.333333
1,0.533333,0.0,0.030675,0.186813,0.26087,0.333333
2,0.453333,0.024096,0.026074,0.190669,0.26087,0.333333
3,0.133333,0.337349,0.019939,0.209948,0.26087,0.5
4,0.413333,0.0,0.019939,0.206092,0.202899,0.333333


In [229]:
# Cluster the wines on volatile acidity and alcohol (k=3, fixed seed).
# NOTE(review): the features are clustered on their raw scale, so the
# wider-ranged column dominates the distance — confirm this is intended.
X = wine[['volatile_acidity', 'alcohol']]
kmeans = KMeans(n_clusters=3, random_state=77)
kmeans.fit(X)
# Store each row's cluster label as a new column (single predict call).
wine['v_a'] = kmeans.predict(X)
wine.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a
0,0.7,0.0,1.9,0.9978,9.4,5,1
1,0.88,0.0,2.6,0.9968,9.8,5,1
2,0.76,0.04,2.3,0.997,9.8,5,1
3,0.28,0.56,1.9,0.998,9.8,6,1
4,0.7,0.0,1.9,0.9978,9.4,5,1


In [230]:
# Cluster the wines on density and alcohol (k=3, fixed seed).
# NOTE(review): raw-scale features — density varies far less than alcohol in the
# rows shown, so alcohol likely drives these clusters; confirm this is intended.
Y = wine[['density', 'alcohol']]
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(Y)
# Store each row's cluster label as a new column (single predict call).
wine['d_a'] = kmeans.predict(Y)
wine.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a,d_a
0,0.7,0.0,1.9,0.9978,9.4,5,1,0
1,0.88,0.0,2.6,0.9968,9.8,5,1,0
2,0.76,0.04,2.3,0.997,9.8,5,1,0
3,0.28,0.56,1.9,0.998,9.8,6,1,0
4,0.7,0.0,1.9,0.9978,9.4,5,1,0


In [231]:
# Cluster the wines on sugar and density (k=3, fixed seed).
# NOTE(review): raw-scale features — sugar likely dominates the distance over
# density; confirm this is intended.
Z = wine[['sugar', 'density']]
kmeans = KMeans(n_clusters=3, random_state=41)
kmeans.fit(Z)
# Store each row's cluster label as a new column (single predict call).
wine['s_d'] = kmeans.predict(Z)
wine.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a,d_a,s_d
0,0.7,0.0,1.9,0.9978,9.4,5,1,0,0
1,0.88,0.0,2.6,0.9968,9.8,5,1,0,0
2,0.76,0.04,2.3,0.997,9.8,5,1,0,0
3,0.28,0.56,1.9,0.998,9.8,6,1,0,0
4,0.7,0.0,1.9,0.9978,9.4,5,1,0,0


In [232]:
# Cluster the wines on volatile acidity and citric acid (k=3, fixed seed).
A = wine[['volatile_acidity', 'citric_acid']]
kmeans = KMeans(n_clusters=3, random_state=12)
kmeans.fit(A)
# Store each row's cluster label as a new column (single predict call).
wine['v_c'] = kmeans.predict(A)
wine.head()

Unnamed: 0,volatile_acidity,citric_acid,sugar,density,alcohol,quality,v_a,d_a,s_d,v_c
0,0.7,0.0,1.9,0.9978,9.4,5,1,0,0,2
1,0.88,0.0,2.6,0.9968,9.8,5,1,0,0,2
2,0.76,0.04,2.3,0.997,9.8,5,1,0,0,2
3,0.28,0.56,1.9,0.998,9.8,6,1,0,0,0
4,0.7,0.0,1.9,0.9978,9.4,5,1,0,0,2


In [233]:
# One-hot encode the volatile-acidity/alcohol cluster labels (one column per cluster).
# NOTE(review): this rebinds `a`, which the import cell bound to the `acquire`
# module; after this cell runs, `a.acquire_wine()` in the loading cell no longer
# resolves to the module, so that cell cannot be re-run without re-importing.
a = pd.get_dummies(wine.v_a)
a

Unnamed: 0,0,1,2
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
...,...,...,...
6492,1,0,0
6493,0,1,0
6494,0,1,0
6495,0,0,1


In [234]:
# One-hot encode the remaining three cluster-label columns.
b, c, d = (pd.get_dummies(wine[label_col]) for label_col in ('d_a', 's_d', 'v_c'))

In [235]:
# Rename the indicator columns to <cluster column><1-based cluster number>.
a = a.rename(columns={k: f'v_a{k + 1}' for k in range(3)})
b = b.rename(columns={k: f'd_a{k + 1}' for k in range(3)})
c = c.rename(columns={k: f's_d{k + 1}' for k in range(3)})
d = d.rename(columns={k: f'v_c{k + 1}' for k in range(3)})

In [236]:
# Append the four blocks of cluster indicator columns to the wine frame.
wine = pd.concat((wine, a, b, c, d), axis='columns')

In [237]:
# Drop the raw features and the integer cluster labels; the one-hot indicators
# and `quality` remain.
wine = wine.drop(
    columns=[
        'volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol',
        'v_a', 'd_a', 's_d', 'v_c',
    ]
)

In [238]:
# Boolean flag: True for wines rated 6 or higher.
wine['qual'] = wine['quality'].ge(6)
wine

Unnamed: 0,quality,v_a1,v_a2,v_a3,d_a1,d_a2,d_a3,s_d1,s_d2,s_d3,v_c1,v_c2,v_c3,qual
0,5,0,1,0,1,0,0,1,0,0,0,0,1,False
1,5,0,1,0,1,0,0,1,0,0,0,0,1,False
2,5,0,1,0,1,0,0,1,0,0,0,0,1,False
3,6,0,1,0,1,0,0,1,0,0,1,0,0,True
4,5,0,1,0,1,0,0,1,0,0,0,0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,6,1,0,0,0,1,0,1,0,0,0,1,0,True
6493,5,0,1,0,1,0,0,0,0,1,0,1,0,False
6494,6,0,1,0,1,0,0,1,0,0,0,1,0,True
6495,7,0,0,1,0,0,1,1,0,0,0,1,0,True


In [239]:
# Convert the flag to 0/1 integers directly instead of comparing it to True
# inside np.where.
wine['qual'] = wine['qual'].astype(int)

In [240]:
# The raw score is no longer needed once the 0/1 flag exists.
wine = wine.drop(columns=['quality'])

In [241]:
# Give the target column its final name.
target_rename = {'qual': 'is_good'}
wine = wine.rename(columns=target_rename)

In [242]:
wine

Unnamed: 0,v_a1,v_a2,v_a3,d_a1,d_a2,d_a3,s_d1,s_d2,s_d3,v_c1,v_c2,v_c3,is_good
0,0,1,0,1,0,0,1,0,0,0,0,1,0
1,0,1,0,1,0,0,1,0,0,0,0,1,0
2,0,1,0,1,0,0,1,0,0,0,0,1,0
3,0,1,0,1,0,0,1,0,0,1,0,0,1
4,0,1,0,1,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,1,0,0,0,1,0,1,0,0,0,1,0,1
6493,0,1,0,1,0,0,0,0,1,0,1,0,0
6494,0,1,0,1,0,0,1,0,0,0,1,0,1
6495,0,0,1,0,0,1,1,0,0,0,1,0,1


In [243]:
# Split the data into train, validate and test sets with `is_good` as the target.
(
    train, val, test,
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
) = nl.train_vailidate_test_split(wine, 'is_good')

# Create scaled copies of the splits for modelling and testing.
# NOTE(review): the columns listed here are dropped from `wine` before the split,
# and X_train as displayed below holds only the one-hot cluster indicators —
# confirm nl.scale_splits can still find these columns when run in notebook order.
train_scaled, val_scaled, test_scaled = nl.scale_splits(X_train, X_val, X_test, MinMaxScaler(), columns = ['volatile_acidity', 
                                                                                                           'citric_acid', 'sugar', 
                                                                                                           'density', 
                                                                                                           'alcohol'])
# creating scaled data for modeling and testing purposes

In [246]:
X_train
# inspecting the training features (one-hot cluster indicators, already 0/1)

Unnamed: 0,v_a1,v_a2,v_a3,d_a1,d_a2,d_a3,s_d1,s_d2,s_d3,v_c1,v_c2,v_c3
3899,0,0,1,0,0,1,1,0,0,0,1,0
5268,0,0,1,0,0,1,1,0,0,0,1,0
332,0,1,0,1,0,0,1,0,0,0,0,1
274,0,1,0,1,0,0,0,0,1,0,0,1
4326,1,0,0,0,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
2916,1,0,0,0,1,0,1,0,0,0,1,0
2129,0,1,0,1,0,0,0,0,1,0,1,0
5437,0,1,0,1,0,0,1,0,0,0,1,0
481,0,0,1,0,0,1,1,0,0,1,0,0


# Work on the scaled training split. This binds `df` to the same object as
# `train_scaled` (no copy), so the column added below also appears on `train_scaled`.
df = train_scaled

# NOTE(review): `train` as displayed further down has `is_good` but no `quality`
# column — confirm `train.quality` exists when this cell is run.
df['quality'] = train.quality

df

# Cluster `df` on four feature pairs (k=3 each) and store the labels as new
# columns. Each entry is (label column, feature columns, KMeans seed); every pair
# keeps its own seed.
cluster_specs = [
    ('v_a', ['volatile_acidity', 'alcohol'], 77),
    ('d_a', ['density', 'alcohol'], 42),
    ('s_d', ['sugar', 'density'], 41),
    ('v_c', ['volatile_acidity', 'citric_acid'], 12),
]
for label_col, feature_cols, seed in cluster_specs:
    kmeans = KMeans(n_clusters=3, random_state=seed)
    kmeans.fit(df[feature_cols])
    # One predict call per pair; the labels become a column on `df`.
    df[label_col] = kmeans.predict(df[feature_cols])

df.head()

# One-hot encode each cluster-label column and append the indicators to `df`.
# Indicator columns are named <label column><1-based cluster number>, e.g.
# v_a1..v_a3. The frames are collected in a list rather than in single-letter
# variables so that `a` (the alias of the `acquire` module from the import cell)
# is not rebound here.
cluster_dummies = [
    pd.get_dummies(df[label_col]).rename(
        columns={0: f'{label_col}1', 1: f'{label_col}2', 2: f'{label_col}3'}
    )
    for label_col in ['v_a', 'd_a', 's_d', 'v_c']
]

df = pd.concat([df] + cluster_dummies, axis=1)

# Drop the raw features and the integer cluster labels; the one-hot indicators
# and `quality` remain.
df = df.drop(columns=['volatile_acidity', 'citric_acid', 'sugar', 'density', 'alcohol',
                      'v_a', 'd_a', 's_d', 'v_c'])

# Binary target in one step: 1 when quality >= 6, otherwise 0.
df['is_good'] = (df['quality'] >= 6).astype(int)

# The raw score is no longer needed once the target exists.
df = df.drop(columns='quality')

df

In [247]:
train.head()

Unnamed: 0,v_a1,v_a2,v_a3,d_a1,d_a2,d_a3,s_d1,s_d2,s_d3,v_c1,v_c2,v_c3,is_good
3899,0,0,1,0,0,1,1,0,0,0,1,0,1
5268,0,0,1,0,0,1,1,0,0,0,1,0,1
332,0,1,0,1,0,0,1,0,0,0,0,1,1
274,0,1,0,1,0,0,0,0,1,0,0,1,0
4326,1,0,0,0,1,0,1,0,0,0,1,0,1


In [248]:
# Constant column of 1s on the training split — presumably the "always predict
# is_good" baseline that the next cell's positive-class share evaluates.
train['baseline'] = 1

In [249]:
# Share of training rows with is_good == 1.
len(train[train.is_good == 1]) / len(train)

0.6395380808358537

# Separate the target from the feature frame.
df_y = df['is_good']
df_y

# errors='ignore': `baseline` is added to `train`, and nothing visible adds it to
# `df`, so a strict drop would raise KeyError on the missing label.
df = df.drop(columns=['is_good', 'baseline'], errors='ignore')

In [251]:
# Decision tree capped at depth 7, with a fixed seed.
train_tree = DecisionTreeClassifier(random_state=77, max_depth=7)

In [285]:
# Fit the tree on the training split. fit() returns the estimator itself, so the
# bare call both trains `train_tree` in place and displays it.
train_tree.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7, random_state=77)

In [286]:
# In-sample class predictions and class probabilities from the decision tree.
# Both names are overwritten by the following model cells before being read.
y_pred = train_tree.predict(X_train)
y_pred_proba = train_tree.predict_proba(X_train)

In [287]:
# Training accuracy of the decision tree.
train_tree.score(X_train, y_train)

0.7110255705251581

def rfe(X, y, n, estimator=None):
    '''
    Rank the columns of X with recursive feature elimination (RFE).

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features; its column names label the rows of the result.
    y : array-like
        Target values aligned with the rows of X.
    n : int
        Number of features to keep (passed to RFE as n_features_to_select).
    estimator : estimator object, optional
        Model that RFE refits while eliminating features. Defaults to a new
        LinearRegression(), which is what this function has always used.

    Returns
    -------
    pandas.DataFrame
        Columns 'ranking' and 'feature', sorted by ascending ranking.
    '''
    # Imported here because the notebook's import cell does not provide these names.
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LinearRegression

    if estimator is None:
        estimator = LinearRegression()
    # Named `selector` so the local does not shadow this function's own name.
    selector = RFE(estimator, n_features_to_select=n)
    selector.fit(X, y)
    feature_ranks = pd.DataFrame({'ranking': selector.ranking_,
                                  'feature': X.columns.tolist()})
    return feature_ranks.sort_values('ranking')

# Feature engineering: use RFE to confirm the best features.
feature_ranks = rfe(df, df_y, 6)
feature_ranks

df1 = df

# NOTE(review): `df1` is narrowed to the v_a / v_c indicators, but the fit below
# uses the full `df`, not `df1` — confirm which one was intended.
df1 = df1[['v_a1', 'v_a2', 'v_a3', 'v_c1', 'v_c2', 'v_c3']]

# NOTE(review): this refits the same `train_tree` object that is fitted on
# X_train elsewhere in the notebook; running this cell changes what the later
# validation/test cells for `train_tree` evaluate.
train_tree = train_tree.fit(df, df_y)
train_tree

y_pred = train_tree.predict(df)
y_pred_proba = train_tree.predict_proba(df)

train_tree.score(df, df_y)

In [288]:
# Random forest: 200 trees of depth at most 4, gini splits, fixed seed.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=4,
    min_samples_leaf=1,
    criterion='gini',
    bootstrap=True,
    class_weight=None,
    random_state=123,
)

In [289]:
# Fit the random forest on the training split.
rf.fit(X_train, y_train)

RandomForestClassifier(max_depth=4, n_estimators=200, random_state=123)

In [290]:
# In-sample class predictions and class probabilities from the random forest.
y_pred = rf.predict(X_train)
y_pred_proba = rf.predict_proba(X_train)


In [291]:
# Training accuracy of the random forest.
rf.score(X_train, y_train)

0.7104756667583173

In [292]:
# Logistic regression using the liblinear solver, fixed seed.
logit1 = LogisticRegression(
    solver='liblinear',
    C=1,
    intercept_scaling=1,
    random_state=123,
)

In [293]:
# Fit the logistic regression on the training split.
logit1.fit(X_train, y_train)

LogisticRegression(C=1, random_state=123, solver='liblinear')

In [294]:
# In-sample class predictions and class probabilities from the logistic regression.
y_pred = logit1.predict(X_train)
y_pred_proba = logit1.predict_proba(X_train)

In [295]:
# Training accuracy of the logistic regression.
logit1.score(X_train, y_train)

0.7041517734396481

In [296]:
# k-nearest neighbours with k=5 and equally weighted neighbours.
knn = KNeighborsClassifier(weights='uniform', n_neighbors=5)

In [297]:
# Fit the k-nearest-neighbours model on the training split.
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [298]:
# In-sample class predictions and class probabilities from k-nearest neighbours.
y_pred = knn.predict(X_train)
y_pred_proba = knn.predict_proba(X_train)

In [299]:
# Training accuracy of k-nearest neighbours.
knn.score(X_train, y_train)

0.7047016772064889

In [300]:
# Decision-tree predictions and class probabilities on the validation split.
y_pred = train_tree.predict(X_val)
y_pred_proba = train_tree.predict_proba(X_val)

In [301]:
# Validation accuracy of the decision tree.
train_tree.score(X_val, y_val)

0.7108974358974359

In [302]:
# Random-forest predictions and class probabilities on the validation split.
y_pred = rf.predict(X_val)
y_pred_proba = rf.predict_proba(X_val)

In [280]:
# Validation accuracy of the random forest.
# NOTE(review): this cell's execution count (280) is lower than that of the
# rf.fit cell (289), so the saved output came from an earlier fit — re-run the
# notebook top to bottom to confirm the number.
rf.score(X_val, y_val)

0.7096153846153846

In [281]:
# Logistic-regression predictions and class probabilities on the validation split.
# NOTE(review): executed (count 281) before the logit1.fit cell (count 293).
y_pred = logit1.predict(X_val)
y_pred_proba = logit1.predict_proba(X_val)

In [282]:
# Validation accuracy of the logistic regression.
# NOTE(review): this cell's execution count (282) is lower than that of the
# logit1.fit cell (293), so the saved output came from an earlier fit.
logit1.score(X_val, y_val)

0.6903846153846154

In [283]:
# k-nearest-neighbours predictions and class probabilities on the validation split.
# NOTE(review): executed (count 283) before the knn.fit cell (count 297).
y_pred = knn.predict(X_val)
y_pred_proba = knn.predict_proba(X_val)

In [284]:
# Validation accuracy of k-nearest neighbours.
# NOTE(review): this cell's execution count (284) is lower than that of the
# knn.fit cell (297), so the saved output came from an earlier fit.
knn.score(X_val, y_val)

0.7019230769230769

In [303]:
# Decision-tree predictions and class probabilities on the test split.
y_pred = train_tree.predict(X_test)
y_pred_proba = train_tree.predict_proba(X_test)

In [304]:
# Test accuracy of the decision tree.
train_tree.score(X_test, y_test)

0.7161538461538461