#### Task 1: Import libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn

#### Task 2: Open file
Open the file called 'RedWineQual.csv' and display the first 10 rows.
The target value that we want to predict is quality.
- 0 = low
- 1 = high


In [2]:
redwine = pd.read_csv('RedWineQual.csv')
print(redwine.head(10))

   fixed_acidity  volatile_acidity  total_sulfur_dioxide  density  sulphates  \
0            7.4              0.70                    34   0.9978       0.56   
1            7.8              0.88                    67   0.9968       0.68   
2            7.8              0.76                    54   0.9970       0.65   
3           11.2              0.28                    60   0.9980       0.58   
4            7.4              0.70                    34   0.9978       0.56   
5            7.4              0.66                    40   0.9978       0.56   
6            7.9              0.60                    59   0.9964       0.46   
7            7.3              0.65                    21   0.9946       0.47   
8            7.8              0.58                    18   0.9968       0.57   
9            7.5              0.50                   102   0.9978       0.80   

   alcohol  quality  
0      9.4        0  
1      9.8        0  
2      9.8        0  
3      9.8        1  
4      9.

#### Task 3: Define Training Features
Create a dataset called X which includes all features except for quality.

Output the top 10 rows to verify the data.

In [3]:
X = redwine.drop('quality', axis=1)
print(X.head(10))

   fixed_acidity  volatile_acidity  total_sulfur_dioxide  density  sulphates  \
0            7.4              0.70                    34   0.9978       0.56   
1            7.8              0.88                    67   0.9968       0.68   
2            7.8              0.76                    54   0.9970       0.65   
3           11.2              0.28                    60   0.9980       0.58   
4            7.4              0.70                    34   0.9978       0.56   
5            7.4              0.66                    40   0.9978       0.56   
6            7.9              0.60                    59   0.9964       0.46   
7            7.3              0.65                    21   0.9946       0.47   
8            7.8              0.58                    18   0.9968       0.57   
9            7.5              0.50                   102   0.9978       0.80   

   alcohol  
0      9.4  
1      9.8  
2      9.8  
3      9.8  
4      9.4  
5      9.4  
6      9.4  
7     10.0  
8 

#### Task 4: Define Target Feature
Create a dataset called y which includes only the quality feature.

Output the top 10 rows to verify the data.

In [4]:
y = redwine['quality']
print(y.head(10))

0    0
1    0
2    0
3    1
4    0
5    0
6    0
7    1
8    1
9    0
Name: quality, dtype: int64


#### Task 5: Split the data set into training and testing partitions
Split the 2 feature data sets so that 25% of the data is in the testing partition.  Use the value of 42 as the random state. Store the results into four data sets called X_train, X_test, y_train, and y_test.

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

#### Task 6: Compare partition means
Compare the means of the target feature (quality) for the training and testing partition. Print these means to the output and verify that they are close (within 5%).

In [7]:
train_mean = y_train.mean()
print(f"Training Partition Mean: {train_mean:.5f}")

test_mean = y_test.mean()
print(f"Testing Partition Mean: {test_mean:.3f}")

difference = abs(train_mean - test_mean) / train_mean * 100
print(f"Percentage Difference: {difference:.2f}%")


Training Partition Mean: 0.52794
Testing Partition Mean: 0.555
Percentage Difference: 5.13%


#### Task 7: Define classifier
Define the decision tree classifier with a max depth of 4 levels,  Fit the model to the data.

In [8]:
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(max_depth=4, random_state=42)

dt_model.fit(X_train, y_train)

#### Task 8: Generate Results
Generate a set of predictions based on the model using the Training Features in the training partition.  Report the results in a confusion matrix.

In [9]:
from sklearn.metrics import confusion_matrix

y_train_pred = dt_model.predict(X_train)

cm = confusion_matrix(y_train, y_train_pred)
print("Confusion Matrix:\n", cm)

Confusion Matrix:
 [[446 120]
 [149 484]]


#### Task 9: Calculate and report the accuracy score
Calculate the accuracy of the model by comparing the actual values of Y against the predicted values of Y.  Report this accuracy at a level of 3 decimal places.

In [11]:
from sklearn.metrics import accuracy_score

train_accuracy = accuracy_score(y_train, y_train_pred)

print(f"Training Accuracy: {train_accuracy:.3f}")

Training Accuracy: 0.776


#### Task 10: Report the most influential features
Identify the most influential features in prediction the quality of the wine and report them in decending order.

In [12]:
importances = dt_model.feature_importances_

feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print(feature_importances)

                Feature  Importance
5               alcohol    0.571476
4             sulphates    0.182659
1      volatile_acidity    0.116517
2  total_sulfur_dioxide    0.110724
3               density    0.012698
0         fixed_acidity    0.005927
