In [1]:
# Import Modules
from pathlib import Path
from sqlalchemy import create_engine, text
from warnings import simplefilter
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report


### Read CSV file into Pandas DataFrames

In [2]:
# Load the CSV file into a DataFrame
cleanwine_df = pd.read_csv('winemag-data-clean.csv')

# Display the first few rows of the DataFrame
cleanwine_df.head()

Unnamed: 0,country,points,price,province,region_1,taster_name,title,variety,winery,vintage_year,style,rating_category
0,Portugal,87,15.0,Douro,Douro,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,red,good
1,US,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,unknown,good
2,US,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,white,good
3,US,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,red,good
4,Spain,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2011,red,good


In [3]:
## Count rows of dataframe
cleanwine_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116765 entries, 0 to 116764
Data columns (total 12 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   country          116765 non-null  object 
 1   points           116765 non-null  int64  
 2   price            116765 non-null  float64
 3   province         116765 non-null  object 
 4   region_1         116765 non-null  object 
 5   taster_name      116765 non-null  object 
 6   title            116765 non-null  object 
 7   variety          116765 non-null  object 
 8   winery           116765 non-null  object 
 9   vintage_year     116765 non-null  int64  
 10  style            116765 non-null  object 
 11  rating_category  116765 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 10.7+ MB


In [4]:
# Count occurrences of unique values in various columns
print(cleanwine_df['title'].value_counts())
print(cleanwine_df['variety'].value_counts())
print(cleanwine_df['winery'].value_counts())

title
Château Lestage Simon 2012  Haut-Médoc                                         4
Vignerons des Pierres Dorées 2015 Salamandre d'Or  (Coteaux Bourguignons)      4
Domaine Vacheron 2015  Sancerre                                                4
Château de Rochemorin 2013  Pessac-Léognan                                     3
Woodinville Wine Cellars 2012 Reserve Syrah (Columbia Valley (WA))             3
                                                                              ..
Fattoria di Casalbosco 2011  Chianti                                           1
Erzetič 2013 Damski Rosé (Goriska Brda)                                        1
Four Lanterns 2014 Sunset Rosé (Paso Robles)                                   1
Foley 2013 Rancho Santa Rosa Pinot Noir (Sta. Rita Hills)                      1
Domaine Schoffit 2012 Lieu-dit Harth Cuvée Caroline Gewurztraminer (Alsace)    1
Name: count, Length: 107345, dtype: int64
variety
Pinot Noir                  12649
Chardonnay         

In [5]:
# Drop title and winery columns
cleanwine_df = cleanwine_df.drop(columns= ['title', 'winery'], axis=1)


In [6]:
## Drop columns where style of wine is 'unknown'
cleanwine_df = cleanwine_df[cleanwine_df['style'] != 'unknown']

In [7]:
# Check counts for new type column.
cleanwine_df['style'].value_counts()

style
red      47347
white    25476
rosé      3269
Name: count, dtype: int64

In [8]:
# Generate a list of categorical variable column names
application_categories = cleanwine_df.select_dtypes(include=['object']).columns.tolist()

# Create a OneHotEncoder instance with sparse_output=False
enc = OneHotEncoder(sparse_output=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(cleanwine_df[application_categories]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(application_categories)

# Display the first few rows of the encoded dataframe
encode_df.head()


Unnamed: 0,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Brazil,country_Bulgaria,country_Canada,country_Chile,country_China,country_Croatia,...,variety_White Blend,variety_White Riesling,variety_Zinfandel,style_red,style_rosé,style_white,rating_category_average,rating_category_excellent,rating_category_good,rating_category_very good
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [9]:
# Merge one-hot encoded features and drop the original categorical columns
cleanwine_df = cleanwine_df.merge(encode_df, left_index=True, right_index=True)

# Drop the original categorical columns
cleanwine_df = cleanwine_df.drop(application_categories, axis=1)

# Display the first few rows of the modified DataFrame
cleanwine_df.head()


Unnamed: 0,points,price,vintage_year,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Brazil,country_Bulgaria,country_Canada,...,variety_White Blend,variety_White Riesling,variety_Zinfandel,style_red,style_rosé,style_white,rating_category_average,rating_category_excellent,rating_category_good,rating_category_very good
0,87,15.0,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,87,13.0,2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,87,65.0,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,87,15.0,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,87,19.0,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [10]:
print(cleanwine_df.columns.tolist())

['points', 'price', 'vintage_year', 'country_Argentina', 'country_Armenia', 'country_Australia', 'country_Austria', 'country_Brazil', 'country_Bulgaria', 'country_Canada', 'country_Chile', 'country_China', 'country_Croatia', 'country_Cyprus', 'country_Czech Republic', 'country_England', 'country_France', 'country_Georgia', 'country_Germany', 'country_Greece', 'country_Hungary', 'country_India', 'country_Israel', 'country_Italy', 'country_Lebanon', 'country_Luxembourg', 'country_Macedonia', 'country_Mexico', 'country_Moldova', 'country_Morocco', 'country_New Zealand', 'country_Peru', 'country_Portugal', 'country_Romania', 'country_Serbia', 'country_Slovakia', 'country_Slovenia', 'country_South Africa', 'country_Spain', 'country_Switzerland', 'country_Turkey', 'country_US', 'country_Ukraine', 'country_Uruguay', 'province_Achaia', 'province_Aconcagua Costa', 'province_Aconcagua Valley', 'province_Aegean', 'province_Agioritikos', 'province_Ahr', 'province_Alenquer', 'province_Alentejano', 

## Random Forest Machine Learning

In [None]:
# Assume the target column is 'rating_category_good' and the features are the rest of the columns
X = cleanwine_df.copy()
X = cleanwine_df.drop(columns=['rating_category_average', 'rating_category_excellent', 'rating_category_good', 'rating_category_very good'], axis=1)

# Create the target
y = cleanwine_df['rating_category_good']

In [12]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [13]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

In [14]:
# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [15]:
# Create a random forest regression.
rf_model = RandomForestClassifier(n_estimators=250, random_state=27)

In [16]:
# Fit the model (this will take a while)
rf_model = rf_model.fit(X_train_scaled, y_train)

### Make Predictions

In [17]:
# Make predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

### Evaluate Model

In [18]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [19]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3581,2434
Actual 1,2391,3961


Accuracy Score : 0.6098487911377052
Classification Report
              precision    recall  f1-score   support

         0.0       0.60      0.60      0.60      6015
         1.0       0.62      0.62      0.62      6352

    accuracy                           0.61     12367
   macro avg       0.61      0.61      0.61     12367
weighted avg       0.61      0.61      0.61     12367



## Imprortance of Features

In [20]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2811759630141477, 'price'),
 (0.1937730445566925, 'points'),
 (0.19357655153895456, 'vintage_year'),
 (0.006870942212401807, 'taster_name_unknown'),
 (0.005276694646382002, 'variety_Pinot Noir'),
 (0.005259363538133585, 'variety_Cabernet Sauvignon'),
 (0.0049988813138138405, 'variety_Red Blend'),
 (0.004455894195296706, 'taster_name_Virginie Boone'),
 (0.004410620826412446, 'province_California'),
 (0.004177179634857395, 'variety_Merlot'),
 (0.0038827698898985254, 'variety_Chardonnay'),
 (0.0037794385959592225, 'variety_Bordeaux-style Red Blend'),
 (0.0034663348799981422, 'style_red'),
 (0.0033314631589495165, 'variety_Sauvignon Blanc'),
 (0.003322074264849806, 'region_1_California'),
 (0.002881127051915947, 'taster_name_Matt Kettmann'),
 (0.0028671817907746926, 'style_white'),
 (0.002849675094877142, 'region_1_Napa Valley'),
 (0.002821261645407669, 'taster_name_Anne Krebiehl\xa0MW'),
 (0.002624347383853769, 'country_US'),
 (0.002613955264298594, 'variety_Riesling'),
 (0.0025826923

## Predict 'points_category'

In [21]:
cleanwine_df.columns

Index(['points', 'price', 'vintage_year', 'country_Argentina',
       'country_Armenia', 'country_Australia', 'country_Austria',
       'country_Brazil', 'country_Bulgaria', 'country_Canada',
       ...
       'variety_White Blend', 'variety_White Riesling', 'variety_Zinfandel',
       'style_red', 'style_rosé', 'style_white', 'rating_category_average',
       'rating_category_excellent', 'rating_category_good',
       'rating_category_very good'],
      dtype='object', length=1822)

In [22]:
# Create our features for the model
X_cat = cleanwine_df.drop(columns = ['rating_category_excellent'], axis = 1)

# Create the target
y_cat = cleanwine_df[['rating_category_excellent']]

In [23]:
# Splitting into Train and Test sets.
X_cat_train, X_cat_test, y_cat_train, y_cat_test = train_test_split(X_cat, y_cat, random_state=27)

In [24]:
# Fitting the Standard Scaler with the training data.
X_cat_scaler = scaler.fit(X_cat_train)

In [25]:
# Scaling the data.
X_cat_train_scaled = X_cat_scaler.transform(X_cat_train)
X_cat_test_scaled = X_cat_scaler.transform(X_cat_test)

In [26]:
# Create a random forest regression.
rf_cat_model = RandomForestClassifier(n_estimators=250, random_state=27)

In [27]:
# Fit the model (this will also take a while)
rf_cat_model = rf_cat_model.fit(X_cat_train_scaled, y_cat_train)

  return fit_method(estimator, *args, **kwargs)


In [28]:
# Make predictions using the testing data.
cat_predictions = rf_cat_model.predict(X_cat_test_scaled)

In [29]:
# Evaluate the model using the r^2 test
r2_score(y_cat_test, cat_predictions)

0.746883221919617

In [30]:
# Evaluate the model using the mean squared error test
mean_squared_error(y_cat_test, cat_predictions)

0.005013341958437778

In [31]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_cat_test, cat_predictions)

In [32]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("------------------------------------")
print("Classification Report")
print(classification_report(y_cat_test, cat_predictions))
print("------------------------------------")
print("r2 Score")
print(r2_score(y_cat_test, cat_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,3581,2434
Actual 1,2391,3961


Accuracy Score : 0.9949866580415623
------------------------------------
Classification Report
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00     12117
         1.0       0.99      0.76      0.86       250

    accuracy                           0.99     12367
   macro avg       0.99      0.88      0.93     12367
weighted avg       0.99      0.99      0.99     12367

------------------------------------
r2 Score
0.746883221919617
