In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import re
import zipfile


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.naive_bayes import ComplementNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from imblearn.pipeline import Pipeline as ImbalancedPipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import r2_score



import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)

In [2]:
# Opening zipfile and reading it to a dataframe
with zipfile.ZipFile('wine_reviews_clean.zip', 'r') as zipf:
    zipf.extractall('')

df = pd.read_csv('wine_reviews_clean.csv')

os.remove('wine_reviews_clean.csv')

df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421017 entries, 0 to 421016
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Name                421017 non-null  object 
 1   Variety             421017 non-null  object 
 2   Country             421017 non-null  object 
 3   Region              386109 non-null  object 
 4   Zone                253609 non-null  object 
 5   Attr_1              417986 non-null  object 
 6   Attr_2              76720 non-null   object 
 7   Winemaker_notes     95234 non-null   object 
 8   Review              93840 non-null   object 
 9   Alcohol_percentage  420952 non-null  float64
 10  Alcohol_vol         420952 non-null  float64
 11  Avg_rating          45970 non-null   float64
 12  N_ratings           45970 non-null   float64
 13  Price_Feature       419607 non-null  float64
 14  Year                418078 non-null  float64
dtypes: float64(6), object(9)
memory us

Unnamed: 0,Name,Variety,Country,Region,Zone,Attr_1,Attr_2,Winemaker_notes,Review,Alcohol_percentage,Alcohol_vol,Avg_rating,N_ratings,Price_Feature,Year
0,M. Chapoutier La Combe Pilate Esteban Brut Nature,Vintage Sparkling Wine,France,Rhone,,Sparkling & Champagne,Green Wine,,,10.0,750.0,5.0,15.0,49.99,2020.0
1,Vinos de Arganza Alvarez de Toledo Godello,Godello,Spain,,,White Wine,,This wine has a bright yellow color with fresh...,,0.0,750.0,5.0,13.0,10.99,2020.0
2,Louis Latour Vosne-Romanee,Pinot Noir,France,Burgundy,Cote d'Or,Red Wine,Boutique,,"Toasty, spicy aromas introduce this rich, silk...",0.0,750.0,5.0,20.0,119.99,2019.0
3,Chateau Bouscaut,Bordeaux Red Blends,France,Bordeaux,Pessac-Leognan,Red Wine,,"Blend: 61% Merlot, 33% Cabernet Sauvignon, 6% ...",This estate continues to produce brilliant win...,14.5,750.0,5.0,17.0,44.99,2020.0
4,Brandini Barolo La Morra,Nebbiolo,Italy,Piedmont,Barolo,Red Wine,Green Wine,"Made with organically farmed fruit, the La Mor...","Aromas of cherries, strawberries and tar with ...",14.0,750.0,5.0,19.0,63.99,2015.0


In [3]:
# Inspecting the numerical columns to check whether is need to engineer this features
df3 = df.copy()
df3.select_dtypes('float64').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421017 entries, 0 to 421016
Data columns (total 6 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Alcohol_percentage  420952 non-null  float64
 1   Alcohol_vol         420952 non-null  float64
 2   Avg_rating          45970 non-null   float64
 3   N_ratings           45970 non-null   float64
 4   Price_Feature       419607 non-null  float64
 5   Year                418078 non-null  float64
dtypes: float64(6)
memory usage: 19.3 MB


In [4]:
# Check for outliers
def get_outliers(data, threshold=3):
    z_scores = np.abs(stats.zscore(data, nan_policy='omit'))
    outliers = np.where(z_scores > threshold, np.nan, 0)
    return outliers

outliers = df3.select_dtypes('float64').apply(get_outliers, axis=0)

outliers_df = df3.join(outliers, rsuffix='_IsOutlier')

outliers_df.iloc[:, -6:].notna().sum()

Alcohol_percentage_IsOutlier    420998
Alcohol_vol_IsOutlier           415141
Avg_rating_IsOutlier            420717
N_ratings_IsOutlier             420362
Price_Feature_IsOutlier         412648
Year_IsOutlier                  418243
dtype: int64

In [5]:
# Eliminate this outliers
clean_df = outliers_df.dropna(subset=outliers_df.columns.tolist()[-6:])
clean_df = clean_df.drop(outliers_df.columns.tolist()[-6:], axis=1)

clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 405906 entries, 0 to 421016
Data columns (total 15 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Name                405906 non-null  object 
 1   Variety             405906 non-null  object 
 2   Country             405906 non-null  object 
 3   Region              371395 non-null  object 
 4   Zone                241475 non-null  object 
 5   Attr_1              403100 non-null  object 
 6   Attr_2              68063 non-null   object 
 7   Winemaker_notes     89805 non-null   object 
 8   Review              84430 non-null   object 
 9   Alcohol_percentage  405841 non-null  float64
 10  Alcohol_vol         405841 non-null  float64
 11  Avg_rating          44313 non-null   float64
 12  N_ratings           44313 non-null   float64
 13  Price_Feature       405168 non-null  float64
 14  Year                403193 non-null  float64
dtypes: float64(6), object(9)
memory usage: 

## Numerical Regression

Experiment to try and predict price_feature with linear regression, using only the numerical features for training.

In [7]:
df7 = clean_df.copy()
df7 = df7.iloc[:, 9:]
df7['Price'] = df7['Price_Feature']

exp1 = df7.drop(['Price_Feature'], axis=1).dropna()
exp1.head()

X = exp1.iloc[:,:5]
y = exp1.iloc[:,5]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)


preprocessor = ColumnTransformer(
    transformers=[
        ('Alcohol_percentage', MinMaxScaler(), ['Alcohol_percentage']),
        ('Alcohol_vol', MinMaxScaler(), ['Alcohol_vol']),
        ('Year', MinMaxScaler(), ['Year'])
    ],
    remainder='passthrough',  # Pass through other columns as is
    verbose=True
)

# Cross validation
kf = KFold(n_splits=5)

# Create pipeline applying thew preprocessor
reg_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regression', LinearRegression())
])

# Perform k-fold cross-validation
r2_scores = []
for train_index, test_index in kf.split(X_train, y_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Fit and evaluate the model for each fold
    reg_model.fit(X_fold_train, y_fold_train)
    y_fold_pred = reg_model.predict(X_fold_val)
    r2_fold = r2_score(y_fold_val, y_fold_pred)
    r2_scores.append(r2_fold)

# Calculate the mean R^2 score across folds
mean_r2 = np.mean(r2_scores)

print(f'Mean R^2 score for linear regression model with k-fold cross-validation: {mean_r2}')


[ColumnTransformer]  (1 of 4) Processing Alcohol_percentage, total=   0.0s
[ColumnTransformer] ... (2 of 4) Processing Alcohol_vol, total=   0.0s
[ColumnTransformer] .......... (3 of 4) Processing Year, total=   0.0s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s
[ColumnTransformer]  (1 of 4) Processing Alcohol_percentage, total=   0.0s
[ColumnTransformer] ... (2 of 4) Processing Alcohol_vol, total=   0.0s
[ColumnTransformer] .......... (3 of 4) Processing Year, total=   0.0s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s
[ColumnTransformer]  (1 of 4) Processing Alcohol_percentage, total=   0.0s
[ColumnTransformer] ... (2 of 4) Processing Alcohol_vol, total=   0.0s
[ColumnTransformer] .......... (3 of 4) Processing Year, total=   0.0s
[ColumnTransformer] ..... (4 of 4) Processing remainder, total=   0.0s
[ColumnTransformer]  (1 of 4) Processing Alcohol_percentage, total=   0.0s
[ColumnTransformer] ... (2 of 4) Processing Alcohol_vol, tota

When the $r^2 \approx 0.1947 = 19.47 \%$ it indicates that our model explains only that small of a portion of the variance in  the target variable, i.e. it's notr very effective in catching underlying patterns of the data and making accurate predictions.

This kinda makes sense since we know that our numerical features are not heavily correlated. Therefore, we should try to add the categorical features and see if it makes a difference.

##  Numerical and Categorical

In [36]:
df8 = clean_df.copy()
df8['Price'] = df8['Price_Feature']

# Remove text columns and Price_feature + drop n/a rows

exp2 = df8.drop(['Name', 'Winemaker_notes', 'Review', 'Attr_2', 'Price_Feature'], axis=1).dropna()  # -> shape(9857,12)
categorical_cols = ['Variety', 'Country', 'Region', 'Attr_1']
numerical_cols = ['Alcohol_percentage', 'Alcohol_vol', 'Avg_rating', 'N_ratings', 'Year']



X2 = exp2.iloc[:,:10]
y2 = exp2.iloc[:,10]

categorical_cols = ['Variety', 'Country', 'Region', 'Attr_1']
numerical_cols = ['Alcohol_percentage', 'Alcohol_vol', 'Avg_rating', 'N_ratings', 'Year']


# Create a ColumnTransformer to apply transformations to different column types
preprocessor = ColumnTransformer(
    transformers=[
        ('Variety', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Variety']),
        ('Country', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Country']),
        ('Region', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Region']),
        ('Attr_1', OneHotEncoder(sparse=False, handle_unknown='ignore'), ['Attr_1']),
        ('Numericals', MinMaxScaler(), numerical_cols)
    ],
    remainder='passthrough',
    verbose=True
)

# Create a pipeline including the preprocessing and the linear regression model
reg_model = Pipeline([
    ('preprocessor', preprocessor),
    ('regression', LinearRegression())
])

# Split your data
X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.20, random_state=0)



# Initialize KFold for cross-validation
kf = KFold(n_splits=5)

r2_scores = []
for train_index, test_index in kf.split(X_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[test_index]
    
    # Fit and evaluate the model for each fold
    reg_model.fit(X_fold_train, y_fold_train)
    y_fold_pred = reg_model.predict(X_fold_val)
    r2_fold = r2_score(y_fold_val, y_fold_pred)
    r2_scores.append(r2_fold)

# Calculate the mean R^2 score across folds
mean_r2 = np.mean(r2_scores)

print(f'Mean R^2 score for linear regression model with k-fold cross-validation: {mean_r2}')



[ColumnTransformer] ....... (1 of 6) Processing Variety, total=   0.0s
[ColumnTransformer] ....... (2 of 6) Processing Country, total=   0.0s
[ColumnTransformer] ........ (3 of 6) Processing Region, total=   0.0s
[ColumnTransformer] ........ (4 of 6) Processing Attr_1, total=   0.0s
[ColumnTransformer] .... (5 of 6) Processing Numericals, total=   0.0s
[ColumnTransformer] ..... (6 of 6) Processing remainder, total=   0.0s


ValueError: could not convert string to float: 'Asti'

In [34]:
exp2[categorical_cols].info()

<class 'pandas.core.frame.DataFrame'>
Index: 25748 entries, 2 to 405486
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Variety  25748 non-null  object
 1   Country  25748 non-null  object
 2   Region   25748 non-null  object
 3   Attr_1   25748 non-null  object
dtypes: object(4)
memory usage: 1005.8+ KB


In [16]:
X2.head()

Unnamed: 0,Variety,Country,Region,Zone,Attr_1,Alcohol_percentage,Alcohol_vol,Avg_rating,N_ratings,Year
2,Pinot Noir,France,Burgundy,Cote d'Or,Red Wine,0.0,750.0,5.0,20.0,2019.0
3,Bordeaux Red Blends,France,Bordeaux,Pessac-Leognan,Red Wine,14.5,750.0,5.0,17.0,2020.0
4,Nebbiolo,Italy,Piedmont,Barolo,Red Wine,14.0,750.0,5.0,19.0,2015.0
5,Muscat,Italy,Piedmont,Asti,White Wine,5.0,750.0,5.0,13.0,2014.0
7,Rhone Red Blends,France,Rhone,Chateauneuf-du-Pape,Red Wine,0.0,750.0,5.0,6.0,2017.0
