In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sb
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

plt.style.set = "ggplot2"

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pavli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
dataset = pd.read_csv("dataset/winemag-data-130k-v2.csv")

In [5]:
dataset['designation'] = dataset['designation'].fillna('Unknown')
wine = dataset.drop(['Unnamed: 0','region_1','region_2','taster_twitter_handle'], axis=1)
Q1 = wine['price'].quantile(0.25)
Q3 = wine['price'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

wine_filtered = wine[(wine['price'] >= lower_bound) & (wine['price'] <= upper_bound)]

wine_filtered["taster_name"] = wine_filtered['taster_name'].fillna("Unknown")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  wine_filtered["taster_name"] = wine_filtered['taster_name'].fillna("Unknown")


In [6]:
wine_filtered.shape

(113734, 10)

In [7]:
wine_filtered = wine_filtered.reset_index(drop=True)
wine_filtered_desc = wine_filtered
wine_filtered = wine_filtered.drop("description", axis=1)

In [8]:
wine_filtered_desc.shape

(113734, 10)

In [9]:
corpus = []
for i in range(0, len(wine_filtered)):
    description = re.sub('[^a-zA-Z]', ' ', wine_filtered_desc['description'][i])
    description = description.lower().split()
    ps = PorterStemmer()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    description = [ps.stem(word) for word in description if not word in set(all_stopwords)]
    description = ' '.join(description)
    corpus.append(description)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 2000)
words_array = cv.fit_transform(corpus).toarray()
words_array.shape

(113734, 2000)

In [11]:
words_df = pd.DataFrame(words_array)
X = pd.concat([words_df.reset_index(drop=True), wine_filtered.reset_index(drop=True)], axis=1)
X.shape

(113734, 2009)

In [12]:
Y = wine_filtered["points"]
X = X.drop("points", axis=1)
X_new = X.drop(["winery", "designation", "title"], axis=1)
X_new.shape

(113734, 2005)

In [13]:
X_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1995,1996,1997,1998,1999,country,price,province,taster_name,variety
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,Portugal,15.0,Douro,Roger Voss,Portuguese Red
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,US,14.0,Oregon,Paul Gregutt,Pinot Gris
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,US,13.0,Michigan,Alexander Peartree,Riesling
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,US,65.0,Oregon,Paul Gregutt,Pinot Noir
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,Spain,15.0,Northern Spain,Michael Schachner,Tempranillo-Merlot
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,Germany,28.0,Mosel,Anna Lee C. Iijima,Riesling
113730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,US,75.0,Oregon,Paul Gregutt,Pinot Noir
113731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,France,30.0,Alsace,Roger Voss,Gewürztraminer
113732,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,France,32.0,Alsace,Roger Voss,Pinot Gris


In [14]:
pd.DataFrame(X_new).columns.astype(str)
#drop 5 columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1995', '1996', '1997', '1998', '1999', 'country', 'price', 'province',
       'taster_name', 'variety'],
      dtype='object', length=2005)

In [15]:
X_new.columns = pd.DataFrame(X_new).columns.astype(str)

In [16]:
X_new.shape

(113734, 2005)

In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(categories='auto', handle_unknown = 'ignore'), ["country", "province", "taster_name", "variety"])], remainder='passthrough')
X_new = np.array(ct.fit_transform(X_new))
X_new.shape

(113734, 3179)

In [18]:
X_new.shape

(113734, 3179)

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_new, Y, test_size=0.25, random_state=1)
from xgboost import XGBRegressor
regressor = XGBRegressor(enable_categorical=True)
regressor.fit(X_train, y_train)
y_hat = regressor.predict(X_test)

In [20]:
X_train.shape

(85300, 3179)

In [21]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
print("r2: ", r2_score(y_train,regressor.predict(X_train)), "rmse: ", mean_squared_error(y_train, regressor.predict(X_train)))
print("r2: ", r2_score(y_test, y_hat), "rmse: ", mean_squared_error(y_test, regressor.predict(X_test)))

r2:  0.7511932253837585 rmse:  2.107327699661255
r2:  0.7012837529182434 rmse:  2.494298219680786


In [46]:
import pickle

with open("trained_model.pkl", 'wb') as file:  
    pickle.dump(regressor, file)

In [23]:
with open("vectorizer.pkl", "wb") as file:
    pickle.dump(cv, file)

with open("transformer.pkl", "wb") as file:
    pickle.dump(ct, file)

In [24]:
words_array.size

227468000

In [25]:
X_new.size

361560386

In [26]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Example new data row
new_data = {
    "country": "US",
    "description": "This wine has a fresh aroma of citrus and vanilla, with a smooth texture.",
    "designation": "Example Designation",
    "price": 35.0,
    "province": "California",
    "taster_name": "Roger Voss",
    "title": "Example Wine Title",
    "variety": "Chardonnay",
    "winery": "Example Winery",
}

# Convert to DataFrame
new_df = pd.DataFrame([new_data])

In [27]:
new_df.shape

(1, 9)

In [28]:
# **Step 1: Preprocess the description**
corpus2 = []
description2 = re.sub('[^a-zA-Z]', ' ', new_df['description'][0])
description2 = description2.lower().split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')  # Keeping "not" as in training
description2 = [ps.stem(word) for word in description2 if word not in set(all_stopwords)]
description2 = ' '.join(description2)
corpus2.append(description2)

In [29]:
corpus2

['wine fresh aroma citru vanilla smooth textur']

In [30]:
# Convert description to word vector
#words_array = cv.transform(corpus2).toarray()  # Use the same `cv` fitted in training
words_array2 = cv.transform(corpus2).toarray()  # Use the same `cv` fitted in training
#words_array2 = cv.transform([description2]).toarray()
words_df2 = pd.DataFrame(words_array2)

In [31]:
words_array2.shape

(1, 2000)

In [32]:
words_df2.shape

(1, 2000)

In [33]:
words_df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
X2 = pd.concat([words_df.reset_index(drop=True), new_df.reset_index(drop=True)], axis=1)
X2.shape

(113734, 2009)

In [35]:
X_new2 = X2.drop(["description", "designation", "title", "winery"], axis=1)
X_new2.shape

(113734, 2005)

In [36]:
X_new2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1995,1996,1997,1998,1999,country,price,province,taster_name,variety
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,US,35.0,California,Roger Voss,Chardonnay
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
113730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
113731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,
113732,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,,,,,


In [37]:
X_new2 = X_new2.fillna(0)
X_new2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1995,1996,1997,1998,1999,country,price,province,taster_name,variety
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,US,35.0,California,Roger Voss,Chardonnay
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113729,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
113730,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
113731,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0
113732,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0,0,0


In [38]:
pd.DataFrame(X_new2).columns.astype(str)

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '1995', '1996', '1997', '1998', '1999', 'country', 'price', 'province',
       'taster_name', 'variety'],
      dtype='object', length=2005)

In [39]:
X_new2.columns = pd.DataFrame(X_new2).columns.astype(str)

In [40]:
X_new2.shape

(113734, 2005)

In [41]:
X_new2.isnull().sum()

0              0
1              0
2              0
3              0
4              0
              ..
country        0
price          0
province       0
taster_name    0
variety        0
Length: 2005, dtype: int64

In [42]:
# **Step 2: Drop unnecessary columns**
#new_df = new_df.drop(["winery", "designation", "title", "description"], axis=1)

# **Step 3: One-hot encode categorical variables**
#new_df.columns = new_df.columns.astype(str)  # Convert column names to string (if needed)
X_new2_transformed = np.array(ct.transform(X_new2))  # Use the fitted `ct` transformer

In [43]:
X_new2_transformed.shape

(113734, 3179)

In [44]:
predicted_score = regressor.predict(X_new2_transformed)

print("Predicted Wine Score:", predicted_score[0])

Predicted Wine Score: 88.78049
