In [1]:
# Set Dependencies
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Load the raw wine-review dataset and preview the first rows
csv_path = "wine-reviews/winemag-data-130k-v2.csv"
df1 = pd.read_csv(csv_path)
df1.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [22]:
# Select the rows whose description occurs more than once in the dataset.
# NOTE(review): keep=False flags every member of a duplicate group, so this keeps ONLY
# the repeated reviews and discards all unique ones. If the goal was to remove
# duplicates, df1.drop_duplicates(subset='description') is the usual form — confirm
# the intent before changing it, since it alters the dataset size for every later cell.
duplicated_description = df1.duplicated('description', keep=False)
parsed_data = df1[duplicated_description].copy()
parsed_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20026 entries, 9 to 129913
Data columns (total 14 columns):
Unnamed: 0               20026 non-null int64
country                  20018 non-null object
description              20026 non-null object
designation              14162 non-null object
points                   20026 non-null int64
price                    18814 non-null float64
province                 20018 non-null object
region_1                 16648 non-null object
region_2                 7498 non-null object
taster_name              17362 non-null object
taster_twitter_handle    16482 non-null object
title                    20026 non-null object
variety                  20026 non-null object
winery                   20026 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 2.3+ MB


In [4]:
# Discard rows that are missing any of the fields used for modelling.
# (Wider alternative that was tried: also require "region_1" and "winery".)
required_columns = ['description', 'points', 'price', 'country', "variety"]
parsed_data = parsed_data.dropna(subset=required_columns)

In [5]:
# Narrow the frame to the columns used for modelling.
# (Wider alternative that was tried: also keep "region_1" and "winery".)
model_columns = ['description', 'points', 'price', 'country', "variety"]
df2 = parsed_data[model_columns]
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18806 entries, 9 to 129913
Data columns (total 5 columns):
description    18806 non-null object
points         18806 non-null int64
price          18806 non-null float64
country        18806 non-null object
variety        18806 non-null object
dtypes: float64(1), int64(1), object(3)
memory usage: 881.5+ KB


Unnamed: 0,description,points,price,country,variety
9,This has great depth of flavor with its fresh ...,87,27.0,France,Pinot Gris
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0,US,Cabernet Sauvignon
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0,France,Gewürztraminer
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0,US,Cabernet Sauvignon
14,Building on 150 years and six generations of w...,87,12.0,US,Chardonnay


In [6]:
# Distinct country values present in the modelling frame
df2['country'].unique()

array(['France', 'US', 'Argentina', 'Italy', 'Chile', 'Germany',
       'Portugal', 'South Africa', 'Hungary', 'Australia', 'Spain',
       'Austria', 'New Zealand', 'Romania', 'Israel', 'Turkey', 'Greece',
       'Slovenia', 'Croatia', 'Georgia', 'England', 'Canada', 'Moldova',
       'Czech Republic', 'Bulgaria', 'Uruguay', 'Morocco', 'Mexico',
       'Lebanon', 'Brazil', 'Serbia', 'Switzerland', 'India',
       'Luxembourg', 'Cyprus', 'Macedonia'], dtype=object)

In [7]:
# Distinct grape varieties present in the modelling frame
df2['variety'].unique()

array(['Pinot Gris', 'Cabernet Sauvignon', 'Gewürztraminer', 'Chardonnay',
       'Malbec', 'Merlot', 'Pinot Noir', 'Gamay', 'Red Blend', 'Inzolia',
       'Riesling', 'Sauvignon Blanc', 'Monica',
       'Bordeaux-style White Blend', 'Grillo', 'Syrah', 'Portuguese Red',
       'Sangiovese', 'Tannat-Cabernet', 'Cabernet Franc', 'White Blend',
       'G-S-M', 'Zinfandel', 'Rhône-style Red Blend', 'Fumé Blanc',
       'Furmint', 'Pinot Bianco', 'Syrah-Viognier', 'Shiraz', 'Rosé',
       'Tempranillo', 'Sparkling Blend', 'Grüner Veltliner',
       'Grenache Blanc', 'Nebbiolo', 'Cortese', 'Champagne Blend',
       'Pinot Blanc', 'Glera', 'Pinot Grigio', 'Bonarda', 'Aglianico',
       'Bordeaux-style Red Blend', 'Silvaner', 'Colombard',
       'Tempranillo Blend', 'Portuguese White', 'Tinta Miúda',
       'Corvina, Rondinella, Molinara', "Nero d'Avola", 'Insolia',
       'Papaskarasi', 'Tannat-Syrah', 'Petite Sirah', 'Pinot Nero',
       'Sherry', 'Greco', 'Viura', 'Viognier', 'Sauvignon', '

In [8]:
# Score buckets (lower bound inclusive, upper bound exclusive):
#   1 -> below 84      (Under Average wines)
#   2 -> 84 up to 88   (Average wines)
#   3 -> 88 up to 92   (Good wines)
#   4 -> 92 up to 96   (Very Good wines)
#   5 -> 96 and above  (Excellent wines)

def transform_points_simplified(points):
    """Map a wine score to its bucket number (1-5).

    Parameters
    ----------
    points : number
        The review score of a wine.

    Returns
    -------
    int
        1 for scores below 84, 2 below 88, 3 below 92, 4 below 96, otherwise 5.
    """
    # Walk the exclusive upper bounds in ascending order; the first one the
    # score falls under decides the bucket.
    for bucket, upper_bound in enumerate((84, 88, 92, 96), start=1):
        if points < upper_bound:
            return bucket
    return 5

# Bucket every score and store the result in a new "points_simplified" column
simplified_scores = df2['points'].apply(transform_points_simplified)
df2 = df2.assign(points_simplified=simplified_scores)
df2.head()

Unnamed: 0,description,points,price,country,variety,points_simplified
9,This has great depth of flavor with its fresh ...,87,27.0,France,Pinot Gris,2
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0,US,Cabernet Sauvignon,2
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0,France,Gewürztraminer,2
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0,US,Cabernet Sauvignon,2
14,Building on 150 years and six generations of w...,87,12.0,US,Chardonnay,2


In [9]:
# Build the text that is fed to the vectorizer: description, country and variety
# joined with single spaces.
separator = ' '
df2['finaltextinput'] = df2['description'] + separator + df2['country'] + separator + df2['variety']
# Wider alternative that was tried (needs region_1 and winery to be kept in df2):
# df2["finaltextinput2"] = df2["description"]+ " " +df2["country"]+ " " +df2["region_1"]+ " " +df2["variety"]+ " " +df2["winery"]
df2.head()

Unnamed: 0,description,points,price,country,variety,points_simplified,finaltextinput
9,This has great depth of flavor with its fresh ...,87,27.0,France,Pinot Gris,2,This has great depth of flavor with its fresh ...
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0,US,Cabernet Sauvignon,2,"Soft, supple plum envelopes an oaky structure ..."
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0,France,Gewürztraminer,2,"This is a dry wine, very spicy, with a tight, ..."
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0,US,Cabernet Sauvignon,2,"Slightly reduced, this wine offers a chalky, t..."
14,Building on 150 years and six generations of w...,87,12.0,US,Chardonnay,2,Building on 150 years and six generations of w...


In [None]:
# Create a SQLAlchemy engine for the local SQLite file wines.sqlite, used below to
# store df2 for reference (echo=False keeps SQL statement logging off)
from sqlalchemy import create_engine
engine = create_engine('sqlite:///wines.sqlite', echo=False)

In [None]:
# Write df2 to the "wines" table of the SQLite database behind `engine`;
# if_exists="replace" overwrites a table of that name left by an earlier run
df2.to_sql('wines', con=engine, if_exists="replace")

In [None]:
def lower_all(input_string):
    """Return ``input_string`` converted to lowercase."""
    lowered = input_string.lower()
    return lowered


# Quick demonstration of the helper
Example1 = "This is a Great wine"

print(lower_all(Example1))

In [None]:
# df2["finaltextinput"] = df2["finaltextinput"].apply(lower_all)
# df2.iloc[0,8]

### Define variables ###

In [10]:
# Inputs for the RandomForestClassifier:
# text feature (X11), target score bucket (y11) and the extra numeric feature, price (X21)
X11 = df2['finaltextinput']
y11 = df2['points_simplified']

X21 = df2['price']

# Inputs for the RandomForestRegressor:
# text feature (X12), target price (y12) and the extra numeric feature, score bucket (X22)
# X12 = df2['finaltextinput2']
X12 = df2['finaltextinput']
y12 = df2['price']

X22 = df2['points_simplified']

### Random Forest Classifier ###

In [11]:
# Learn the bag-of-words vocabulary from the classifier's text input.
# `fit` returns the vectorizer itself, so `vec_rfc` and `vectorizer` refer to the
# same fitted object.
vectorizer = CountVectorizer()
vectorizer.fit(X11)
vec_rfc = vectorizer

In [12]:
# Persist the fitted classifier vectorizer so it can be reused without refitting
with open('winevect_rfc_model.pickle', 'wb') as vectorizer_file:
    pickle.dump(vec_rfc, vectorizer_file)

In [None]:
# with open('winevect_model.pickle', 'rb') as handle:
#     loaded_vec = pickle.load(handle)

In [None]:
# X = loaded_vec.transform(X11)

In [13]:
# Encode the classifier text with the fitted vectorizer (sparse count matrix)
X = vectorizer.transform(X11)

In [None]:
# print(X)

In [14]:
# Percentage of non-zero cells in X (computed here but not displayed)
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))

In [15]:
# View the sparse count matrix as a DataFrame (one column per matrix column)
pd.DataFrame.sparse.from_spmatrix(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10816,10817,10818,10819,10820,10821,10822,10823,10824,10825
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Give the price Series a fresh 0..n-1 index so it lines up with the rows of the
# count-matrix DataFrame when the two are joined. Rebinding (instead of
# inplace=True) avoids mutating a Series that was selected from df2.
X21 = X21.reset_index(drop=True)

In [17]:
# Append the price column to the bag-of-words features (joined on the row index)
text_features = pd.DataFrame.sparse.from_spmatrix(X)
Z1 = text_features.join(X21)
Z1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10817,10818,10819,10820,10821,10822,10823,10824,10825,price
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,27.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12.0


In [None]:
# import numpy as np 

# variable = np.array(Z[:1])

# Z[:1]

# len(variable[0])

# #10763

# variable[0][10762]

In [None]:
#Z.tail
# X21.head()

In [18]:
# Hold out 10% of the rows for evaluation and fit the classifier on the rest
X_train, X_test, y_train, y_test = train_test_split(
    Z1, y11, test_size=0.1, random_state=101
)
rfc = RandomForestClassifier(verbose=True)
rfc.fit(X_train, y_train)

# Persist the fitted classifier
with open('wine_rfc_model.pickle', 'wb') as model_file:
    pickle.dump(rfc, model_file)

# To reload it later:
# with open('wine_rfc_model.pickle', 'rb') as model_file:
#     loaded_rfc = pickle.load(model_file)

# Predict the score bucket of the held-out rows
predictions = rfc.predict(X_test)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [19]:
# Score of the classifier on the held-out rows.
# NOTE(review): parsed_data was built from rows whose description is duplicated, so
# identical rows may sit on both sides of the split — this score may be optimistic; confirm.
testing_score = rfc.score(X_test, y_test)
print(testing_score)

0.950026581605529


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [None]:
# rfc.feature_importances_[10762]

In [None]:
# def user_input(input_desc, input_country, input_price)

# input_desc = lower_all(input_desc)
# input_country = lower_all(input_country)
# input_price = [float(input_price)]


# vectorizer.transform(input_desc)

In [None]:
# X_example = vectorizer.transform(["this wine is ripe, smooth, and delectable. enjoy its lovely aromas and flavors of ripe fruit and oak with a well-seasoned roast leg of lamb.. us"])
# input_price = [float(90)]
# X2_example = pd.Series(input_price, name = "price")

In [None]:
# print(X_example)

In [None]:
# pd.DataFrame.sparse.from_spmatrix(X_example)

In [None]:
# X2.reset_index(drop = True, inplace = True)

In [None]:
# print(X2)

In [None]:
# Z_example =pd.DataFrame.sparse.from_spmatrix(X_example).join(X2_example)

In [None]:
# X2_example 


In [None]:
# rfc.predict(Z_example)

In [None]:
# with open('wine_rfc_model.pickle', 'rb') as handle:
#     loaded_rfc = pickle.load(handle)


In [None]:
# loaded_rfc.predict(Z_example)

### Random Forest Regressor ###

In [None]:
# Fit a fresh vocabulary on the regressor's text input and encode it in one step.
# `fit_transform` already fits the vectorizer, so the separate `fit` call was redundant.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X12)

# Named handle to the fitted regressor vectorizer; this is the object that gets pickled.
vec_rfr = vectorizer

In [None]:
# Persist the fitted regressor vectorizer. `vectorizer` is the CountVectorizer
# fitted on X12 in the previous cell (the name `vec_rfr` was never assigned there,
# so dumping it raised NameError).
with open('winevect_rfr_model.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle)

In [None]:
# (rows, columns) of the regressor's count matrix
X.shape

In [None]:
# Summarise how sparse the regressor's count matrix is
n_rows, n_cols = X.shape
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero occurrences: ', X.nnz)

# Share of cells that hold a non-zero count, as a percentage
density = 100.0 * X.nnz / (n_rows * n_cols)
print(f'Density: {density}')

In [None]:
# View the regressor's sparse count matrix as a DataFrame
pd.DataFrame.sparse.from_spmatrix(X)

In [None]:
# Give the score-bucket Series a fresh 0..n-1 index so it lines up with the rows of
# the count-matrix DataFrame when the two are joined. Rebinding (instead of
# inplace=True) avoids mutating a Series that was selected from df2.
X22 = X22.reset_index(drop=True)

In [None]:
# Append the score-bucket column to the bag-of-words features (joined on the row index)
text_features = pd.DataFrame.sparse.from_spmatrix(X)
Z2 = text_features.join(X22)
Z2.head()

In [None]:
# Split the regressor features and price target into training and testing sets.
# No test_size is passed, so train_test_split's default proportion applies.
# This rebinds X_train / X_test / y_train / y_test from the classifier section.

X_train, X_test, y_train, y_test = train_test_split(Z2, y12, random_state=42)

In [None]:
# Training the model
# (RandomForestRegressor comes from the dependency cell at the top of the notebook;
#  n_jobs=-1 lets scikit-learn use all available cores)

rfr = RandomForestRegressor(verbose=True, n_jobs=-1)
rfr.fit(X_train, y_train)

# Save the fitted model

with open('wine_rfr_model.pickle', 'wb') as handle:
    pickle.dump(rfr, handle)

# To reload it later (pickle.load takes only the open file):
# with open('wine_rfr_model.pickle', 'rb') as handle:
#     rfr = pickle.load(handle)


In [None]:
# Testing the model: predict prices for the held-out rows
predictions = rfr.predict(X_test)

In [None]:
# Score the regressor on the rows it was fitted on and on the held-out rows;
# a large gap between the two suggests overfitting
training_score_vect = rfr.score(X_train, y_train)
testing_score_vect = rfr.score(X_test, y_test)

print(f"Training Score with Vectorizing: {training_score_vect}")
print(f"Testing Score with Vectorizing: {testing_score_vect}")

In [None]:
# Plot the residuals (predicted price minus actual price) for the training and testing data

# Predict once per split and reuse the result for both the x values and the residuals
train_predictions = rfr.predict(X_train)
test_predictions = rfr.predict(X_test)

plt.scatter(train_predictions, train_predictions - y_train, c="blue", label="Training Data (Vect)")
plt.scatter(test_predictions, test_predictions - y_test, c="orange", label="Testing Data (Vect)")
plt.legend()
plt.hlines(y=0, xmin=y_test.min(), xmax=y_test.max())
plt.title("Residual Plot")
plt.show()

In [None]:
# Build a single example: the vectorized description text plus its score bucket.
# The Series is named "points_simplified" so the joined column carries the same label
# as the score-bucket column in the training features (Z2).
X_example = vectorizer.transform(["this wine is ripe, smooth, and delectable. enjoy its lovely aromas and flavors of ripe fruit and oak with a well-seasoned roast leg of lamb. us"])
input_points = [float(3)]
X22_example = pd.Series(input_points, name = "points_simplified")

In [None]:
# View the encoded example as a one-row DataFrame
pd.DataFrame.sparse.from_spmatrix(X_example)

In [None]:
# Ensure the example Series has a 0-based index matching the single encoded row.
# Rebinding replaces the inplace=True mutation.
X22_example = X22_example.reset_index(drop=True)

In [None]:
# Append the example's score bucket to its encoded text features
example_text_features = pd.DataFrame.sparse.from_spmatrix(X_example)
Z2_example = example_text_features.join(X22_example)

In [None]:
# Predict the price of the example wine
rfr.predict(Z2_example)