In [1]:
# Set Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [2]:
# Read in the csv
df1 = pd.read_csv("wine-reviews/winemag-data-130k-v2.csv")
df1.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [3]:
parsed_data = df1[df1.duplicated('description', keep=False)].copy()

In [44]:
parsed_data.dropna(subset=['description', 'points', 'price', 'country'], inplace=True)

In [45]:
df2 = parsed_data[['description','points','price', 'country']]
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18806 entries, 9 to 129913
Data columns (total 4 columns):
description    18806 non-null object
points         18806 non-null int64
price          18806 non-null float64
country        18806 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 734.6+ KB


Unnamed: 0,description,points,price,country
9,This has great depth of flavor with its fresh ...,87,27.0,France
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0,US
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0,France
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0,US
14,Building on 150 years and six generations of w...,87,12.0,US


In [6]:
# 1 -> Points 80 to 84 (Under Average wines)

# 2 -> Points 84 to 88 (Average wines)

# 3 -> Points 88 to 92 (Good wines)

# 4 -> Points 92 to 96 (Very Good wines)

# 5 -> Points 96 to 100 (Excellent wines)

#Transform method taking points as param
def transform_points_simplified(points):
    if points < 84:
        return 1
    elif points >= 84 and points < 88:
        return 2 
    elif points >= 88 and points < 92:
        return 3 
    elif points >= 92 and points < 96:
        return 4 
    else:
        return 5

#Applying transform method and assigning result to new column "points_simplified"
df2 = df2.assign(points_simplified = df2['points'].apply(transform_points_simplified))
df2.head()

Unnamed: 0,description,points,price,country,points_simplified
9,This has great depth of flavor with its fresh ...,87,27.0,France,2
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0,US,2
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0,France,2
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0,US,2
14,Building on 150 years and six generations of w...,87,12.0,US,2


In [7]:
df2['finaltextinput'] = df2['description'] + ' ' + df2['country']
df2.head()
df2.iloc[0,5]

"This has great depth of flavor with its fresh apple and pear fruits and touch of spice. It's off dry while balanced with acidity and a crisp texture. Drink now. France"

In [8]:
def lower_all(input_string):
    return input_string.lower()

Example1 = "This is a Great wine"

print(lower_all(Example1))

this is a great wine


In [9]:
df2["finaltextinput"] = df2["finaltextinput"].apply(lower_all)
df2.iloc[0,5]

"this has great depth of flavor with its fresh apple and pear fruits and touch of spice. it's off dry while balanced with acidity and a crisp texture. drink now. france"

In [32]:
X = df2['finaltextinput']
y = df2['points_simplified']
X2 = df2['price']
vectorizer = CountVectorizer()
vec = vectorizer.fit(X)

In [33]:
with open('winevect_model.pickle', 'wb') as handle:
    pickle.dump(vec, handle)

In [34]:
with open('winevect_model.pickle', 'rb') as handle:
    loaded_vec = pickle.load(handle)

In [35]:
X = loaded_vec.transform(X)

In [11]:
X = vectorizer.transform(X)

In [36]:
print(X)

  (0, 291)	1
  (0, 549)	3
  (0, 630)	1
  (0, 841)	1
  (0, 2479)	1
  (0, 2775)	1
  (0, 3040)	1
  (0, 3063)	1
  (0, 3773)	1
  (0, 3958)	1
  (0, 3983)	1
  (0, 4032)	1
  (0, 4344)	1
  (0, 4516)	1
  (0, 5016)	1
  (0, 5020)	1
  (0, 6444)	1
  (0, 6504)	2
  (0, 6505)	1
  (0, 6848)	1
  (0, 8880)	1
  (0, 9570)	1
  (0, 9617)	1
  (0, 9773)	1
  (0, 10522)	1
  :	:
  (18805, 6532)	1
  (18805, 6711)	1
  (18805, 6804)	1
  (18805, 7377)	1
  (18805, 7793)	1
  (18805, 8711)	1
  (18805, 8741)	1
  (18805, 9453)	1
  (18805, 9481)	1
  (18805, 9582)	2
  (18805, 9584)	1
  (18805, 9607)	1
  (18805, 9617)	1
  (18805, 9667)	1
  (18805, 9681)	1
  (18805, 9711)	2
  (18805, 10105)	1
  (18805, 10120)	1
  (18805, 10148)	1
  (18805, 10582)	1
  (18805, 10586)	1
  (18805, 10610)	1
  (18805, 10660)	1
  (18805, 10710)	2
  (18805, 10711)	1


In [13]:
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))

In [14]:
pd.DataFrame.sparse.from_spmatrix(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10752,10753,10754,10755,10756,10757,10758,10759,10760,10761
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18801,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18802,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18803,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18804,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X2.reset_index(drop = True, inplace = True)

In [16]:
Z =pd.DataFrame.sparse.from_spmatrix(X).join(X2)
Z.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10753,10754,10755,10756,10757,10758,10759,10760,10761,price
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,27.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12.0


In [17]:
import numpy as np 

variable = np.array(Z[:1])

Z[:1]

len(variable[0])

#10763

variable[0][10762]

27.0

In [18]:
#Z.tail
X2.head()


0    27.0
1    19.0
2    30.0
3    34.0
4    12.0
Name: price, dtype: float64

In [19]:
# Training the model
X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size=0.1, random_state=101)
rfc = RandomForestClassifier(verbose = True)
rfc.fit(X_train, y_train)

#save the model 

with open('wine_rfc_model.pickle', 'wb') as handle:
    pickle.dump(rfc, handle)

# with open('filename.pickle', 'rb') as handle:
#     b = pickle.load(handle)


# Testing the model
predictions = rfc.predict(X_test)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    6.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [20]:
testing_score = rfc.score(X_test, y_test)
print(testing_score)

0.9537480063795853


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [21]:
rfc.feature_importances_[10762]

0.029902170370806308

In [23]:
def user_input(input_desc, input_country, input_price):

input_desc = lower_all(input_desc)
input_country = lower_all(input_country)
input_price = [float(input_price)]


vectorizer.transform(input_desc)

IndentationError: expected an indented block (<ipython-input-23-6988b25d2c1a>, line 3)

In [24]:
X_example = vectorizer.transform(["this wine is ripe, smooth, and delectable. enjoy its lovely aromas and flavors of ripe fruit and oak with a well-seasoned roast leg of lamb.. us"])
input_price = [float(90)]
X2_example = pd.Series(input_price, name = "price")

In [25]:
print(X_example)

  (0, 549)	3
  (0, 690)	1
  (0, 3288)	1
  (0, 3777)	1
  (0, 4024)	1
  (0, 5006)	1
  (0, 5020)	1
  (0, 5251)	1
  (0, 5373)	1
  (0, 5596)	1
  (0, 6469)	1
  (0, 6504)	2
  (0, 7937)	2
  (0, 7970)	1
  (0, 8322)	1
  (0, 8718)	1
  (0, 9617)	1
  (0, 10148)	1
  (0, 10494)	1
  (0, 10582)	1
  (0, 10610)	1


In [26]:
pd.DataFrame.sparse.from_spmatrix(X_example)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10752,10753,10754,10755,10756,10757,10758,10759,10760,10761
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
X2.reset_index(drop = True, inplace = True)

In [28]:
print(X2)

0        27.0
1        19.0
2        30.0
3        34.0
4        12.0
         ... 
18801    20.0
18802    20.0
18803    19.0
18804    60.0
18805    44.0
Name: price, Length: 18806, dtype: float64


In [29]:
Z_example =pd.DataFrame.sparse.from_spmatrix(X_example).join(X2_example)

In [30]:
X2_example 


0    90.0
Name: price, dtype: float64

In [31]:
rfc.predict(Z_example)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


array([3], dtype=int64)

In [71]:
with open('wine_rfc_model.pickle', 'rb') as handle:
    loaded_rfc = pickle.load(handle)


In [72]:
loaded_rfc.predict(Z_example)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


array([3], dtype=int64)