In [2]:
# Set Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [3]:
# Read in the csv
df1 = pd.read_csv("wine-reviews/winemag-data-130k-v2.csv")

In [7]:
parsed_data = df1[df1.duplicated('description', keep=False)].copy()

In [8]:
parsed_data.dropna(subset=['description', 'points', 'price'], inplace=True)

In [9]:
df2 = parsed_data[['description','points','price']]
df2.info()
df2.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18814 entries, 9 to 129913
Data columns (total 3 columns):
description    18814 non-null object
points         18814 non-null int64
price          18814 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 587.9+ KB


Unnamed: 0,description,points,price
9,This has great depth of flavor with its fresh ...,87,27.0
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0
14,Building on 150 years and six generations of w...,87,12.0


In [11]:
# 1 -> Points 80 to 84 (Under Average wines)

# 2 -> Points 84 to 88 (Average wines)

# 3 -> Points 88 to 92 (Good wines)

# 4 -> Points 92 to 96 (Very Good wines)

# 5 -> Points 96 to 100 (Excellent wines)

#Transform method taking points as param
def transform_points_simplified(points):
    if points < 84:
        return 1
    elif points >= 84 and points < 88:
        return 2 
    elif points >= 88 and points < 92:
        return 3 
    elif points >= 92 and points < 96:
        return 4 
    else:
        return 5

#Applying transform method and assigning result to new column "points_simplified"
df2 = df2.assign(points_simplified = df2['points'].apply(transform_points_simplified))
df2.head()

Unnamed: 0,description,points,price,points_simplified
9,This has great depth of flavor with its fresh ...,87,27.0,2
10,"Soft, supple plum envelopes an oaky structure ...",87,19.0,2
11,"This is a dry wine, very spicy, with a tight, ...",87,30.0,2
12,"Slightly reduced, this wine offers a chalky, t...",87,34.0,2
14,Building on 150 years and six generations of w...,87,12.0,2


In [26]:
X = df2['description']
y = df2['points_simplified']
X2 = df2['price']
vectorizer = CountVectorizer()
vectorizer.fit(X)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [27]:
X = vectorizer.transform(X)

In [28]:
print(X)

  (0, 291)	1
  (0, 549)	3
  (0, 630)	1
  (0, 841)	1
  (0, 2478)	1
  (0, 2772)	1
  (0, 3037)	1
  (0, 3060)	1
  (0, 3769)	1
  (0, 3979)	1
  (0, 4028)	1
  (0, 4340)	1
  (0, 4512)	1
  (0, 5011)	1
  (0, 5015)	1
  (0, 6436)	1
  (0, 6496)	2
  (0, 6497)	1
  (0, 6840)	1
  (0, 8870)	1
  (0, 9559)	1
  (0, 9606)	1
  (0, 9762)	1
  (0, 10510)	1
  (0, 10598)	2
  :	:
  (18813, 6496)	2
  (18813, 6524)	1
  (18813, 6703)	1
  (18813, 6796)	1
  (18813, 7369)	1
  (18813, 7785)	1
  (18813, 8701)	1
  (18813, 8731)	1
  (18813, 9442)	1
  (18813, 9470)	1
  (18813, 9571)	2
  (18813, 9573)	1
  (18813, 9596)	1
  (18813, 9606)	1
  (18813, 9656)	1
  (18813, 9670)	1
  (18813, 9700)	2
  (18813, 10094)	1
  (18813, 10109)	1
  (18813, 10570)	1
  (18813, 10574)	1
  (18813, 10598)	1
  (18813, 10648)	1
  (18813, 10698)	2
  (18813, 10699)	1


In [29]:
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))

AttributeError: to_dense not found

In [35]:
pd.DataFrame.sparse.from_spmatrix(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10740,10741,10742,10743,10744,10745,10746,10747,10748,10749
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18809,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18810,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18811,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18812,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
X2.reset_index(drop = True, inplace = True)

In [50]:
Z =pd.DataFrame.sparse.from_spmatrix(X).join(X2)
Z.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10741,10742,10743,10744,10745,10746,10747,10748,10749,price
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,27.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,30.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,12.0


In [45]:
#Z.tail
X2.head()


(18814,)

In [51]:
# Training the model
X_train, X_test, y_train, y_test = train_test_split(Z, y, test_size=0.1, random_state=101)
rfc = RandomForestClassifier(verbose = True)
rfc.fit(X_train, y_train)

# Testing the model
predictions = rfc.predict(X_test)


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished


In [52]:
testing_score = rfc.score(X_test, y_test)
print(testing_score)

0.9527098831030818


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
