## WINE LIST RECOMMENDER PROJECT

### USER RATING DATASET

**SUMMARY:**

Research indicated that past scrapes of user data from Vivino were quite time-consuming, taking up to approximately 5 months to capture a portion of the database. Because we do not have that much time before the capstone project deadline, we run our model on simulated data instead.  This notebook creates our simulated user rating data for 1000 profiles.

**IMPORT PACKAGES**

In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

**READ IN WINE CATALOG FILE**

In [2]:
# Load the wine catalog; every later cell derives its data from this frame.
user_df = pd.read_csv(filepath_or_buffer='./data/wine_catalog.csv')

In [3]:
# Peek at the first five catalog rows to confirm the columns that were loaded.
user_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,winery,wine,rating,wine_style,wine_category,vintage,region,country,price,contact_for_price,wine_style_code,wine_category_code,region_code,country_code
0,0,Caymus,Cabernet Sauvignon N.V.,4.6,Cabernet Sauvignon,Red,N.V.,Napa Valley,United States,79.05,,15,2,587,36
1,1,The Prisoner,The Prisoner N.V.,4.4,Blend,Red,N.V.,Napa Valley,United States,39.75,,10,2,587,36
2,2,Masi,Costasera Amarone della Valpolicella Classico ...,4.3,Valpolicella,Red,N.V.,Amarone della Valpolicella Classico,Italy,54.99,,86,2,34,19
3,3,Stag's Leap Wine Cellars,ARTEMIS Cabernet Sauvignon N.V.,4.3,Cabernet Sauvignon,Red,N.V.,Napa Valley,United States,59.99,,15,2,587,36
4,4,Banfi,Brunello di Montalcino N.V.,4.2,Brunello,Red,N.V.,Brunello di Montalcino,Italy,65.98,,12,2,120,19


**ISOLATE RATINGS DATA**

In [4]:
# Keep only the catalog's `rating` column. Selecting the one column we need
# (instead of dropping every other column by name, in place) makes this cell
# safe to re-run and independent of the exact set of extra catalog columns.
# `.copy()` gives an independent frame, so columns added later do not trigger
# pandas' chained-assignment warnings.
user_df = user_df[['rating']].copy()

In [5]:
# Preview the frame; at this point it holds only the catalog `rating` column.
user_df

Unnamed: 0,rating
0,4.6
1,4.4
2,4.3
3,4.3
4,4.2
...,...
33158,3.6
33159,3.6
33160,3.6
33161,3.5


**GENERATE RATINGS INFORMATION FOR 1000 PROFILES**

In [6]:
#This function will randomly generate ratings within a tight range to maintain our primary ratings average
def new_users(u, df=None, spread=.35, seed=None):
    """Add `u` simulated user-rating columns named 'user-0' .. 'user-{u-1}'.

    Each simulated value is drawn uniformly from
    [rating - spread, rating + spread] for that row's 'rating' and rounded to
    one decimal place. The frame is modified in place; nothing is returned.

    Parameters
    ----------
    u : int
        Number of user columns to create.
    df : pandas.DataFrame, optional
        Frame holding a 'rating' column. Defaults to the notebook-level
        `user_df`, which is what the original implementation always used.
    spread : float, optional
        Half-width of the sampling window around each rating (default .35).
    seed : int, optional
        When given, draws come from a private `random.Random(seed)` so the
        output is repeatable; otherwise the shared `random` module is used.

    NOTE(review): values are not clipped; a rating above 4.65 can produce a
    simulated value over 5.0.
    """
    target = user_df if df is None else df
    rng = random if seed is None else random.Random(seed)

    def simulate(rating):
        # One draw per row, rounded to a single decimal place.
        return round(rng.uniform(rating - spread, rating + spread), 1)

    for i in range(u):
        target[f'user-{i}'] = target['rating'].apply(simulate)

In [7]:
# Seed Python's `random` module (the source new_users draws from) so the
# simulated ratings are identical on every Restart & Run All.
random.seed(42)
new_users(1000)

In [8]:
# Preview the frame after the simulated `user-N` columns have been added.
user_df

Unnamed: 0,rating,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,...,user-990,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999
0,4.6,4.7,4.3,4.7,4.6,4.9,4.8,4.4,4.3,4.7,...,4.6,4.4,4.8,4.8,4.4,4.7,4.6,4.8,4.6,4.6
1,4.4,4.5,4.3,4.5,4.2,4.5,4.4,4.4,4.2,4.2,...,4.2,4.6,4.2,4.4,4.1,4.6,4.4,4.1,4.2,4.3
2,4.3,4.2,4.1,4.6,4.5,4.2,4.6,4.0,4.4,4.1,...,4.3,4.2,4.3,4.3,4.1,4.5,4.0,4.1,4.6,4.6
3,4.3,4.2,4.2,4.5,4.6,4.3,4.3,4.6,4.6,4.5,...,4.3,4.2,4.1,4.3,4.6,4.2,4.5,4.0,4.2,4.1
4,4.2,4.3,4.2,4.1,4.4,4.1,4.2,4.4,4.5,4.5,...,4.4,4.5,4.1,4.1,4.3,4.5,4.3,4.0,4.4,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33158,3.6,3.8,3.5,3.8,3.4,3.7,3.8,3.6,3.3,3.6,...,3.5,3.7,3.8,3.8,3.5,3.6,3.9,3.3,3.4,3.9
33159,3.6,3.3,3.9,3.8,3.3,3.4,3.8,3.8,3.7,3.8,...,3.3,3.8,3.8,3.3,3.5,3.5,3.3,3.3,3.4,3.4
33160,3.6,3.7,3.9,3.8,3.7,3.6,3.4,3.3,3.7,3.4,...,3.5,3.7,3.4,3.8,3.6,3.9,3.6,3.9,3.9,3.4
33161,3.5,3.8,3.4,3.8,3.4,3.7,3.7,3.6,3.7,3.2,...,3.2,3.6,3.8,3.4,3.5,3.6,3.8,3.8,3.3,3.5


In [9]:
# Row-wise mean over `rating` and every simulated user column; it should stay
# close to the wine's original rating.
user_df.mean(axis='columns')

0        4.602697
1        4.400500
2        4.295504
3        4.304995
4        4.205495
           ...   
33158    3.598701
33159    3.593806
33160    3.605894
33161    3.495405
33162    3.498102
Length: 33163, dtype: float64

In [10]:
# Persist the complete (unmasked) simulated ratings under their own file name.
# The masked frame is exported to './data/user.csv' further down, so writing
# this frame to that same path would simply be overwritten.
user_df.to_csv('./data/user_full.csv')

**SIMULATE RATINGS ON SPECIFIC BOTTLES OF WINE BY RANDOMLY APPLYING NaN TO 50% of DATAFRAME**

In [11]:
# Total number of non-null cells in the frame (per-column counts, then summed).
user_df.count(axis=0).sum()

33196163

In [12]:
#approach modified from "https://cmdlinetips.com/2019/05/how-to-randomly-add-nan-to-pandas-dataframe/"
# Boolean matrix with the same shape as user_df: each cell is True with
# probability 0.5 and marks a value that will be blanked out. A seeded
# generator keeps the mask (and everything derived from it) reproducible.
nan_rng = np.random.default_rng(42)
nan_mat = nan_rng.random(user_df.shape) < 0.5
nan_mat

array([[ True,  True,  True, ..., False,  True, False],
       [False, False, False, ..., False, False, False],
       [False,  True, False, ...,  True,  True,  True],
       ...,
       [False,  True, False, ..., False, False,  True],
       [ True,  True,  True, ..., False,  True,  True],
       [False,  True,  True, ...,  True, False, False]])

In [13]:
# Number of True cells in the mask, i.e. how many values will become NaN.
nan_mat.sum(axis=None)

16594397

In [14]:
# Blank out (set to NaN) every cell flagged True in nan_mat; user_df itself is
# left untouched because mask returns a new frame.
user_df2 = user_df.mask(cond=nan_mat)

In [15]:
# Preview the masked frame; blanked-out cells show as NaN.
user_df2

Unnamed: 0,rating,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,...,user-990,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999
0,,,,4.7,4.6,4.9,4.8,,4.3,,...,,,,4.8,4.4,4.7,,4.8,,4.6
1,4.4,4.5,4.3,,,,,4.4,,4.2,...,,,,,4.1,,4.4,4.1,4.2,4.3
2,4.3,,4.1,,,4.2,4.6,,,,...,4.3,4.2,,4.3,,4.5,4.0,,,
3,,,,4.5,,,4.3,,,4.5,...,,4.2,4.1,4.3,4.6,4.2,4.5,,4.2,
4,4.2,4.3,4.2,,4.4,4.1,4.2,,,,...,4.4,,,4.1,,4.5,4.3,4.0,,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33158,3.6,3.8,,3.8,,3.7,,,,3.6,...,3.5,3.7,3.8,3.8,3.5,,3.9,3.3,3.4,3.9
33159,,3.3,3.9,3.8,,,,3.8,,,...,3.3,,,3.3,,,,,3.4,
33160,3.6,,3.9,,,,,,3.7,,...,,3.7,3.4,,,,,3.9,3.9,
33161,,,,,3.4,3.7,3.7,,3.7,,...,3.2,,3.8,,,3.6,,3.8,,


In [16]:
# Remove the reference `rating` column so only simulated user columns remain.
# Rebinding (rather than inplace=True) with errors='ignore' keeps the cell
# idempotent: re-running it after the column is gone no longer raises KeyError.
user_df2 = user_df2.drop(columns=['rating'], errors='ignore')

In [17]:
# Preview only: show what the frame looks like with NaN replaced by 0.0.
# The result is not assigned, so user_df2 itself keeps its NaN cells.
user_df2.fillna(value=0.0)

Unnamed: 0,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,user-9,...,user-990,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999
0,0.0,0.0,4.7,4.6,4.9,4.8,0.0,4.3,0.0,0.0,...,0.0,0.0,0.0,4.8,4.4,4.7,0.0,4.8,0.0,4.6
1,4.5,4.3,0.0,0.0,0.0,0.0,4.4,0.0,4.2,0.0,...,0.0,0.0,0.0,0.0,4.1,0.0,4.4,4.1,4.2,4.3
2,0.0,4.1,0.0,0.0,4.2,4.6,0.0,0.0,0.0,0.0,...,4.3,4.2,0.0,4.3,0.0,4.5,4.0,0.0,0.0,0.0
3,0.0,0.0,4.5,0.0,0.0,4.3,0.0,0.0,4.5,0.0,...,0.0,4.2,4.1,4.3,4.6,4.2,4.5,0.0,4.2,0.0
4,4.3,4.2,0.0,4.4,4.1,4.2,0.0,0.0,0.0,3.9,...,4.4,0.0,0.0,4.1,0.0,4.5,4.3,4.0,0.0,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33158,3.8,0.0,3.8,0.0,3.7,0.0,0.0,0.0,3.6,3.3,...,3.5,3.7,3.8,3.8,3.5,0.0,3.9,3.3,3.4,3.9
33159,3.3,3.9,3.8,0.0,0.0,0.0,3.8,0.0,0.0,0.0,...,3.3,0.0,0.0,3.3,0.0,0.0,0.0,0.0,3.4,0.0
33160,0.0,3.9,0.0,0.0,0.0,0.0,0.0,3.7,0.0,0.0,...,0.0,3.7,3.4,0.0,0.0,0.0,0.0,3.9,3.9,0.0
33161,0.0,0.0,0.0,3.4,3.7,3.7,0.0,3.7,0.0,0.0,...,3.2,0.0,3.8,0.0,0.0,3.6,0.0,3.8,0.0,0.0


In [18]:
# Summarize user_df2: index range, column count, dtypes and memory usage.
user_df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33163 entries, 0 to 33162
Columns: 1000 entries, user-0 to user-999
dtypes: float64(1000)
memory usage: 253.0 MB


In [19]:
# Row-wise mean of the masked user ratings (NaN cells are skipped by mean).
user_df2.mean(axis='columns')

0        4.607566
1        4.399594
2        4.284200
3        4.312320
4        4.200387
           ...   
33158    3.603766
33159    3.606557
33160    3.616602
33161    3.491753
33162    3.501408
Length: 33163, dtype: float64

In [20]:
# Same row-wise mean on the unmasked frame, for comparison with the masked
# means shown in the previous cell.
user_df.mean(axis='columns')

0        4.602697
1        4.400500
2        4.295504
3        4.304995
4        4.205495
           ...   
33158    3.598701
33159    3.593806
33160    3.605894
33161    3.495405
33162    3.498102
Length: 33163, dtype: float64

**NOTE:** The per-wine rating averages after masking stay close to the averages before masking, so they are acceptable.

In [21]:
# Export the masked user ratings (the `rating` column has already been dropped).
user_df2.to_csv(path_or_buf='./data/user.csv')

In [22]:
def user_values(series, range_min, range_max):
    """Count the entries of `series` that lie in [range_min, range_max].

    Both bounds are inclusive. Missing values compare as False on either
    side, so they are never counted.
    """
    at_least_min = series >= range_min
    at_most_max = series <= range_max
    in_range = at_least_min & at_most_max
    return in_range.sum()

    # Equivalent one-liner:
    # return series.between(left=range_min, right=range_max).sum()

range_min, range_max = 0.5, 5.0

# For each wine (row), count how many simulated users have a rating inside
# [range_min, range_max] (both bounds inclusive). NaN compares False on both
# sides, so unrated cells are not counted. This gives the same numbers as
# applying user_values to every row, but is computed on the whole frame at
# once instead of through a Python call per row.
# Only the 'user-N' columns are considered, so re-running the cell never
# feeds the previously computed 'rating_counts' column back into the count.
user_cols = [col for col in user_df2.columns if str(col).startswith('user-')]
ratings_only = user_df2[user_cols]

user_df2["rating_counts"] = (
    (ratings_only >= range_min) & (ratings_only <= range_max)
).sum(axis=1)

user_df2["rating_counts"]

0        489
1        493
2        519
3        487
4        517
        ... 
33158    478
33159    488
33160    512
33161    485
33162    497
Name: rating_counts, Length: 33163, dtype: int64


In [23]:
# Preview user_df2, which now carries `rating_counts` alongside the user columns.
user_df2

Unnamed: 0,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,user-9,...,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999,rating_counts
0,,,4.7,4.6,4.9,4.8,,4.3,,,...,,,4.8,4.4,4.7,,4.8,,4.6,489
1,4.5,4.3,,,,,4.4,,4.2,,...,,,,4.1,,4.4,4.1,4.2,4.3,493
2,,4.1,,,4.2,4.6,,,,,...,4.2,,4.3,,4.5,4.0,,,,519
3,,,4.5,,,4.3,,,4.5,,...,4.2,4.1,4.3,4.6,4.2,4.5,,4.2,,487
4,4.3,4.2,,4.4,4.1,4.2,,,,3.9,...,,,4.1,,4.5,4.3,4.0,,4.2,517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33158,3.8,,3.8,,3.7,,,,3.6,3.3,...,3.7,3.8,3.8,3.5,,3.9,3.3,3.4,3.9,478
33159,3.3,3.9,3.8,,,,3.8,,,,...,,,3.3,,,,,3.4,,488
33160,,3.9,,,,,,3.7,,,...,3.7,3.4,,,,,3.9,3.9,,512
33161,,,,3.4,3.7,3.7,,3.7,,,...,,3.8,,,3.6,,3.8,,,485


In [24]:
# Pull the per-wine rating counts out into their own single-column frame.
user_values_df = pd.DataFrame(data=user_df2['rating_counts'])
user_values_df

Unnamed: 0,rating_counts
0,489
1,493
2,519
3,487
4,517
...,...
33158,478
33159,488
33160,512
33161,485


In [25]:
# Save the per-wine rating counts for use outside this notebook.
user_values_df.to_csv(path_or_buf='./data/rating_counts.csv')