# Executive Summary:

Since I plan to build a recommendation system based on wine ratings, simulating a user-rating dataset will allow us to perform the recommendation in matrix format. The procedure is explained in more detail in the next notebook.


In [1]:
# Importing Required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import random

In [2]:
# Load the wine data; the first CSV column becomes the row index
wine_df = pd.read_csv(
    '/home/docode/project/EDA on Collected Data/new_wine.csv',
    index_col=0,
)

# Keep only the rating column, as a one-column DataFrame that the
# simulated user columns are added to later on
user_rating_df = wine_df['wine rating'].to_frame()
user_rating_df

Unnamed: 0,wine rating
0,3.4
1,3.4
2,3.1
3,3.4
4,3.4
...,...
21976,4.1
21977,4.2
21978,3.9
21979,3.8


In [3]:
# Function randomly generates ratings within a tight range. It is used to maintain our primary ratings average

def users(count, spread=0.3, seed=None):
    """Add `count` simulated user-rating columns to the global `user_rating_df`.

    For every row, each new column ``user-<i>`` (i = 0 .. count - 1) holds a
    value drawn uniformly from ``[rating - spread, rating + spread]`` around
    that row's 'wine rating', rounded to one decimal. Because the interval is
    symmetric, the simulated ratings stay centred on the original rating.
    A column that already exists under the same name is overwritten.

    Parameters
    ----------
    count : int
        Number of simulated users (columns) to generate.
    spread : float, default 0.3
        Half-width of the uniform interval around each wine rating.
    seed : int or None, default None
        When given, values are drawn from a private ``random.Random(seed)``
        so the simulation is reproducible. When None, the shared
        module-level generator of ``random`` is used.

    Returns
    -------
    None
        `user_rating_df` is modified in place.
    """
    # A private generator keeps a seeded run from disturbing global random state
    rng = random if seed is None else random.Random(seed)
    # The base column does not change inside the loop, so look it up once
    base_ratings = user_rating_df['wine rating']
    for i in range(count):
        user_rating_df[f'user-{i}'] = base_ratings.apply(
            lambda x: round(rng.uniform(x - spread, x + spread), 1)
        )
        
# Simulate 1000 users, then display the widened frame
users(count=1000)
user_rating_df

Unnamed: 0,wine rating,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,...,user-990,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999
0,3.4,3.5,3.4,3.7,3.2,3.3,3.1,3.6,3.6,3.3,...,3.3,3.3,3.3,3.6,3.5,3.5,3.5,3.1,3.3,3.7
1,3.4,3.6,3.5,3.1,3.6,3.4,3.2,3.4,3.3,3.6,...,3.1,3.1,3.3,3.4,3.4,3.1,3.4,3.5,3.4,3.3
2,3.1,3.2,2.9,3.0,2.9,2.9,3.2,3.2,2.8,3.4,...,3.3,3.3,2.9,3.1,3.3,3.4,2.9,3.0,3.1,2.9
3,3.4,3.6,3.1,3.2,3.5,3.3,3.5,3.1,3.2,3.5,...,3.6,3.4,3.5,3.3,3.3,3.1,3.1,3.4,3.2,3.2
4,3.4,3.3,3.5,3.5,3.5,3.6,3.6,3.3,3.1,3.4,...,3.5,3.2,3.1,3.5,3.2,3.5,3.4,3.4,3.2,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21976,4.1,4.0,4.1,4.4,4.2,3.9,4.1,4.3,4.1,4.3,...,4.0,4.0,4.3,4.1,4.1,3.9,4.3,4.0,3.9,4.2
21977,4.2,4.3,4.3,4.2,3.9,4.0,4.3,4.4,4.0,4.4,...,4.3,3.9,4.4,3.9,4.5,4.3,4.3,4.4,4.4,3.9
21978,3.9,4.0,3.9,4.1,3.6,3.8,3.9,4.2,4.1,3.7,...,4.2,3.7,3.7,3.9,4.1,3.9,4.2,4.0,3.9,4.2
21979,3.8,4.1,3.8,3.7,3.6,3.7,4.0,3.8,4.0,3.9,...,3.6,3.7,3.8,3.9,4.0,4.0,3.7,4.0,3.8,3.9


In [4]:
# Row-wise mean across the original rating and all simulated user columns
user_rating_df.mean(axis='columns')

0        3.407692
1        3.395305
2        3.097802
3        3.397303
4        3.402597
           ...   
21976    4.106893
21977    4.187313
21978    3.904496
21979    3.805095
21980    4.095205
Length: 21956, dtype: float64

The row-wise means of the simulated wine ratings are very close to the ratings in the original DataFrame.

### Now let's randomly set 50% of the DataFrame entries to NaN; this helps to simulate user ratings

In [5]:
# Boolean matrix shaped like the ratings frame; each cell is True with probability 0.5
nan_mat = np.random.random(user_rating_df.shape) < 0.5

# Replace every cell flagged True with NaN, then discard the original
# 'wine rating' column so only the simulated user columns remain
user_df = user_rating_df.mask(nan_mat).drop(columns=['wine rating'])

# Preview with NaN shown as 0.0. fillna returns a new frame that is only
# displayed here; user_df itself still holds NaN
user_df.fillna(0.0)



Unnamed: 0,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,user-9,...,user-990,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999
0,3.5,0.0,3.7,3.2,0.0,0.0,0.0,0.0,3.3,3.2,...,3.3,0.0,0.0,3.6,3.5,3.5,0.0,0.0,3.3,0.0
1,0.0,0.0,0.0,0.0,0.0,3.2,0.0,0.0,0.0,3.4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3.2,2.9,0.0,2.9,0.0,3.2,0.0,2.8,3.4,0.0,...,3.3,3.3,2.9,3.1,0.0,0.0,2.9,3.0,0.0,2.9
3,3.6,3.1,0.0,3.5,3.3,3.5,0.0,3.2,3.5,3.5,...,3.6,3.4,3.5,0.0,0.0,3.1,3.1,3.4,0.0,3.2
4,3.3,0.0,3.5,3.5,0.0,0.0,0.0,3.1,3.4,3.2,...,3.5,0.0,0.0,3.5,3.2,3.5,0.0,3.4,0.0,3.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21976,0.0,4.1,0.0,4.2,3.9,4.1,4.3,0.0,4.3,0.0,...,0.0,4.0,4.3,0.0,0.0,0.0,0.0,4.0,3.9,0.0
21977,0.0,4.3,4.2,3.9,0.0,4.3,4.4,0.0,4.4,4.0,...,4.3,3.9,0.0,0.0,4.5,0.0,4.3,0.0,0.0,3.9
21978,4.0,3.9,4.1,0.0,0.0,3.9,4.2,0.0,3.7,0.0,...,4.2,3.7,0.0,0.0,0.0,0.0,4.2,0.0,3.9,0.0
21979,4.1,0.0,0.0,0.0,3.7,4.0,3.8,4.0,0.0,3.6,...,3.6,0.0,0.0,0.0,4.0,4.0,3.7,4.0,3.8,3.9


In [6]:
# Row-wise mean of the remaining simulated ratings (NaN entries are skipped by mean)
user_df.mean(axis='columns')

0        3.405600
1        3.404651
2        3.100998
3        3.403187
4        3.392322
           ...   
21976    4.109486
21977    4.188492
21978    3.903636
21979    3.790283
21980    4.097002
Length: 21956, dtype: float64

We can see that the mean values are nearly the same. This indicates that the NaN values (about 50% of all DataFrame entries) were applied evenly.

In [7]:
# Persist the simulated user ratings (row index included) to a CSV file
user_df.to_csv(path_or_buf='User_rating.csv')

In [8]:
def user_values(series, range_min, range_max):
    """Count the entries of `series` that lie within [range_min, range_max].

    Both bounds are inclusive. Entries that fail either comparison
    (NaN included) are not counted.
    """
    at_least_min = series >= range_min
    at_most_max = series <= range_max
    # Summing the boolean mask counts the True entries (True == 1, False == 0)
    return (at_least_min & at_most_max).sum()

# Inclusive minimum and maximum of the rating range; values outside it
# (and NaN) are not counted as ratings
range_min, range_max = 0.5, 5

# Count the ratings in each row over the user columns only. A 'rating_counts'
# column left over from a previous run of this cell is excluded first, so
# re-running the cell cannot count an old total as if it were a rating.
user_df['rating_counts'] = (
    user_df
    .drop(columns='rating_counts', errors='ignore')
    .apply(lambda row: user_values(row, range_min, range_max), axis=1)
)

In [9]:
# Display the frame, now including the 'rating_counts' column
user_df

Unnamed: 0,user-0,user-1,user-2,user-3,user-4,user-5,user-6,user-7,user-8,user-9,...,user-991,user-992,user-993,user-994,user-995,user-996,user-997,user-998,user-999,rating_counts
0,3.5,,3.7,3.2,,,,,3.3,3.2,...,,,3.6,3.5,3.5,,,3.3,,500
1,,,,,,3.2,,,,3.4,...,,,,,,,,,,473
2,3.2,2.9,,2.9,,3.2,,2.8,3.4,,...,3.3,2.9,3.1,,,2.9,3.0,,2.9,501
3,3.6,3.1,,3.5,3.3,3.5,,3.2,3.5,3.5,...,3.4,3.5,,,3.1,3.1,3.4,,3.2,502
4,3.3,,3.5,3.5,,,,3.1,3.4,3.2,...,,,3.5,3.2,3.5,,3.4,,3.3,534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21976,,4.1,,4.2,3.9,4.1,4.3,,4.3,,...,4.0,4.3,,,,,4.0,3.9,,506
21977,,4.3,4.2,3.9,,4.3,4.4,,4.4,4.0,...,3.9,,,4.5,,4.3,,,3.9,504
21978,4.0,3.9,4.1,,,3.9,4.2,,3.7,,...,3.7,,,,,4.2,,3.9,,495
21979,4.1,,,,3.7,4.0,3.8,4.0,,3.6,...,,,,4.0,4.0,3.7,4.0,3.8,3.9,494


In [10]:
# Extract the per-row rating totals as a standalone one-column DataFrame
user_ratings_df = user_df['rating_counts'].to_frame()
user_ratings_df

Unnamed: 0,rating_counts
0,500
1,473
2,501
3,502
4,534
...,...
21976,506
21977,504
21978,495
21979,494


In [11]:
# Persist the rating totals (row index included) to a CSV file
user_ratings_df.to_csv(path_or_buf='User_Rating_Counts.csv')

# End of Notebook
In this relatively short notebook we have simulated the wine ratings that will be used in our recommendation system.