# Shop Recommendation based on item ratings

# 1. Installing and importing libraries

In [2]:
#Install PySpark Packages
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

In [3]:
#Import Libraries
import numpy as np
import pandas as pd

#Import Libraries for Visualization
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
plt.style.use('seaborn')

#Import Warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
#Import os for PySpark
import os

# Environment variables giving the locations of the Java 8 JDK and of the
# unpacked Spark 2.4.5 distribution.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.5-bin-hadoop2.7",
})

In [6]:
#Import PySpark
import findspark
# Must run before importing pyspark: findspark makes the Spark installation importable.
findspark.init()
from pyspark.sql import SparkSession

#Instantiate the SparkSession Object
# getOrCreate() returns the already-running session if there is one, so this
# cell can be executed more than once.
spark = (
    SparkSession.builder
    .appName('ALSExample')
    .config('spark.driver.host', 'localhost')
    .getOrCreate()
)

In [8]:
#Load the Datasets into the PySpark DataFrame
# The first row of each file is treated as the header and Spark infers the
# column types from the data.
ratings = spark.read.options(header='true', inferSchema='true').csv('/content/ratings.csv')
items = spark.read.options(header='true', inferSchema='true').csv('/content/items.csv')

In [9]:
#Drop Timestamp from ratings
# Rebinds `ratings` to a Spark DataFrame without the 'timestamp' column.
# NOTE(review): presumably the timestamp is not used by the recommender - confirm.
ratings = ratings.drop('timestamp')

# Preparing a pandas DataFrame

In [10]:
# pandas copies of the item and rating tables, read relative to the working directory.
# NOTE(review): presumably the same files Spark loaded from /content - confirm.
itemsdf = pd.read_csv('items.csv')
ratingsdf = pd.read_csv('ratings.csv')
# Join each item with its ratings on the shared itemId key (inner join, the
# merge default): items without ratings and ratings without an item are dropped.
merged_df = itemsdf.merge(ratingsdf, on=['itemId'])

# Exploring the Data

In [11]:
#1.What are the average User Ratings?
# Answered by counting how often each rating value occurs.
rating_counts = merged_df['rating'].value_counts()

# (rating, count) pairs ordered from the least to the most frequent rating
sorted_count = sorted(rating_counts.items(), key=lambda pair: pair[1])
sorted_counts = dict(sorted_count)
cx = [rating for rating, _count in sorted_count]
cy = [count for _rating, count in sorted_count]

# Bar chart: one bar per rating value, bar height = number of ratings
fig, ax = plt.subplots(figsize=(13,8))
sns.barplot(x=cx, y=cy, ax=ax)
ax.set_title('User Rating Counts', fontweight='bold', fontsize=18)
ax.set_xlabel('User Ratings', fontsize=14)
ax.set_ylabel('Value Counts')
plt.show()

# Fraction of the merged rows that belongs to each vendor
vendor_share = merged_df['vendor'].value_counts(normalize=True)
print(vendor_share)

#2.Which companies tend to get the highest ratings?
# An item's 'company' value may name several companies separated by "|".
# The raw string column is left untouched and the parsed lists go into a
# separate 'companies' column, so the cell can be re-run without trying to
# split values that are already lists. Missing values stay NaN.
merged_df['companies'] = merged_df['company'].str.split('|')

# One 0/1 indicator column per individual company, built in a single vectorized
# pass instead of a row-by-row .loc loop. Rows without a company are all zeros.
# NOTE: an indicator column replaces any existing column with the same name.
company_flags = merged_df['company'].str.get_dummies(sep='|').astype(float)
all_companies = set(company_flags.columns)
for company in company_flags.columns:
    merged_df[company] = company_flags[company]

# Single-column frame holding the item descriptions
descriptions = itemsdf[['description']].copy()

# Average rating per company as plain floats, so the values can be printed and
# plotted directly. Sorting the names keeps the order stable between runs.
companies = sorted(all_companies)
company_rating = [
    merged_df.loc[merged_df[company] == 1, 'rating'].mean()
    for company in companies
]
for company, mean_rating in zip(companies, company_rating):
    print(company, mean_rating)

plt.figure(figsize=(21,13))
sns.barplot(x=companies, y=company_rating)
plt.title('Average Rating by company', fontweight='bold', fontsize=18)
plt.xlabel('company',fontsize=14)
plt.ylabel('Average Rating')
plt.show()

#3.Which items are the Highest Rated?