# Visualize the query results that the Spark job produced

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Explore the solution dataset (the songplays table) where the artist and song columns are complete

In [None]:
# Read the exported songplays query output and show it as the cell result.
songplays_csv = '../queries/songplays_full_data.csv'
results = pd.read_csv(songplays_csv)
results

There are **319 rows** in the songplays table with complete information (i.e., 319 events in which a song from the events data matches a song from the songs data). This should be sufficient for some basic analysis.

### What are the top artists in the Sparkify data?

In [None]:
results = pd.read_csv('../queries/popular_artists.csv')

# Configure the theme in ONE call and BEFORE the figure is created.
# Each sns.set(...) call resets every argument it is not given back to its
# default, so separate calls for style / rc / font_scale silently discard
# the "whitegrid" style. Settings applied after plt.subplots() also miss the
# figure that already exists, which makes a fresh-kernel run look different
# from a re-run.
sns.set(
    style="whitegrid",
    font_scale=2,
    # rc is applied last, so the explicit tick-label size overrides the
    # size derived from font_scale.
    rc={"figure.figsize": (15, 5), "xtick.labelsize": 16},
)

# Single-axes figure holding the bar chart of total plays per artist.
fig, ax = plt.subplots(1, 1)
sns.barplot(x="artist", y="total_plays", data=results, ax=ax)

# Rotate the existing tick labels in place instead of re-assigning them.
plt.setp(ax.get_xticklabels(), rotation=40, horizontalalignment="right")
ax.set_title("Top 10 artists from Sparkify dataset: All users", weight="bold")
# The artist names on the ticks are self-explanatory, so drop the axis label.
# The trailing semicolon keeps the return value out of the cell output.
ax.set(xlabel=None);

### What are the top listening locations in the Sparkify data?

In [None]:
results = pd.read_csv('../queries/listening_locations.csv')

# Configure the theme in ONE call and BEFORE the figure is created.
# Each sns.set(...) call resets every argument it is not given back to its
# default, so separate calls for style / rc / font_scale silently discard
# the "whitegrid" style. Settings applied after plt.subplots() also miss the
# figure that already exists.
sns.set(
    style="whitegrid",
    font_scale=2,
    # rc is applied last, so the explicit tick-label size overrides the
    # size derived from font_scale.
    rc={"figure.figsize": (15, 5), "xtick.labelsize": 16},
)

# Single-axes figure holding the bar chart of total plays per location.
fig, ax = plt.subplots(1, 1)
sns.barplot(x="location", y="total_plays", data=results, ax=ax)

# Rotate the existing tick labels in place instead of re-assigning them.
plt.setp(ax.get_xticklabels(), rotation=40, horizontalalignment="right")
ax.set_title("Top 10 User Listening Locations from Sparkify dataset", weight="bold")
# The location names on the ticks are self-explanatory, so drop the axis label.
# The trailing semicolon keeps the return value out of the cell output.
ax.set(xlabel=None);

### Check data skewness of the songs table based on its partitioning scheme

In [None]:
# Load the songs-table skew query output and print it without any
# row/column truncation so every line of the CSV is visible.
results = pd.read_csv('../queries/songs_skew.csv')
untruncated = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*untruncated):
    print(results)

### Findings:
- This scheme produces 12,600 different partitions, and their sizes are fairly uneven. Partitioning by year alone may have given a more even distribution.
- All the year "0" entries had missing year data.

### Check data skewness of the time and songplays tables based on their identical partitioning schemes

In [None]:
# Load the time/songplays skew query output and print it without any
# row/column truncation so every line of the CSV is visible.
results = pd.read_csv('../queries/time_skew.csv')
untruncated = ('display.max_rows', None, 'display.max_columns', None)
with pd.option_context(*untruncated):
    print(results)

### Findings:
- There is only one partition for the entire dataset!
- Both the time and songplays tables were partitioned by year and month, so a single partition means every record falls within the same year and month.