# Some data checks and analysis
First, import and install the necessary modules.

In [1]:
import configparser
from datetime import datetime
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType, StringType, TimestampType
import boto3
import pandas as pd
import gc
!pip install s3fs

Collecting s3fs
  Downloading https://files.pythonhosted.org/packages/72/5c/ec84c7ec49fde2c3b0d885ecae4504fa40fc77fef7684e9f2939c50f9b94/s3fs-0.4.0-py3-none-any.whl
Collecting boto3>=1.9.91 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/d5/57/e9675a5a8d0ee586594ff19cb9a601334fbf24fa2fb29052d2a900ee5d23/boto3-1.11.9-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 5.1MB/s ta 0:00:01
[?25hCollecting botocore>=1.12.91 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/64/4c/b0b0d3b6f84a05f9135051b56d3eb8708012a289c4b82ee21c8c766f47b5/botocore-1.14.9-py2.py3-none-any.whl (5.9MB)
[K    100% |████████████████████████████████| 5.9MB 4.2MB/s eta 0:00:01
[?25hCollecting fsspec>=0.6.0 (from s3fs)
[?25l  Downloading https://files.pythonhosted.org/packages/dd/1f/7028dacd3c28f34ce48130aae73a88fa5cc27b6b0e494fcf2739f7954d9d/fsspec-0.6.2-py3-none-any.whl (62kB)
[K    100% |████████████████████████████████| 71kB 18.7MB

In [2]:
# Read the AWS credentials from the local config file and export them as
# environment variables.
config = configparser.ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check it so a missing file fails with a clear
# message instead of an opaque KeyError: 'AWS' on the lookups below.
if not config.read('dl.cfg'):
    raise FileNotFoundError(
        "Could not read 'dl.cfg'; it must provide an [AWS] section with "
        "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY."
    )

for key in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY'):
    os.environ[key] = config['AWS'][key]

Create the Spark session and increase the broadcast timeout. Whether the higher timeout is needed depends on the size of the cluster or machine being used.

In [3]:
def create_spark_session():
    """Return a SparkSession obtained from the builder's ``getOrCreate()``.

    The builder sets ``spark.jars.packages`` to
    ``org.apache.hadoop:hadoop-aws:2.7.0`` before the session is requested.
    """
    builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return builder.getOrCreate()

In [4]:
# Value for spark.sql.broadcastTimeout (Spark documents this setting in seconds).
broadcast_timeout = 900

spark = create_spark_session()
spark.conf.set("spark.sql.broadcastTimeout", broadcast_timeout)

## Analyse artist table
Let's have a look at the first rows and count the records. Apparently there are 967 artist records available.

In [6]:
# Load the artist table from its parquet files.
artist_table_path = "s3a://christophndde4/artist_table/"
artistdf = spark.read.parquet(artist_table_path)

In [7]:
# Run a full Python garbage-collection pass; the value displayed for this cell
# is gc.collect()'s return value (the number of unreachable objects found).
gc.collect()

201

In [10]:
# Preview two rows of the artist table.
artist_preview = artistdf.limit(2)
artist_preview.show()

+------------------+------------+-----------------+--------+---------+
|         artist_id|        name|         location|latitude|longitude|
+------------------+------------+-----------------+--------+---------+
|ARXKXQ31187FB510DF|October Tide|Stockholm, Sweden|59.33217| 18.06243|
|ARXM0CX1187B98FD56|Marc Anthony|     New York, NY|40.71455|-74.00712|
+------------------+------------+-----------------+--------+---------+



In [11]:
# Number of rows in the artist table.
artistdf.count()

967

## Analyse user table
Apparently there are only 104 records; please also have a look at the readme.md concerning some restrictions of the user table.

In [5]:
# Load the user table from its parquet files.
user_table_path = "s3a://christophndde4/user_table/"
userdf = spark.read.parquet(user_table_path)

In [6]:
# Preview five rows of the user table.
user_preview = userdf.limit(5)
user_preview.show()

+-------+----------+---------+------+-----+
|user_id|first_name|last_name|gender|level|
+-------+----------+---------+------+-----+
|     88|  Mohammad|Rodriguez|     M| paid|
|     88|  Mohammad|Rodriguez|     M| free|
|     29|Jacqueline|    Lynch|     F| paid|
|     29|Jacqueline|    Lynch|     F| free|
|     36|   Matthew|    Jones|     M| paid|
+-------+----------+---------+------+-----+



In [7]:
# Number of rows in the user table.
userdf.count()

104

## Analyse time table
Apparently there are 6820 unique timestamps in this dataframe, which fall between Nov. 1st and Nov. 30th of 2018.

In [15]:
# Load the time table from its parquet files.
time_table_path = "s3a://christophndde4/time_table/"
timedf = spark.read.parquet(time_table_path)

In [9]:
# Preview five rows of the time table.
time_preview = timedf.limit(5)
time_preview.show()

+--------------------+----+---+----+-----+----+-------+
|          start_time|hour|day|week|month|year|weekday|
+--------------------+----+---+----+-----+----+-------+
|2018-11-05 18:36:...|  18|  5|  45|   11|2018|      2|
|2018-11-05 18:37:...|  18|  5|  45|   11|2018|      2|
|2018-11-05 18:41:...|  18|  5|  45|   11|2018|      2|
|2018-11-05 18:41:...|  18|  5|  45|   11|2018|      2|
|2018-11-05 18:44:...|  18|  5|  45|   11|2018|      2|
+--------------------+----+---+----+-----+----+-------+



In [10]:
# Number of rows in the time table.
timedf.count()

6820

In [17]:
# Smallest and largest start_time in the time table.
start_time_col = F.col("start_time")
timedf.agg(F.min(start_time_col), F.max(start_time_col)).show()

+--------------------+--------------------+
|     min(start_time)|     max(start_time)|
+--------------------+--------------------+
|2018-11-01 21:01:...|2018-11-30 19:54:...|
+--------------------+--------------------+



## Analyse song table

In [16]:
# Load the song table from its parquet files.
song_table_path = "s3a://christophndde4/song_table/"
songdf = spark.read.parquet(song_table_path)

In [18]:
# Show five songs, taken after ordering by song_id.
songs_by_id = songdf.orderBy("song_id")
songs_by_id.limit(5).show()

+------------------+--------------------+------------------+----+---------+
|           song_id|               title|         artist_id|year| duration|
+------------------+--------------------+------------------+----+---------+
|SOAADAD12A8C13D5B0|One Shot (Album V...|ARQTC851187B9B03AF|2005|263.99302|
|SOABCEU12A8C132027|          Cold Waste|ARL6NP61187B98C1FC|2007|385.43628|
|SOABNPC12A8C13A9CC|       Après Le Show|ARFM1EQ1187FB533ED|2005| 223.4771|
|SOABWAP12A8C13F82A|           Take Time|AR5LMPY1187FB573FE|1978|258.89914|
|SOABYIT12AB0183026|        Vilda vindar|AR98ZSW1187B98E82C|1985|266.13506|
+------------------+--------------------+------------------+----+---------+



In [19]:
# Number of rows in the song table.
# NOTE(review): the saved output of this cell is a NameError for 'song_df',
# which does not match the name used here ('songdf') - the output is stale and
# the cell needs to be re-executed.
songdf.count()

NameError: name 'song_df' is not defined