# Songs Analysis

In [0]:
# prelude
from pyspark.sql import functions as F
from pyspark.sql.functions import avg, col, desc

In [0]:
file_type = 'parquet'

songs = spark.read.parquet("s3://full-stack-bigdata-datasets/Big_Data/YOUTUBE/items_selected.parquet")
songs.printSchema()
songs.count(), len(songs.columns)

root
 |-- contentDetails_duration: string (nullable = true)
 |-- id: string (nullable = true)
 |-- snippet_channelId: string (nullable = true)
 |-- snippet_channelTitle: string (nullable = true)
 |-- snippet_publishedAt: string (nullable = true)
 |-- snippet_title: string (nullable = true)
 |-- statistics_commentCount: double (nullable = true)
 |-- statistics_dislikeCount: double (nullable = true)
 |-- statistics_viewCount: long (nullable = true)

Out[12]: (3907, 9)

1. Use `.describe()` on the DataFrame

In [0]:
songs.describe().toPandas()

Unnamed: 0,summary,contentDetails_duration,id,snippet_channelId,snippet_channelTitle,snippet_publishedAt,snippet_title,statistics_commentCount,statistics_dislikeCount,statistics_viewCount
0,count,3907,3907,3907,3907,3907,3907,3880.0,3893.0,3907.0
1,mean,,,,7.296272962E9,,,6134.878865979382,4669.332391471872,20763860.959815715
2,stddev,,,,0.0,,,38535.66393506117,34768.35562984628,122956651.30505532
3,min,PT10M,--RRkree8kM,UC--yIemFNSgwQ0JxyYsABAQ,!!! (Chk Chk Chk),2005-11-02T22:02:48Z,!!! - One Girl/One Boy,0.0,0.0,78.0
4,max,PT9M7S,zzrNI66FUWE,UCzz_ERveHOdZve_Qsthwhnw,さむい者おげし,2019-11-04T14:55:48Z,📺 Danakil - Le rêve [Official Video],1057699.0,1373532.0,3058853981.0


2. Count the number of missing values for each column

*NOTE: Print out the results as a pandas DataFrame*

In [0]:
def count_missing(col_name):
  return F.sum(F.col(col_name).isNull().cast('int')).alias(col_name)

missing_values = songs.select(*[count_missing(c) for c in songs.columns]).toPandas()
missing_values

Unnamed: 0,contentDetails_duration,id,snippet_channelId,snippet_channelTitle,snippet_publishedAt,snippet_title,statistics_commentCount,statistics_dislikeCount,statistics_viewCount
0,0,0,0,0,0,0,27,14,0


Curiously, we have a few songs with missing `viewCounts`.

3. What are the 5 most popular songs? (by view count)

In [0]:
songs.select('snippet_channelTitle', 'snippet_title', 'statistics_viewCount').orderBy(F.desc('statistics_viewCount')).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,snippet_title,statistics_viewCount
0,OneRepublicVEVO,OneRepublic - Counting Stars (Official Music V...,3058853981
1,Passenger,Passenger | Let Her Go (Official Video),2789082191
2,EminemVEVO,Eminem - Love The Way You Lie ft. Rihanna,2064352955
3,AviciiOfficialVEVO,Avicii - Wake Me Up (Official Video),1963215194
4,gotyemusic,Gotye - Somebody That I Used To Know (feat. Ki...,1506602164


In [0]:
)

Unnamed: 0,snippet_channelTitle,snippet_title,statistics_viewCount
0,OneRepublicVEVO,OneRepublic - Counting Stars (Official Music V...,3058853981
1,Passenger,Passenger | Let Her Go (Official Video),2789082191
2,EminemVEVO,Eminem - Love The Way You Lie ft. Rihanna,2064352955
3,AviciiOfficialVEVO,Avicii - Wake Me Up (Official Video),1963215194
4,gotyemusic,Gotye - Somebody That I Used To Know (feat. Ki...,1506602164


4. Compute:
- total_viewCount: the total viewcount per a channelTitle
- mean_viewCount: the average viewcount per a channel
- max_viewCount: the max view count per a channel
- min_viewCount: the min view count per a channel
- std_viewCount: the standard deviation of view counts per channel
- songsCount: number of songs associated per channel on our list

In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount")).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,sum(statistics_viewCount)
0,"Being part of something special, makes you spe...",67464186
1,Grimes,66954706
2,Sargent House,123884
3,WildBeastsVEVO,581479
4,ParetoPark,3083


In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount"), F.mean("statistics_viewCount")).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,sum(statistics_viewCount),avg(statistics_viewCount)
0,"Being part of something special, makes you spe...",67464186,16866046.5
1,Grimes,66954706,33477353.0
2,Sargent House,123884,123884.0
3,WildBeastsVEVO,581479,581479.0
4,ParetoPark,3083,3083.0


In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount").alias('Bob'), F.mean("statistics_viewCount"), F.max("statistics_viewCount"), F.min("statistics_viewCount"), F.stddev("statistics_viewCount"), F.count("statistics_viewCount")).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,Bob,avg(statistics_viewCount),max(statistics_viewCount),min(statistics_viewCount),stddev_samp(statistics_viewCount),count(statistics_viewCount)
0,"Being part of something special, makes you spe...",67464186,16866046.5,27567026,13069025,7139327.0,4
1,Grimes,66954706,33477353.0,64988035,1966671,44562830.0,2
2,Sargent House,123884,123884.0,123884,123884,,1
3,WildBeastsVEVO,581479,581479.0,581479,581479,,1
4,ParetoPark,3083,3083.0,3083,3083,,1


In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount").alias('Bob'), F.mean("statistics_viewCount"), F.max("statistics_viewCount"), F.min("statistics_viewCount"), F.stddev("statistics_viewCount"), F.count("statistics_viewCount")).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,Bob,avg(statistics_viewCount),max(statistics_viewCount),min(statistics_viewCount),stddev_samp(statistics_viewCount),count(statistics_viewCount)
0,"Being part of something special, makes you spe...",67464186,16866046.5,27567026,13069025,7139327.0,4
1,Grimes,66954706,33477353.0,64988035,1966671,44562830.0,2
2,Sargent House,123884,123884.0,123884,123884,,1
3,WildBeastsVEVO,581479,581479.0,581479,581479,,1
4,ParetoPark,3083,3083.0,3083,3083,,1


5. What are the top 5 channels by `mean_viewCount`?

In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount"), F.mean("statistics_viewCount").alias('Bob'), F.max("statistics_viewCount"), F.min("statistics_viewCount"), F.stddev("statistics_viewCount"), F.count("statistics_viewCount")).orderBy(F.desc('Bob')).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,sum(statistics_viewCount),Bob,max(statistics_viewCount),min(statistics_viewCount),stddev_samp(statistics_viewCount),count(statistics_viewCount)
0,OneRepublicVEVO,3058853981,3058854000.0,3058853981,3058853981,,1
1,Passenger,2789082191,2789082000.0,2789082191,2789082191,,1
2,gotyemusic,1506602164,1506602000.0,1506602164,1506602164,,1
3,GunsNRosesVEVO,1436168604,1436169000.0,1436168604,1436168604,,1
4,PitbullVEVO,1231470918,1231471000.0,1231470918,1231470918,,1


Unnamed: 0,snippet_channelTitle,total_viewCount,mean_viewCount,min_viewCount,max_viewCount,stdViewCount,songsCount
0,OneRepublicVEVO,3058853981,3058854000.0,3058853981,3058853981,,1
1,Passenger,2789082191,2789082000.0,2789082191,2789082191,,1
2,gotyemusic,1506602164,1506602000.0,1506602164,1506602164,,1
3,GunsNRosesVEVO,1436168604,1436169000.0,1436168604,1436168604,,1
4,PitbullVEVO,1231470918,1231471000.0,1231470918,1231470918,,1


6. What are the top 5 channels by `max_viewCount`?

In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount"), F.mean("statistics_viewCount"), F.max("statistics_viewCount").alias('Bob'), F.min("statistics_viewCount"), F.stddev("statistics_viewCount"), F.count("statistics_viewCount")).orderBy(F.desc('Bob')).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,sum(statistics_viewCount),avg(statistics_viewCount),Bob,min(statistics_viewCount),stddev_samp(statistics_viewCount),count(statistics_viewCount)
0,OneRepublicVEVO,3058853981,3058854000.0,3058853981,3058853981,,1
1,Passenger,2789082191,2789082000.0,2789082191,2789082191,,1
2,EminemVEVO,4181548740,836309700.0,2064352955,35244992,803242800.0,5
3,AviciiOfficialVEVO,2881230160,720307500.0,1963215194,73376915,847749200.0,4
4,gotyemusic,1506602164,1506602000.0,1506602164,1506602164,,1


Unnamed: 0,snippet_channelTitle,total_viewCount,mean_viewCount,min_viewCount,max_viewCount,stdViewCount,songsCount
0,OneRepublicVEVO,3058853981,3058854000.0,3058853981,3058853981,,1
1,Passenger,2789082191,2789082000.0,2789082191,2789082191,,1
2,EminemVEVO,4181548740,836309700.0,35244992,2064352955,803242800.0,5
3,AviciiOfficialVEVO,2881230160,720307500.0,73376915,1963215194,847749200.0,4
4,gotyemusic,1506602164,1506602000.0,1506602164,1506602164,,1


7. What are the top 5 channels by `total_viewCount`?

In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount").alias('Bob'), F.mean("statistics_viewCount"), F.max("statistics_viewCount"), F.min("statistics_viewCount"), F.stddev("statistics_viewCount"), F.count("statistics_viewCount")).orderBy(F.desc('Bob')).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,Bob,avg(statistics_viewCount),max(statistics_viewCount),min(statistics_viewCount),stddev_samp(statistics_viewCount),count(statistics_viewCount)
0,EminemVEVO,4181548740,836309700.0,2064352955,35244992,803242800.0,5
1,OneRepublicVEVO,3058853981,3058854000.0,3058853981,3058853981,,1
2,AviciiOfficialVEVO,2881230160,720307500.0,1963215194,73376915,847749200.0,4
3,Passenger,2789082191,2789082000.0,2789082191,2789082191,,1
4,Macklemore LLC,2541772489,847257500.0,1477980957,252792552,613398500.0,3


Unnamed: 0,snippet_channelTitle,total_viewCount,mean_viewCount,min_viewCount,max_viewCount,stdViewCount,songsCount
0,EminemVEVO,4181548740,836309700.0,35244992,2064352955,803242800.0,5
1,OneRepublicVEVO,3058853981,3058854000.0,3058853981,3058853981,,1
2,AviciiOfficialVEVO,2881230160,720307500.0,73376915,1963215194,847749200.0,4
3,Passenger,2789082191,2789082000.0,2789082191,2789082191,,1
4,Macklemore LLC,2541772489,847257500.0,252792552,1477980957,613398500.0,3


8. What are the top 5 channels by number of songs on our service?

In [0]:
songs.groupBy('snippet_channelTitle').agg(F.sum("statistics_viewCount"), F.mean("statistics_viewCount"), F.max("statistics_viewCount"), F.min("statistics_viewCount"), F.stddev("statistics_viewCount"), F.count("statistics_viewCount").alias('Bob')).orderBy(F.desc('Bob')).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,sum(statistics_viewCount),avg(statistics_viewCount),max(statistics_viewCount),min(statistics_viewCount),stddev_samp(statistics_viewCount),Bob
0,TheSoundYouNeed,1713339950,42833500.0,512730266,481371,109002300.0,40
1,Majestic Casual,271966905,7770483.0,54628130,329645,14007840.0,35
2,Spinnin' Records,2256309801,125350500.0,1425286294,182933,345579600.0,18
3,MrSuicideSheep,149500698,8305594.0,96501569,539727,22397030.0,18
4,FAUVE,66064873,3886169.0,12990703,223988,4195480.0,17


Unnamed: 0,snippet_channelTitle,total_viewCount,mean_viewCount,min_viewCount,max_viewCount,stdViewCount,songsCount
0,TheSoundYouNeed,1713339950,42833500.0,481371,512730266,109002300.0,40
1,Majestic Casual,271966905,7770483.0,329645,54628130,14007840.0,35
2,Spinnin' Records,2256309801,125350500.0,182933,1425286294,345579600.0,18
3,MrSuicideSheep,149500698,8305594.0,539727,96501569,22397030.0,18
4,FAUVE,66064873,3886169.0,223988,12990703,4195480.0,17


9. Scatter plot log of `meanViewCount` vs log of `trackCount`

In [0]:
songs.groupBy('snippet_channelTitle').agg(F.log(F.mean("statistics_viewCount")), F.log(F.count("statistics_viewCount"))).limit(5).toPandas()

Unnamed: 0,snippet_channelTitle,ln(avg(statistics_viewCount)),ln(count(statistics_viewCount))
0,"Being part of something special, makes you spe...",16.640813,1.386294
1,Grimes,17.32638,0.693147
2,Sargent House,11.727101,0.0
3,WildBeastsVEVO,13.27333,0.0
4,ParetoPark,8.033658,0.0


In [0]:
df_tmp = songs.groupBy('snippet_channelTitle').agg(F.log(F.mean("statistics_viewCount")), F.log(F.count("statistics_viewCount"))).limit(20)
df_tmp = df_tmp.drop("snippet_channelTitle")
display(df_tmp)

ln(avg(statistics_viewCount)),ln(count(statistics_viewCount))
16.640813076138894,1.3862943611198906
17.32637973833333,0.6931471805599453
11.727100922878089,0.0
13.27333013678918,0.0
8.03365842788615,0.0
16.002462889304873,0.0
10.482730882413174,1.3862943611198906
7.428333194190806,0.0
7.247080584585756,0.0
18.212742405255103,0.0


Databricks visualization. Run in Databricks to view.

ln(mean_viewCount),ln(songsCount)
16.640813076138894,1.3862943611198906
17.32637973833333,0.6931471805599453
11.727100922878089,0.0
13.27333013678918,0.0
8.03365842788615,0.0
16.002462889304873,0.0
10.482730882413174,1.3862943611198906
7.428333194190806,0.0
7.247080584585756,0.0
18.212742405255103,0.0
