In [1]:
import pyspark
import pandas as pd
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DateType 

# Reuse the already-running Spark session if there is one; otherwise create it.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
def squared(x):
    """Return ``x`` squared as a Python float, or ``None`` for a null input.

    The result is cast to ``float`` because the UDF below declares
    ``FloatType()`` as its return type, and the object a Python UDF produces
    must match the declared return type.
    """
    if x is None:
        return None
    return float(x) ** 2


# Spark UDF wrapping `squared`; yields a FloatType column.
# The lambda resolves `squared` lazily rather than when this line runs.
squared_udf = udf(lambda x: squared(x), FloatType())

In [2]:
# Explicit schema for the CSV read: one entry per column, all nullable.
schema = (
    StructType()
    .add('title', StringType(), True)
    .add('category_name', StringType(), True)
    .add('category_id', StringType(), True)
    .add('followers', IntegerType(), True)
    .add('videos', IntegerType(), True)
    .add('country', StringType(), True)
    .add('picture_url', StringType(), True)
    .add('profile_url', StringType(), True)
)

In [3]:
# Load the CSV with the explicit schema; the first line of the file is a header row.
csv_reader = spark.read.schema(schema).option('header', 'true')
df = csv_reader.csv('youtube.csv')

In [4]:
# Print the DataFrame's column names, types and nullability as a tree.
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- category_name: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- followers: integer (nullable = true)
 |-- videos: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- picture_url: string (nullable = true)
 |-- profile_url: string (nullable = true)



In [5]:
# Preview the first five rows, printing each record vertically (one field per line).
df.show(n=5, vertical=True)

-RECORD 0-----------------------------
 title         | PewDiePie            
 category_name | Entertainment        
 category_id   | 24                   
 followers     | 69896406             
 videos        | 3649                 
 country       | United States        
 picture_url   | https://yt3.ggpht... 
 profile_url   | http://www.youtub... 
-RECORD 1-----------------------------
 title         | T-Series             
 category_name | Music                
 category_id   | 10                   
 followers     | 69471946             
 videos        | 12820                
 country       | India                
 picture_url   | https://yt3.ggpht... 
 profile_url   | http://www.youtub... 
-RECORD 2-----------------------------
 title         | Justin Bieber        
 category_name | Entertainment        
 category_id   | 24                   
 followers     | 41858494             
 videos        | 132                  
 country       | null                 
 picture_url   | https://

In [6]:
# Register the DataFrame as the temporary view "youtube" so the SQL cells below can query it.
df.createOrReplaceTempView("youtube")

In [7]:
# Average follower count per category, highest average first.
avg_followers_query = '''
SELECT category_name, AVG(followers) as avg_followers 
FROM youtube
GROUP BY category_name
ORDER BY avg_followers DESC;
'''

avg_followers_by_category = spark.sql(avg_followers_query)
# truncate=False keeps long category names and full-precision averages visible.
avg_followers_by_category.show(truncate=False)

+---------------------+------------------+
|category_name        |avg_followers     |
+---------------------+------------------+
|Comedy               |431641.4434728295 |
|Entertainment        |360062.7634681559 |
|Music                |311641.8089213382 |
|Education            |265221.8324882289 |
|Howto & Style        |243301.51093937084|
|Film and Animation   |234370.68145423068|
|News & Politics      |221840.84918918918|
|Gaming               |213334.0716175237 |
|Science & Technology |197664.78243398393|
|Sports               |188560.80100882723|
|People & Blogs       |171038.41552575864|
|Pets & Animals       |169964.69204545455|
|Autos & Vehicles     |122564.4680356451 |
|Travel & Events      |102899.29069767441|
|Nonprofits & Activism|101005.39520958083|
+---------------------+------------------+



In [9]:
# Total follower count per category, largest total first.
sum_followers_query = '''
SELECT category_name, SUM(followers) as sum_followers 
FROM youtube
GROUP BY category_name
ORDER BY sum_followers DESC;
'''

sum_followers_by_category = spark.sql(sum_followers_query)
# truncate=False keeps long category names fully visible.
sum_followers_by_category.show(truncate=False)

+---------------------+-------------+
|category_name        |sum_followers|
+---------------------+-------------+
|Entertainment        |6382832608   |
|Music                |6231901253   |
|Gaming               |3306464776   |
|People & Blogs       |2423614348   |
|Howto & Style        |2235210981   |
|Education            |1464554959   |
|Comedy               |1382115902   |
|Film and Animation   |1083026919   |
|Sports               |747643576    |
|Science & Technology |688664102    |
|News & Politics      |410405571    |
|Autos & Vehicles     |316338892    |
|Pets & Animals       |149568929    |
|Travel & Events      |132740085    |
|Nonprofits & Activism|67471604     |
+---------------------+-------------+

