In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.appName("Data_Visualizations").getOrCreate()

In [3]:
file_location = "movie_data_part1.csv"
file_type = "csv"
infer_schema="False"
first_row_is_header="True"
delimeter="|"

In [4]:
df = spark.read.format(file_type)\
.option("inferSchema",infer_schema)\
.option("header",first_row_is_header)\
.option("sep",delimeter)\
.load(file_location)

In [5]:
select_columns = ['id','budget','popularity','release_date','revenue','title']

In [6]:
df=df.select(*select_columns)

In [10]:
df_with_newcols = df.select('id','budget','popularity').withColumn('budget_cat',when(df['budget']<100000000,'Small').when(df['budget']<1000000000,'Medium').otherwise('Big')).withColumn('ratings',when(df['popularity']<3,'Low').when(df['popularity']<5,'Mid').otherwise('High'))
                                                                    

In [11]:
df_with_newcols = df_with_newcols.withColumn("BudgetRating_Category", concat(df_with_newcols.budget_cat,df_with_newcols.ratings))

In [12]:
df_with_newcols=df_with_newcols.withColumn("BudgetRating_Category",trim(lower(df_with_newcols.BudgetRating_Category)))

In [13]:
df_with_newcols.show()

+-----+-------+------------------+----------+-------+---------------------+
|   id| budget|        popularity|budget_cat|ratings|BudgetRating_Category|
+-----+-------+------------------+----------+-------+---------------------+
|43000|      0|             2.503|     Small|    Low|             smalllow|
|43001|      0|              5.51|     Small|   High|            smallhigh|
|43002|      0|              5.62|     Small|   High|            smallhigh|
|43003|      0|             7.159|     Small|   High|            smallhigh|
|43004| 500000|             3.988|     Small|    Mid|             smallmid|
|43006|      0|             3.194|     Small|    Mid|             smallmid|
|43007|      0|             2.689|     Small|    Low|             smalllow|
|43008|      0|             6.537|     Small|   High|            smallhigh|
|43010|      0|             4.297|     Small|    Mid|             smallmid|
|43011|      0|             4.417|     Small|    Mid|             smallmid|
|43012|70000

# Register a Temporary table

In [15]:
df_with_newcols.createOrReplaceTempView('temp_data')

In [17]:
spark.sql('select\
                ratings,\
                count(ratings)\
            from\
                temp_data group by ratings').show()

+-------+--------------+
|ratings|count(ratings)|
+-------+--------------+
|   High|         16856|
|    Low|         14865|
|    Mid|         12277|
+-------+--------------+



# Importing window functions

In [18]:
from pyspark.sql.window import *

In [19]:
#Filtering missing values
df_with_newcols = df_with_newcols.filter((df_with_newcols['popularity'].isNotNull()) & (~isnan(df_with_newcols['popularity'])))

In [21]:
df_with_newcols = df_with_newcols.select("id","budget","popularity",ntile(10).over(Window.partitionBy().orderBy(df_with_newcols['popularity'].desc())).alias("decile_rank"))

In [22]:
df_with_newcols.groupby("decile_rank").agg(min('popularity').alias('min_popularity'),max('popularity').alias('max_popularity'),count('popularity')).show()

+-----------+------------------+--------------+-----------------+
|decile_rank|    min_popularity|max_popularity|count(popularity)|
+-----------+------------------+--------------+-----------------+
|          1|             7.402|            99|             4379|
|          2|             5.792|         7.401|             4379|
|          3|             4.792|         5.792|             4379|
|          4|             4.024|         4.792|             4378|
|          5|             3.371|         4.024|             4378|
|          6|             2.779|          3.37|             4378|
|          7|             2.108|         2.779|             4378|
|          8|            10.422|         2.108|             4378|
|          9|             1.389|         10.42|             4378|
|         10|0.6000000000000001|         1.389|             4378|
+-----------+------------------+--------------+-----------------+

