In [0]:
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook; getOrCreate() reuses an
# already-running session when one exists instead of starting a new one.
spark = (
    SparkSession.builder
    .appName('Spark Mini Project')
    .getOrCreate()
)

In [0]:
# Echo the SparkSession object as this cell's output.
spark

In [0]:
# Load the raw customers CSV. The header option makes the first line of the
# file supply the column names; no schema is given or inferred here, so the
# columns are cast to their real types in a later cell.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("dbfs:/FileStore/shared_uploads/dheerajmaddi@gmail.com/customers_1mb.csv")
)

In [0]:
# Preview the first five rows of the raw data.
df.show(n=5)

In [0]:
# Print the column names and data types of the raw DataFrame.
df.printSchema()

In [0]:
from pyspark.sql.functions import to_date, col, year, month, countDistinct, count

In [0]:
# Give the two mistyped columns proper types:
#   registration_date -> date (parsed with the yyyy-MM-dd pattern)
#   is_active         -> boolean
df_updated_schema = (
    df
    .withColumn('registration_date', to_date(col('registration_date'), 'yyyy-MM-dd'))
    .withColumn('is_active', col('is_active').cast('boolean'))
)

In [0]:
# Print the schema again to check the result of the date/boolean casts.
df_updated_schema.printSchema()

In [0]:
# Preview the first five rows after the type conversion.
df_updated_schema.show(n=5)

In [0]:
# Apply the same date/boolean casts directly to the raw DataFrame and preview
# five rows, without keeping the converted result in a variable.
(
    df
    .withColumn('registration_date', to_date(col('registration_date'), 'yyyy-MM-dd'))
    .withColumn('is_active', col('is_active').cast('boolean'))
    .show(5)
)

In [0]:
# Number of partitions of the RDD underlying the raw DataFrame.
df.rdd.getNumPartitions()

In [0]:
# The SparkContext's default minimum partition count, shown for reference.
spark.sparkContext.defaultMinPartitions

In [0]:
# The SparkContext's default parallelism, shown for reference.
spark.sparkContext.defaultParallelism

In [0]:
# Replace missing values in the three location columns with the
# placeholder 'Unknown'; all other columns are left untouched.
df_updated_schema = df_updated_schema.na.fill(
    dict.fromkeys(['city', 'state', 'country'], 'Unknown')
)

In [0]:
# Derive the calendar year and month of registration as separate columns.
df_updated_schema = (
    df_updated_schema
    .withColumn('registration_year', year(col('registration_date')))
    .withColumn('registration_month', month(col('registration_date')))
)

In [0]:
# Preview five rows, now including the derived year/month columns.
df_updated_schema.show(n=5)

In [0]:
# Distinct values of the city column, collected to the driver as a Python
# list of Row objects, then rendered with the notebook's display helper.
unique_cities = (
    df_updated_schema
    .select(col('city'))
    .distinct()
    .collect()
)
display(unique_cities)

In [0]:
# Show the raw collected result: a Python list of Row objects.
unique_cities

In [0]:
# First Row of the collected list (IndexError if the list is empty).
unique_cities[0]

In [0]:
# First field of the first Row, i.e. the bare city value (only 'city' was selected).
unique_cities[0][0]

In [0]:
# Count how many distinct city values exist; the aggregate is collected to
# the driver as a list of Row objects and rendered with display().
unique_cities_count = (
    df_updated_schema
    .select(countDistinct(col('city')))
    .collect()
)
display(unique_cities_count)

In [0]:
# Show the raw collected result of the distinct-city aggregate.
unique_cities_count

In [0]:
# Unwrap the scalar: first field of the first Row holds the distinct-city count.
unique_cities_count[0][0]

In [0]:
# Number of distinct state values, collected to the driver and displayed.
unique_states_count = (
    df_updated_schema
    .select(countDistinct(col('state')))
    .collect()
)
display(unique_states_count)

In [0]:
# Number of distinct country values; the aggregate column is given the
# readable label 'Total Countries' before being collected and displayed.
unique_countries_count = (
    df_updated_schema
    .select(countDistinct(col('country')).alias('Total Countries'))
    .collect()
)
display(unique_countries_count)

In [0]:
# Five cities with the most customer rows, largest count first.
(
    df_updated_schema
    .groupBy('city')
    .count()
    .orderBy('count', ascending=False)
    .show(5)
)

In [0]:
# Customer rows per (state, country) pair, largest count first.
(
    df_updated_schema
    .groupBy('state', 'country')
    .count()
    .orderBy(col('count').desc())
    .show()
)

In [0]:
# Pivot table: count of active and inactive users per state.
# Each row is a state; pivot() turns every distinct is_active value into its
# own column holding the row count for that state.
(
    df_updated_schema
    .groupBy(col('state'))
    .pivot('is_active')
    .count()
    .show()
)

In [0]:
from pyspark.sql import Window
from pyspark.sql.functions import rank, dense_rank, row_number, lit, min, max

In [0]:
# Window over each state, most recent registration first.
window_spec = (
    Window
    .partitionBy('state')
    .orderBy(col('registration_date').desc())
)

# Add the three ranking flavours over that window. The window is ordered by
# registration_date only, so rows sharing a date within a state tie for
# rank/dense_rank, and row_number orders such ties arbitrarily.
df_updated_schema = (
    df_updated_schema
    .withColumn('rank', rank().over(window_spec))
    .withColumn('dense_rank', dense_rank().over(window_spec))
    .withColumn('row_number', row_number().over(window_spec))
)

In [0]:
# Preview ten rows, including the new ranking columns.
df_updated_schema.show(n=10)

In [0]:
# Keep only customers whose registration date is on or after 2023-07-01.
df_recent_customers = df_updated_schema.where(
    col('registration_date') >= lit('2023-07-01')
)

# First preview: the five earliest registrations within that subset.
df_recent_customers.orderBy('registration_date').show(n=5)
# Second preview: five rows with no explicit ordering applied.
df_recent_customers.show(n=5)

In [0]:
# Total number of rows in the enriched DataFrame.
df_updated_schema.count()

In [0]:
# Number of rows in the recent-customers subset.
df_recent_customers.count()

In [0]:
# Oldest and newest customer per city: earliest and latest registration_date.
# NOTE: `min` / `max` here are the pyspark.sql.functions aggregates imported
# earlier in the notebook, not the Python builtins of the same name.
(
    df_updated_schema
    .groupBy('city')
    .agg(
        min(col('registration_date')).alias('oldest'),
        max(col('registration_date')).alias('newest'),
    )
    .show()
)

In [0]:
# Number of partitions of the RDD underlying the enriched DataFrame.
df_updated_schema.rdd.getNumPartitions()

In [0]:
# Persist the enriched DataFrame as Parquet; 'overwrite' replaces any data
# already present at the destination.
output_path = '/FileStore/tables/processed_customers'
df_updated_schema.write.parquet(output_path, mode='overwrite')

In [0]:
# Read back the Parquet dataset written in the previous cell. The location is
# taken from `output_path` rather than re-typed as a second, differently
# spelled literal, so the read always targets the directory that was written.
df_processed_customers = spark.read.parquet(output_path)

In [0]:
# Print the schema of the DataFrame read back from Parquet.
df_processed_customers.printSchema()

In [0]:
# Prints a fixed label. NOTE(review): presumably records which Databricks
# edition the notebook was run on; confirm, or remove the cell if unneeded.
print('Databricks Free Edition')