In [0]:
import pyspark.sql.functions as F
import os

# Storage credentials are read from the environment so that no secret is
# hardcoded in the notebook.
STORAGE_ACCOUNT = os.getenv('STORAGE_ACCOUNT')
STORAGE_ACCOUNT_KEY = os.getenv('STORAGE_ACCOUNT_KEY')

# os.getenv returns None for an unset variable; stop here with a readable
# message (variable names only, never the values) instead of passing None on
# to spark.conf.set.
_missing_vars = [
    name
    for name, value in (
        ('STORAGE_ACCOUNT', STORAGE_ACCOUNT),
        ('STORAGE_ACCOUNT_KEY', STORAGE_ACCOUNT_KEY),
    )
    if not value
]
if _missing_vars:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(_missing_vars)
    )

# NOTE(review): STORAGE_ACCOUNT is used verbatim as the Spark configuration
# key, so it presumably holds the full property name rather than just the
# account name -- confirm against the cluster/environment setup.
spark.conf.set(STORAGE_ACCOUNT, STORAGE_ACCOUNT_KEY)


In [0]:
# Load the raw CSV from the data lake: the first row is treated as the header
# and Spark is asked to infer the column types.
pacific_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("abfss://datasets@tfmstorageacc.dfs.core.windows.net/violence_against_women_pacific.csv")
)

# Render the loaded frame in the notebook output.
pacific_df.display()

There are a lot of columns and statistics. The first thing we need to do is select the statistics we are interested in, so let's see which ones are available.

In [0]:
# Show every distinct combination of the two topic columns.
(
    pacific_df
    .select('TOPIC10', 'Topic11')
    .distinct()
    .display()
)

The topics we are interested in are:
- VAW_TOPIC_001: Types of violence against women by partner
- VAW_TOPIC_007: Types of violence against women by others (non-partners)
- VAW_TOPIC_010: Child sexual abuse prevalence by type of perpetrator

We also don't want null values, so rows whose observed value (`OBS_VALUE`) is null are dropped.

In [0]:
# Keep only the three topics of interest and the rows that actually carry an
# observed value, then project and rename the columns used from here on.
# Rows are sorted newest year first, then by country.
pacific_df = (
    pacific_df
    .filter(F.col('TOPIC10').isin('VAW_TOPIC_001', 'VAW_TOPIC_007', 'VAW_TOPIC_010'))
    .filter(F.col('OBS_VALUE').isNotNull())
    .select(
        F.col('TIME_PERIOD').alias('year'),
        F.col('Pacific Island Countries and territories').alias('country'),
        F.col('Type of violence').alias('violence_type'),
        F.col('OBS_VALUE').alias('value_perc'),
        F.col('Perpetrator23').alias('perpetrator'),
    )
    .sort(F.desc('year'), 'country')
)

pacific_df.display()

Some rows are now repeated, with only the value differing; this happens because they differ in fields that were left out of the selection. To remove these duplicates we keep the highest percentage for each combination of year, country, violence type and perpetrator, as that is the best we can do to reduce the number of cases lost.

In [0]:
# Collapse rows that share the same year, country, violence type and
# perpetrator into one, keeping the largest value_perc of each group.
pacific_df = (
    pacific_df
    .groupBy(['year', 'country', 'violence_type', 'perpetrator'])
    .agg(F.max('value_perc').alias('value_perc'))
)

pacific_df.display()