In [0]:
import pyspark.sql.functions as F
import os

# Storage credentials are read from the environment so that no secret is stored
# in the notebook. STORAGE_ACCOUNT is used as the Spark configuration key and
# STORAGE_ACCOUNT_KEY as its value.
# NOTE(review): presumably STORAGE_ACCOUNT holds the full
# `fs.azure.account.key.<account>.dfs.core.windows.net` property name rather
# than the bare account name -- confirm against the cluster configuration.
STORAGE_ACCOUNT = os.getenv('STORAGE_ACCOUNT')
STORAGE_ACCOUNT_KEY = os.getenv('STORAGE_ACCOUNT_KEY')

# Fail fast with a readable message instead of handing None (or an empty
# string) to Spark. Only the variable names are reported, never their values.
missing_env_vars = [
    name
    for name, value in (
        ('STORAGE_ACCOUNT', STORAGE_ACCOUNT),
        ('STORAGE_ACCOUNT_KEY', STORAGE_ACCOUNT_KEY),
    )
    if not value
]
if missing_env_vars:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_env_vars)
    )

spark.conf.set(STORAGE_ACCOUNT, STORAGE_ACCOUNT_KEY)


In [0]:
# Load the raw CSV from the storage account. The first line is treated as the
# header and column types are inferred by Spark.
turkey_df = (
    spark.read
    .option("inferSchema", "true")
    .option("header", "true")
    .csv("abfss://datasets@tfmstorageacc.dfs.core.windows.net/violence_against _women_turkey.csv")
)

# Preview the first rows to check that the file was parsed as expected.
turkey_df.limit(20).display()

_c0,Name,Date,Murder Motive,Murderer Name,Protection Request,Way Of Killing,News Source 1,News Source 2,Age of Victim,Province,Perpetrator Status,Notes
,Tuğçe Baran,29/09/2019,Not Determined,Boyfriend,No,Firearm,http://www.milliyet.com.tr/gundem/evinde-basindan-vurulan-tugce-hayatini-kaybetti-6042935,,of age,Izmir,Prisoner,suspicious Death
,Ebru Erdem,20/03/2019,Not Determined,Not Determined,Not Determined,Falling from high,http://www.milliyet.com.tr/istanbul-da-rezidansta-dehset-kan-gundem-galeri-2845198/,,of age,İstanbul,Investigation Continues,suspicious Death
,Songül Önemli,26/10/2019,Not Determined,Not Determined,No,Firearm,http://www.milliyet.com.tr/galeri/sir-dolu-olum-iki-genc-kadin-boyle-bulundu-6065264,,of age,Adiyaman,Not Determined,suspicious Death
,Cansu Güven,26/10/2019,Not Determined,Not Determined,No,Firearm,http://www.milliyet.com.tr/galeri/sir-dolu-olum-iki-genc-kadin-boyle-bulundu-6065264,,of age,Adiyaman,Not Determined,suspicious Death
,Maiko Dzidziguri,23.02.2018,Unspecified,Unknown,No,drowning,http://www.milliyet.com.tr/gurcu-kadinin-esrarengiz-olumu--gundem-2615313/,,of age,İstanbul,Unknown,suspicious Death
,Alara Karademir,29/03/2018,Discussion,Someone familiar,No,Darpa,http://m.ilerihaber.org/icerik/yogun-bakimdaki-alara-karademir-hayatini-kaybetti-83550.html,,of age,Ankara,Investigation Continues,suspicious Death
,Ganime Varsak,17/02/2018,Unknown,Unknown,No,burned,http://www.milliyet.com.tr/yanmis-cesedi-bulunan-kadinin-gundem-2611889/,,of age,Kırıkkale,Investigation,suspicious Death
,Melahat Mersin,16/06/2018,Comprise from not detected,Somebody knows,No,Not Determined,https://www.yeniasir.com.tr/yasam/2018/06/17/insallah-annem-degildir,,of age,Izmir,Prisoner,suspicious Death
,Bahar Akdemir,21/12/2019,Not Determined,Not Determined,No,Not Determined,https://www.cnnturk.com/turkiye/evinin-onunde-genc-kizin-cesedi-bulundu,,Not Rashid,Diyarbakir,Investigation Continues,suspicious Death
,Derya Tavşan,10/12/2019,Not Determined,Not Determined,No,Not Determined,http://www.hurriyet.com.tr/gundem/otel-odasinda-supheli-olum-41394166,,of age,Adana,Not Determined,suspicious Death


For this dataset we need to extract the year from the `Date` column and group the records by perpetrator and age group.


In [0]:
# Parse `Date` as day/month/year and keep only the calendar year. Values that
# do not match the pattern end up with an empty year in the preview.
victim_date = F.to_date(F.col('Date'), 'dd/MM/yyyy')
turkey_df = turkey_df.withColumn('year', F.year(victim_date))

# Show the raw date next to the derived year to spot parsing failures.
turkey_df.select('Date', 'year').limit(20).display()

Date,year
29/09/2019,2019.0
20/03/2019,2019.0
26/10/2019,2019.0
26/10/2019,2019.0
23.02.2018,
29/03/2018,2018.0
17/02/2018,2018.0
16/06/2018,2018.0
21/12/2019,2019.0
10/12/2019,2019.0


There are some incorrectly formatted dates in the dataset (for example `23.02.2018` instead of `23/02/2018`). Since the errors and typos are so varied, it is too difficult to clean them up.
We will drop those rows.

In [0]:
# Keep only the rows for which a year is available.
turkey_df = turkey_df.filter(F.col('year').isNotNull())

Now bin the victims' ages into age groups.

In [0]:
# Look at the distinct raw values of the age column before binning them.
turkey_df.select(F.col('Age of Victim')).distinct().limit(20).display()

Age of Victim
15
29
34
8
52
""
""
of age
31
18


In [0]:
# Numeric ages become 'child' (< 18) or 'adult'; values that cannot be cast to
# an integer are carried over unchanged so they can be inspected afterwards.
victim_age = F.col("Age of Victim").cast("int")
age_group_expr = (
    F.when(victim_age.isNull(), F.col("Age of Victim"))
    .when(victim_age < 18, 'child')
    .otherwise('adult')
)
turkey_df_grouped_age = turkey_df.withColumn('age_group', age_group_expr)

# List the resulting labels.
turkey_df_grouped_age.select('age_group').distinct().limit(20).display()

age_group
""
""
of age
Not Rashid
Unknown
adult
child


In [0]:
# Keep only the rows with a usable age label, then fold the two textual labels
# into the child/adult vocabulary: 'of age' -> 'adult', 'Not Rashid' -> 'child'.
# NOTE(review): 'Not Rashid' is presumably a mistranslation of "not of age"
# (i.e. a minor) -- confirm against the original data.
usable_labels = ['child', 'adult', 'of age', 'Not Rashid']
age_label = F.col('age_group')
turkey_df_grouped_age = (
    turkey_df_grouped_age
    .filter(age_label.isin(usable_labels))
    .withColumn(
        'age_group',
        F.when(age_label == 'of age', 'adult')
        .when(age_label == 'Not Rashid', 'child')
        .otherwise(age_label),
    )
)

# Only 'adult' and 'child' should remain.
turkey_df_grouped_age.select('age_group').distinct().limit(20).display()

age_group
adult
child


In [0]:
# Row count of turkey_df_grouped_age, to see how much data is left.
turkey_df_grouped_age.count()

308

Too little data is left after filtering on age, so we will bin all age groups into a single 'any' group. Now aggregate the cases by year and province.

In [0]:
# Count the cases per year and province. `country` is a constant label added so
# the result carries an explicit country column.
# NOTE(review): the narrative bins all ages into one 'any' group, but no
# `age_group` column is part of this output (a literal column added before the
# select would simply be discarded by it). If the column is wanted downstream,
# add it to the select and groupBy keys here and to every frame that is
# unioned with this one.
turkey_df_aggregated = (
    turkey_df
    .select(
        F.col('year'),
        F.lit('turkiye').alias('country'),  # this is the new name of the country
        F.col('Province').alias('province'),
    )
    .groupBy('year', 'country', 'province')
    .agg(F.count('*').alias('total_cases'))
)
turkey_df_aggregated.limit(20).display()


year,country,province,total_cases
2020,turkiye,Intellectual,8
2019,turkiye,Adana,21
2011,turkiye,Yozgat,1
2018,turkiye,Kars,4
2020,turkiye,Kirsehir,1
2019,turkiye,Kahramanmaras,4
2019,turkiye,Konya,1
2019,turkiye,Trabzon,1
2018,turkiye,Bayburt,1
2020,turkiye,Adiyaman,1


Now add, for each year, a total across all provinces.

In [0]:
# Collapse the per-province counts into a single 'all' row per year and
# country by replacing the province with a constant label and summing.
turkey_df_all = (
    turkey_df_aggregated
    .select(
        'year',
        'country',
        F.lit('all').alias('province'),
        'total_cases',
    )
    .groupBy('year', 'country', 'province')
    .agg(F.sum('total_cases').alias('total_cases'))
)

# Preview the per-province rows stacked with the totals. `union` matches
# columns by position; both frames are built as
# (year, country, province, total_cases).
turkey_df_aggregated.union(turkey_df_all).limit(20).display()

year,country,province,total_cases
2020,turkiye,Intellectual,8
2019,turkiye,Adana,21
2011,turkiye,Yozgat,1
2018,turkiye,Kars,4
2020,turkiye,Kirsehir,1
2019,turkiye,Kahramanmaras,4
2019,turkiye,Konya,1
2019,turkiye,Trabzon,1
2018,turkiye,Bayburt,1
2020,turkiye,Adiyaman,1
