# Data Analytics - Datathon 2024

## Libraries

In [0]:
from pyspark.sql.functions import *
import datetime

# UDFS

In [0]:
def cal_lat_log_dist(df, lat1, long1, lat2, long2,
                     earth_radius_km=6371.0, output_col='distance_in_kms'):
    """Append the great-circle distance between two coordinate pairs.

    The distance is computed with the spherical law of cosines. All four
    coordinate columns are passed through ``radians``, i.e. they are treated
    as decimal degrees.

    Args:
        df: Spark DataFrame holding the four coordinate columns.
        lat1: Name of the column with the first point's latitude.
        long1: Name of the column with the first point's longitude.
        lat2: Name of the column with the second point's latitude.
        long2: Name of the column with the second point's longitude.
        earth_radius_km: Sphere radius used to scale the central angle.
            Defaults to 6371.0, the value that used to be hard-coded.
        output_col: Name of the column to add. Defaults to
            ``'distance_in_kms'``.

    Returns:
        ``df`` with ``output_col`` added, rounded to 4 decimal places.
    """
    lat1_rad = radians(col(lat1))
    lat2_rad = radians(col(lat2))
    long_delta_rad = radians(col(long1)) - radians(col(long2))

    cos_angle = (
        sin(lat1_rad) * sin(lat2_rad)
        + cos(lat1_rad) * cos(lat2_rad) * cos(long_delta_rad)
    )

    # Floating-point rounding can push the cosine marginally outside [-1, 1]
    # (typically for identical or near-identical points), which makes acos
    # return NaN. Clamp it back into range. `when` is used instead of
    # least/greatest because those skip nulls and would turn a missing
    # coordinate into a bogus distance; the isnan guard keeps NaN inputs NaN.
    cos_angle_clamped = (
        when((cos_angle > 1.0) & ~isnan(cos_angle), lit(1.0))
        .when(cos_angle < -1.0, lit(-1.0))
        .otherwise(cos_angle)
    )

    return df.withColumn(
        output_col,
        round(acos(cos_angle_clamped) * lit(earth_radius_km), 4),
    )

## Loading Data

In [0]:
# Read the three bronze-layer tables used throughout the notebook.
GDELT_EVENTS, PORT_LOCATIONS_DIM, CAMEO_DICTIONARY = (
    spark.sql(query)
    for query in (
        "SELECT * FROM BRONZE.GDELT_EVENTS",
        "SELECT * FROM BRONZE.PORTS_DICTIONARY",
        "SELECT * FROM BRONZE.CAMEO_DICTIONARY",
    )
)

## Cleaning PORT_LOCATIONS_DIM

In [0]:
# Normalise the port coordinates into signed numeric columns.
#
# LATITUDE / LONGITUDE arrive as text whose last character is the hemisphere
# letter. Steps:
#   1. drop rows with a missing coordinate and strip embedded spaces;
#   2. pull the trailing letter into Lat_Ori / Long_Ori;
#   3. build LATITUDE_CORRECTED / LONGITUDE_CORRECTED from the text before the
#      letter, negated for S and W.
# Any other trailing letter gets the sentinel 999.999 so the row can be found
# and checked. Earlier versions accepted 'E' as a latitude suffix and filled
# the longitude from the latitude when Lat_Ori was 'N'; both produced wrong
# coordinates and now fall through to the sentinel instead.
PORT_LOCATIONS_DIM_CLEANED = (
    PORT_LOCATIONS_DIM
    .filter("LATITUDE IS NOT NULL")
    .filter("LONGITUDE IS NOT NULL")
    .withColumn("LATITUDE", regexp_replace(col("LATITUDE"), " ", ""))
    .withColumn("LONGITUDE", regexp_replace(col("LONGITUDE"), " ", ""))
    .withColumn("Lat_Ori", substring(col("LATITUDE"), -1, 1))
    .withColumn("Long_Ori", substring(col("LONGITUDE"), -1, 1))
    # Explicit casts keep every branch numeric so the result column is a
    # double rather than depending on implicit string/number coercion.
    .withColumn(
        "LATITUDE_CORRECTED",
        when(col("Lat_Ori") == 'S',
             expr("substring(LATITUDE,1,length(LATITUDE) - 1 )").cast("double") * -1)
        .when(col("Lat_Ori") == 'N',
              expr("substring(LATITUDE,1,length(LATITUDE) - 1 )").cast("double"))
        .otherwise(lit(999.999))  # ID FOR CHECKING
    )
    .withColumn(
        "LONGITUDE_CORRECTED",
        when(col("Long_Ori") == 'E',
             expr("substring(LONGITUDE,1,length(LONGITUDE) - 1 )").cast("double"))
        .when(col("Long_Ori") == 'W',
              expr("substring(LONGITUDE,1,length(LONGITUDE) - 1 )").cast("double") * -1)
        .otherwise(lit(999.999))  # ID FOR CHECKING
    )
    #.select("COUNTRY","PORT","LATITUDE_CORRECTED","LONGITUDE_CORRECTED")
)

# 

## DATA ANALYSIS FOR WHOLE COUNTRIES IN THE TRANSPACIFIC ROUTE:
* CANADA
* USA
* CHINA
* JAPAN
* SOUTH KOREA
* TAIWAN
* VIETNAM
* HONG KONG

#### Dataset with Countries of Interest

In [0]:
# Restrict the events to the transpacific-route countries listed above.
# ActionGeo_CountryCode is matched against two-letter codes; "TW" is included
# so that Taiwan, which is in the list of countries of interest, is no longer
# dropped by the filter.
#
# NOTE: GDELT_EVENTS_DATE is created in the "Dates" section further down this
# notebook. That cell must run before this one, otherwise this raises a
# NameError on a fresh kernel.
GDELT_EVENTS_TPR = (
    GDELT_EVENTS_DATE
    .filter(
        col("ActionGeo_CountryCode").isin(
            "US", "CA", "VM", "CH", "JA", "HK", "KS", "TW"
        )
    )
)

# Row count per country code as a quick sanity check of the filter.
display(GDELT_EVENTS_TPR.groupBy("ActionGeo_CountryCode").count())

Databricks visualization. Run in Databricks to view.

# Analysis of GoldsteinScale by type of event

In [0]:
# Average GoldsteinScale for each EventRootCode in the countries of interest.
display(
    GDELT_EVENTS_TPR
    .groupBy("EventRootCode")
    .agg(avg("GoldsteinScale"))
)

### Data aggregation by date: daily, weekly, monthly

In [0]:
# Preview the events after filtering on ActionGeo_CountryCode.
display(GDELT_EVENTS_TPR)

In [0]:
# Attach a coarse label derived from the sign of GoldsteinScale.
# The previous call, `when(col("GoldsteinScale"))`, had no value argument and
# raised a TypeError. The sign-based buckets below are a first cut — adjust
# the thresholds and label text if a finer scale is needed.
# Rows with a null GoldsteinScale match no branch and keep a null label.
display(
    GDELT_EVENTS_TPR
    .withColumn(
        "GoldsteinScaleLabel",
        when(col("GoldsteinScale") > 0, "Positive")
        .when(col("GoldsteinScale") < 0, "Negative")
        .when(col("GoldsteinScale") == 0, "Neutral"),
    )
    #.groupby("Date","ActionGeo_CountryCode").agg(avg("GoldsteinScale").alias("GoldsteinScale"))
)

### Dates

In [0]:
# Add calendar columns to the raw events and keep 2023 onwards:
#   Date      - `Day` parsed as yyyyMMdd
#   YearWeek  - week-of-year number of Date
#   Date_Week - "<Date>_<YearWeek>"
# NOTE(review): Date_Week embeds the full date, so it takes a distinct value
# per day rather than per week — confirm this is the intended grouping key.
GDELT_EVENTS_DATE = (
    GDELT_EVENTS
    .withColumn("Date", to_date(col("Day").cast("string"), "yyyyMMdd"))
    .where(col("Date") >= '2023-01-01')
    .withColumn("YearWeek", weekofyear(col("Date")))
    .withColumn("Date_Week", concat(col("Date"), lit("_"), col("YearWeek")))
)

# Latest and earliest date left after the filter.
# `max` / `min` here are the Spark aggregates brought in by the star import.
MAX_AND_MIN_DATE = GDELT_EVENTS_DATE.agg(max("Date"), min("Date"))
MAX_AND_MIN_DATE.show()

In [0]:
# Inspect the cleaned port table, including the *_CORRECTED coordinate columns.
display(PORT_LOCATIONS_DIM_CLEANED)

In [0]:
# Events that carry both action coordinates, trimmed to the location and
# source columns needed for matching against ports.
GDELT_EVENTS_INTEREST = (
    GDELT_EVENTS_DATE
    .where("ActionGeo_Lat is not null")
    .where("ActionGeo_Long is not null")
    .select(
        [
            "ActionGeo_Fullname",
            "ActionGeo_CountryCode",
            "ActionGeo_Lat",
            "ActionGeo_Long",
            "SOURCEURL",
        ]
    )
)

In [0]:
# Preview the geolocated events and the columns kept for the port matching.
display(GDELT_EVENTS_INTEREST)

In [0]:
# Pair every event with country code 'CH' with every port whose COUNTRY is
# 'CHINA'. This is a cross join, so the result has events x ports rows.
GDELT_CROSS_PORT = GDELT_EVENTS_INTEREST.filter("ActionGeo_CountryCode == 'CH'").crossJoin(
    PORT_LOCATIONS_DIM_CLEANED.filter("COUNTRY == 'CHINA'")
)

# Distance from each event location to each port.
GDELT_CROSS_PORT_DISTANCE = cal_lat_log_dist(
    df=GDELT_CROSS_PORT,
    lat1='ActionGeo_Lat',
    long1='ActionGeo_Long',
    lat2='LATITUDE_CORRECTED',
    long2='LONGITUDE_CORRECTED',
)

display(GDELT_CROSS_PORT_DISTANCE)

## Data Analysis (DA)

In [0]:
# TODO: define GDELT_EVENTS_BASE. The assignment was left without a right-hand
# side (`GDELT_EVENTS_BASE =`), which is a SyntaxError, so it is parked here as
# a comment until the base frame is decided. No other cell visible in this
# notebook reads the name.

In [0]:
# Number of distinct GlobalEventID values in the date-filtered events.
(
    GDELT_EVENTS_DATE
    .select("GlobalEventID")
    .distinct()
    .count()
)

In [0]:
# Preview the date-enriched events (Date, YearWeek and Date_Week columns added).
display(GDELT_EVENTS_DATE)

In [0]:
# Events whose location name contains the word " Port ".
# The closing parenthesis had been commented out together with the filters,
# which left the expression unterminated (SyntaxError). It is closed again and
# the location-name filter is restored; this is the same predicate the
# quad-class aggregation cell applies. The SOURCEURL variant stays disabled.
GDELT_EVENTS_PORTS = (
    GDELT_EVENTS_DATE
    .filter(col("ActionGeo_Fullname").like("% Port %"))
    #.filter(col("SOURCEURL").like("%-port-%"))
)

#print("Number of observations with the name a location with name port in it: ",GDELT_EVENTS_PORTS.count())

In [0]:
# Count of port-related events per (ADM1 code, Date_Week), one column per
# QuadClass value.
# The pivot values are listed explicitly so that columns "1".."4" always
# exist, even when a class is absent from the filtered data. The next cell
# selects those four columns by name. Listing them also saves Spark the extra
# pass it would otherwise run to discover the distinct values.
PLACE_CUADCLASSCOUNT = (
    GDELT_EVENTS_DATE
    .filter(col("ActionGeo_Fullname").like("% Port %"))
    .groupby("ActionGeo_ADM1Code", "Date_Week")
    .pivot("QuadClass", [1, 2, 3, 4])
    .count()
)

# Average GoldsteinScale per (ADM1 code, Date_Week).
# NOTE(review): this is computed over all events, not only the " Port " subset
# used for the counts above — confirm that is intended.
GLDSCALE = (
    GDELT_EVENTS_DATE
    .groupby("ActionGeo_ADM1Code", "Date_Week")
    .agg(avg("GoldsteinScale").alias("GoldsteinScale"))
)

# Attach the average score to the class counts. fillna(0) turns the nulls that
# the pivot leaves for classes with no events into zero counts.
ADDEDE = (
    PLACE_CUADCLASSCOUNT
    .join(GLDSCALE, ["ActionGeo_ADM1Code", "Date_Week"], "left")
    .fillna(0)
)

In [0]:
# Row-wise maximum across the four QuadClass count columns.
_quad_cols = ADDEDE.select("1", "2", "3", "4").columns
max_value = greatest(*(col(_name) for _name in _quad_cols))

# Label each row with the first QuadClass column (in 1..4 order) whose count
# equals the row maximum; ties therefore resolve to the lowest class number.
_max_label = None
for _name in ("1", "2", "3", "4"):
    _is_max = col(_name) == max_value
    if _max_label is None:
        _max_label = when(_is_max, _name)
    else:
        _max_label = _max_label.when(_is_max, _name)

df_with_max_column = ADDEDE.withColumn("Max_QuadClass", _max_label)

Databricks visualization. Run in Databricks to view.

In [0]:
# Show the per-group QuadClass counts together with the Max_QuadClass label.
display(df_with_max_column)

Databricks visualization. Run in Databricks to view.