# Data Analytics - Datathon 2024

## Libraries

In [0]:
# Loading Libraries for the notebook
from pyspark.sql.functions import *
import datetime
from pyspark.sql.window import Window

# UDFS

In [0]:

# Function to calculate the straight-line (great-circle) distance in kilometres between two points.

def cal_lat_log_dist(df, lat1, long1, lat2, long2):
    """Add a ``distance_in_kms`` column with the great-circle distance between two points.

    The distance is computed per row with the spherical law of cosines on a
    sphere of radius 6371.0 km and rounded to 4 decimals.

    Args:
        df: Spark DataFrame to which the ``distance_in_kms`` column is added.
        lat1: Name of the column holding the latitude of place 1 (degrees).
        long1: Name of the column holding the longitude of place 1 (degrees).
        lat2: Name of the column holding the latitude of place 2 (degrees).
        long2: Name of the column holding the longitude of place 2 (degrees).

    Returns:
        A new DataFrame equal to ``df`` plus the column ``distance_in_kms``.
        Rows with a null coordinate get a null distance.
    """
    earth_radius_km = 6371.0

    lat1_rad = radians(col(lat1))
    lat2_rad = radians(col(lat2))
    long1_rad = radians(col(long1))
    long2_rad = radians(col(long2))

    # Cosine of the central angle between the two points.
    cos_angle = (
        sin(lat1_rad) * sin(lat2_rad)
        + cos(lat1_rad) * cos(lat2_rad) * cos(long1_rad - long2_rad)
    )

    # Floating-point rounding can push the value marginally outside [-1, 1]
    # (typically for identical points), which would make acos() return NaN.
    # Clamp it, but let null / NaN inputs propagate untouched: `when` is used
    # instead of greatest()/least() because those skip nulls.
    cos_angle_clamped = (
        when(isnan(cos_angle), cos_angle)
        .when(cos_angle > 1.0, lit(1.0))
        .when(cos_angle < -1.0, lit(-1.0))
        .otherwise(cos_angle)
    )

    # NOTE: `round` here is pyspark.sql.functions.round (star import), not the builtin.
    return df.withColumn(
        'distance_in_kms',
        round(acos(cos_angle_clamped) * lit(earth_radius_km), 4),
    )

## Loading Data

In [0]:
# Read the three raw tables used by this notebook from the BRONZE database:
# GDELT events, the ports dictionary and the CAMEO code dictionary.

GDELT_EVENTS, PORT_LOCATIONS_DIM, CAMEO_DICTIONARY = (
    spark.sql("SELECT * FROM BRONZE.GDELT_EVENTS"),
    spark.sql("SELECT * FROM BRONZE.PORTS_DICTIONARY"),
    spark.sql("SELECT * FROM BRONZE.CAMEO_DICTIONARY"),
)

## Cleaning PORT_LOCATIONS_DIM

In [0]:
# Cleaning RAW data from PORT_LOCATIONS
#
# The code strips the last character of LATITUDE / LONGITUDE as a hemisphere
# letter and treats the rest as the magnitude (assumes values look like
# "<number><N|S|E|W>" - TODO confirm against the raw table).
# Signed convention produced here: N and E are positive, S and W are negative.
# Any other suffix yields the sentinel 999.999 so unexpected rows can be spotted.

PORT_LOCATIONS_DIM_CLEANED = (
    PORT_LOCATIONS_DIM
    .filter("LATITUDE IS NOT NULL")  # keep rows that have a latitude
    .filter("LONGITUDE IS NOT NULL")  # keep rows that have a longitude
    .withColumn("LATITUDE", regexp_replace(col("LATITUDE"), " ", ""))  # remove blank spaces in LATITUDE
    .withColumn("LONGITUDE", regexp_replace(col("LONGITUDE"), " ", ""))  # remove blank spaces in LONGITUDE
    .withColumn("Lat_Ori", substring(col("LATITUDE"), -1, 1))  # last character: hemisphere letter of the latitude
    .withColumn("Long_Ori", substring(col("LONGITUDE"), -1, 1))  # last character: hemisphere letter of the longitude
    .withColumn(
        "LATITUDE_CORRECTED",
        # Latitude only has N / S hemispheres. The magnitude is cast to double
        # explicitly so every branch has the same type instead of relying on
        # implicit string/double coercion between branches.
        when(col("Lat_Ori") == 'N', expr("substring(LATITUDE,1,length(LATITUDE) - 1 )").cast("double"))
        .when(col("Lat_Ori") == 'S', expr("substring(LATITUDE,1,length(LATITUDE) - 1 )").cast("double") * -1)
        .otherwise(999.999)  # sentinel: hemisphere letter not recognised
    )
    .withColumn(
        "LONGITUDE_CORRECTED",
        # Longitude only has E / W hemispheres and is always derived from the
        # LONGITUDE column (never from LATITUDE).
        when(col("Long_Ori") == 'E', expr("substring(LONGITUDE,1,length(LONGITUDE) - 1 )").cast("double"))
        .when(col("Long_Ori") == 'W', expr("substring(LONGITUDE,1,length(LONGITUDE) - 1 )").cast("double") * -1)
        .otherwise(999.999)  # sentinel: hemisphere letter not recognised
    )
    .select("COUNTRY", "PORT", "LATITUDE_CORRECTED", "LONGITUDE_CORRECTED")  # keep only the columns of interest
)

## DATA ANALYSIS FOR ALL COUNTRIES ON THE TRANSPACIFIC ROUTE:
* CANADA
* USA
* CHINA
* JAPAN
* SOUTH KOREA
* TAIWAN
* VIETNAM
* HONG KONG

#### Dataset with Countries of Interest

In [0]:
# Events located in the transpacific-route countries, enriched with the CAMEO
# description and with date helper columns.
# Country codes follow the GDELT (FIPS 10-4) convention:
# US = USA, CA = Canada, VM = Vietnam, CH = China, JA = Japan,
# HK = Hong Kong, KS = South Korea, TW = Taiwan.
# "TW" is included so the filter matches the list of countries of interest
# stated in the notebook (Taiwan was listed but previously filtered out).
GDELT_EVENTS_TPR = (
    GDELT_EVENTS
    .filter(col("ActionGeo_CountryCode").isin("US", "CA", "VM", "CH", "JA", "HK", "KS", "TW"))  # countries of interest
    .join(CAMEO_DICTIONARY, col("EventCode") == col("CAMEO CODE"), "left")  # attach the CAMEO description of EventCode
    .filter("DESCRIPTION is not null")  # drop events without a CAMEO description (makes the left join act as an inner join)
    .withColumn("Date", to_date(col("Day").cast("string"), "yyyyMMdd"))  # Day (yyyyMMdd) as a DATE column
    .filter("Date >= '2023-01-01'")  # keep events from 2023 onwards
    .withColumn("YearWeek", weekofyear(col("Date")))  # week number of the year
    .withColumn("MonthYearWeek", concat(col("MonthYear"), col("YearWeek")))  # id built from MonthYear followed by the week number
)

# Duplicate Information

We noticed that some SOURCEURL values are duplicated. This means there can be multiple interpretations of, or multiple actors involved in, the same global event. We verified that no two records are exactly identical, but duplicated URLs can still distort the "importance" of a news item: if the same article appears three times while another appears only once, any statistic we compute is biased by the different values recorded for what is really a single piece of news. We therefore measure the importance of each SOURCEURL by the number of articles written about that global event and, among the records that share a SOURCEURL, keep the one with the greatest NumArticles.

In [0]:
# De-duplicate events by SOURCEURL, keeping for each URL the record with the
# greatest NumArticles. The ordering must be descending: an ascending order
# would keep the record with the FEWEST articles instead.
# Ties on NumArticles within a SOURCEURL are broken arbitrarily by row_number().
GDEL_EVENTS_TPR_U = (
    GDELT_EVENTS_TPR
    .withColumn(
        "RankNews",
        row_number().over(
            Window.partitionBy(col("SOURCEURL")).orderBy(col("NumArticles").desc())
        ),
    )
    .filter(col("RankNews") == 1)  # rank 1 = most-covered record of each SOURCEURL
)

# Country Analysis 1: Political situation

The first objective of our country analysis is to measure the level of "goodness" of the political situation in each country. For this we need an objective variable, and we base it on the Goldstein Scale, which measures the positive (1 to 10) or negative (-10 to -1) impact of a global event. However, we cannot simply average the news within a day, because not every news item is equally important. We therefore compute a weighted average, using the variable "NumArticles" (the number of articles related to the global event) as the weight, so that global events with more articles count more than events with fewer articles.

The crucial part of this analysis is to define which news items represent the political situation. We consider an event political when at least one of the following holds:
* Actor1Type1Code = 'GOV' (Government). Actor 1 is related to the government
* Actor2Type1Code = 'GOV' (Government). Actor 2 is related to the government

Using China as an example, we compute the average of the Goldstein Scale weighted by the number of articles. The results show that one of the highest values of this weighted average for positive news is on:
* date 2024-02-04 with a value of 5.12

on date 2024-02-02 the news item with the most weight (based on the number of articles) is: 
https://kpic.com/news/offbeat/pigeon-detained-for-8-months-on-espionage-suspicions-finally-released-china-india-mumbai-peta-animal-hospital-bombay-police-warfare-military-unit-open-water-racing-pakistan-kashmir-modi

This article reports that a pigeon which India suspected of being part of a Chinese espionage tactic was released after it was found to have nothing to do with espionage by China.


The date with the highest weighted average for negative news:
* date 2023-03-23 with a value of 10

On date 2023-03-23 this news item was very negative: 
https://abc3340.com/news/connect-to-congress/lawmakers-push-pentagon-to-move-faster-to-bolster-cybersecurity-improve-tech-capabilities-artificial-intelligence-autonomous-weapons-china-cybersecurity-malware-replicator

This article reports that the USA considers China's progress in AI for military use to be a "threat".



In [0]:
# Political GDELT events: de-duplicated events where actor 1 or actor 2 has
# the type code 'GOV'.
GDELT_EVENTS_TPR_NTI_POLITICAL = GDEL_EVENTS_TPR_U.where(
    (col("Actor1Type1Code") == 'GOV') | (col("Actor2Type1Code") == 'GOV')
)

# Split by the sign of the Goldstein scale (zero counts as positive).
GDELT_EVENTS_TPR_NTI_POLITICAL_POSITIVE = GDELT_EVENTS_TPR_NTI_POLITICAL.where("GoldsteinScale >= 0")
GDELT_EVENTS_TPR_NTI_POLITICAL_NEGATIVE = GDELT_EVENTS_TPR_NTI_POLITICAL.where("GoldsteinScale < 0")

In [0]:
# Daily Goldstein scale for positive political events in China (CH),
# weighted by NumArticles: sum(GoldsteinScale * NumArticles) / sum(NumArticles).
# `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
display(
    GDELT_EVENTS_TPR_NTI_POLITICAL_POSITIVE
    .where("ActionGeo_CountryCode == 'CH'")
    .groupBy("Date")
    .agg(
        sum(col("GoldsteinScale") * col("NumArticles")).alias("PondGS"),
        sum("NumArticles").alias("NumArticles"),
    )
    .withColumn("GSPonderado", col("PondGS") / col("NumArticles"))
)

In [0]:
# Drill-down: positive political events located in China on 2024-02-04.
display(
    GDELT_EVENTS_TPR_NTI_POLITICAL_POSITIVE
    .where("Date == '2024-02-04'")
    .where("ActionGeo_CountryCode == 'CH'")
)

In [0]:
# Daily Goldstein scale for negative political events in China (CH), weighted
# by NumArticles. The scale is multiplied by -1 first, so the resulting
# weighted average is reported as a positive magnitude.
# `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
display(
    GDELT_EVENTS_TPR_NTI_POLITICAL_NEGATIVE
    .where("ActionGeo_CountryCode == 'CH'")
    .groupBy("Date")
    .agg(
        sum((col("GoldsteinScale") * -1) * col("NumArticles")).alias("PondGS"),
        sum("NumArticles").alias("NumArticles"),
    )
    .withColumn("GSPonderado", col("PondGS") / col("NumArticles"))
)

In [0]:
# Drill-down: negative political events located in China on 2023-09-30.
display(
    GDELT_EVENTS_TPR_NTI_POLITICAL_NEGATIVE
    .where("Date == '2023-09-30'")
    .where("ActionGeo_CountryCode == 'CH'")
)

In [0]:
# Show the cleaned ports table (COUNTRY, PORT and the corrected coordinates).
display(PORT_LOCATIONS_DIM_CLEANED)

In [0]:
# Geo-located events: only the location fields and the source URL are kept.
# NOTE(review): this cell originally read from `GDELT_EVENTS_DATE`, which is not
# defined in any visible cell of this notebook, so it raised NameError on a
# fresh kernel (Restart & Run All). `GDELT_EVENTS_TPR` (the events frame that
# carries the `Date` column) is assumed to be the intended input - confirm.
GDELT_EVENTS_INTEREST = (
    GDELT_EVENTS_TPR
    .filter("ActionGeo_Lat is not null")  # the distance computation needs a latitude
    .filter("ActionGeo_Long is not null")  # ... and a longitude
    .select("ActionGeo_Fullname", "ActionGeo_CountryCode", "ActionGeo_Lat", "ActionGeo_Long", "SOURCEURL")
)

In [0]:
# Show the geo-located events (location name, country code, lat/long, source URL).
display(GDELT_EVENTS_INTEREST)

In [0]:
# Pair every event located in China (CH) with every port whose COUNTRY is
# 'CHINA', then compute the event-to-port distance for each pair.
GDELT_CROSS_PORT = (
    GDELT_EVENTS_INTEREST
    .where("ActionGeo_CountryCode == 'CH'")
    .crossJoin(
        PORT_LOCATIONS_DIM_CLEANED.where("COUNTRY == 'CHINA'")
    )
)

# Point 1 is the event location, point 2 is the port location.
GDELT_CROSS_PORT_DISTANCE = cal_lat_log_dist(
    df=GDELT_CROSS_PORT,
    lat1='ActionGeo_Lat',
    long1='ActionGeo_Long',
    lat2='LATITUDE_CORRECTED',
    long2='LONGITUDE_CORRECTED',
)

display(GDELT_CROSS_PORT_DISTANCE)

## DA