In [1]:
import os
import sys

sys.path.append("..")

import seaborn
import matplotlib.pyplot as plt
import geopandas as gpd
import pyspark.sql.functions as F
import plotly.express as px
from pyspark.sql import SparkSession

from task_3_preprocessing import get_processed_csv_path

In [2]:
DATA_DIR = "data"

In [3]:
spark = (
    SparkSession.builder.appName("Airline Twitter Sentiment Analysis")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [4]:
processed_csv_path = get_processed_csv_path(os.path.join("..", "..", DATA_DIR, "task_3"))

In [5]:
df = spark.read.csv(processed_csv_path, header=True, inferSchema=True)

In [6]:
from task_3.task_3_preprocessing import load_cities_countries_json


cities_df, countries, countries_df = load_cities_countries_json(os.path.join("..", "..", DATA_DIR), spark)

In [7]:
df.columns

['_country',
 '_unit_id',
 '_created_at',
 '_golden',
 '_id',
 '_missed',
 '_started_at',
 '_channel',
 '_trust',
 '_worker_id',
 '_ip',
 'airline_sentiment',
 'airline',
 'name',
 'negativereason_gold',
 'retweet_count',
 'text',
 'tweet_created',
 'tweet_id',
 'user_timezone',
 'negativereason1',
 'negativereason2',
 'iso3']

In [8]:
countries_df.columns

['name',
 'iso3',
 'iso2',
 'numeric_code',
 'phone_code',
 'capital',
 'currency',
 'currency_name',
 'currency_symbol',
 'tld',
 'native',
 'region',
 'region_id',
 'subregion',
 'subregion_id',
 'nationality',
 'latitude',
 'longitude',
 'emoji',
 'emojiU']

In [20]:
countries_df.filter(F.col("name") == "United States").show()

+-------------+----+----+------------+----------+----------+--------+--------------------+---------------+---+-------------+--------+---------+----------------+------------+-----------+--------+---------+-----+---------------+
|         name|iso3|iso2|numeric_code|phone_code|   capital|currency|       currency_name|currency_symbol|tld|       native|  region|region_id|       subregion|subregion_id|nationality|latitude|longitude|emoji|         emojiU|
+-------------+----+----+------------+----------+----------+--------+--------------------+---------------+---+-------------+--------+---------+----------------+------------+-----------+--------+---------+-----+---------------+
|United States| USA|  US|         840|         1|Washington|     USD|United States dollar|              $|.us|United States|Americas|        2|Northern America|           6|   American|40.32528| 19.47139| 🇺🇸|U+1F1FA U+1F1F8|
+-------------+----+----+------------+----------+----------+--------+--------------------+----

In [7]:
df = df.drop("name").join(countries_df.select("iso3", "latitude", "longitude"), on="iso3", how="left")

In [8]:
filtered_df = df.filter(F.col("airline_sentiment") == "negative").groupBy("iso3", "latitude", "longitude").agg(F.count("*").alias("count"))

In [15]:
filtered_df.show()

+----+--------+---------+-----+------------------+-------------------+
|iso3|latitude|longitude|count|          latitude|          longitude|
+----+--------+---------+-----+------------------+-------------------+
| POL|41.63317| 20.01228|   30| 41.63316999999999| 20.012279999999993|
| JAM|34.98735| 63.12891| 2200| 34.98735000000051|   63.1289099999995|
| BRA|25.61955| 56.27291| 1162|25.619549999999627|  56.27290999999971|
| CUB|33.96744|  68.9492|  150| 33.96744000000001|  68.94919999999982|
| FSM|17.06565|-61.87466| 1890|17.065650000000286| -61.87466000000111|
| ITA|35.42064| 70.92261|  320| 35.42064000000015|   70.9226100000002|
| GBR|41.68389| 19.71556| 7400| 41.68388999999188|  19.71555999999917|
| PRY|40.10754| 20.25753|  160| 40.10753999999998|  20.25752999999997|
| ARE|40.35167| 19.98028|   30|40.351670000000006| 19.980279999999997|
| BMU|24.11028| 52.73056|   60|24.110280000000024| 52.730559999999976|
| AUS|25.40328| 55.52341|13779|25.403280000006017| 55.523410000014046|
| MEX|

In [13]:
filtered_df.orderBy(F.col("count").desc()).show()

+----+------------+-----------+-----+
|iso3|    latitude|  longitude|count|
+----+------------+-----------+-----+
| USA|        38.0|      -97.0|55888|
| AUS|       -27.0|      133.0|13779|
| PHL|        13.0|      122.0| 7540|
| GBR|        54.0|       -2.0| 7400|
| MEX|        23.0|     -102.0| 6540|
| CAN|        60.0|      -95.0| 6290|
| COL|         4.0|      -72.0| 5017|
| ARG|       -34.0|      -64.0| 4577|
| JAM|       18.25|      -77.5| 2200|
| FSM|  6.91666666|     158.25| 1890|
| BRA|       -10.0|      -55.0| 1162|
| HND|        15.0|      -86.5| 1060|
| CHL|       -30.0|      -71.0|  380|
| PRI|       18.25|      -66.5|  330|
| BEL| 50.83333333|        4.0|  330|
| ITA| 42.83333333|12.83333333|  320|
| NLD|        52.5|       5.75|  310|
| MUS|-20.28333333|      57.55|  310|
| TWN|        23.5|      121.0|  180|
| PRY|       -23.0|      -58.0|  160|
+----+------------+-----------+-----+
only showing top 20 rows



In [9]:
fig = px.density_mapbox(
    filtered_df.toPandas(),
    lat="latitude",
    lon="longitude",
    z="count",
    radius=30,
    mapbox_style="open-street-map",
    zoom=1.5,
    center=dict(lat=0, lon=0),
)

In [10]:
fig.show()