In [None]:
import pandas as pd
import pycountry
import os
import glob
from geopy.geocoders import Nominatim
import time
import re
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
# Folder that the loading cell globs for "*.pkl.gz" files.
pkl_folder = "RUSSIA/PKL/"

# Translation from address fragments to plain region names. The keys keep a
# leading space because uk_splitter looks them up with pieces obtained by
# splitting a geocoded address on ','.
uk_region_mapping = dict([
    (" England", "England"),
    (" Alba / Scotland", "Scotland"),
    (" Cymru / Wales", "Wales"),
    (" Northern Ireland / Tuaisceart Éireann", "Northern Ireland"),
])

# Memo of geocoder results keyed by the raw place name; filled by uk_splitter.
checked_locations = dict()

# Shared geopy client used by uk_splitter for names it cannot resolve itself.
geolocator = Nominatim(user_agent="geo_lookup")

In [None]:
# First components accepted verbatim in "<region>, United Kingdom".
_UK_REGION_PLACES = (
    'England', 'Scotland', 'Wales', 'Northern Ireland',
    'Bailiwick of Jersey', 'Bailiwick of Guernsey',
)
# Second components that already name the region. The leading space is what
# str.split(',') leaves in front of the text after the comma.
_UK_REGION_SUFFIXES = tuple(' ' + region for region in _UK_REGION_PLACES)
# Second components that are treated as parts of England.
_ENGLAND_SUFFIXES = (' London', ' East', ' South East')


def _geocode_uk_region(place_full_name, max_attempts=5):
    """Resolve a comma-less place name to a UK region with the shared geocoder.

    Results (including "nothing usable found") are memoised in
    ``checked_locations``. If every attempt raises, 'NOT SPECIFIED' is
    returned without caching so a later call can try again.
    """
    if place_full_name in checked_locations:
        return checked_locations[place_full_name]

    location = None
    for attempt in range(1, max_attempts + 1):
        try:
            location = geolocator.geocode(place_full_name + ', United Kingdom', timeout=2)
            break
        except Exception:
            # `Exception` rather than a bare except so Ctrl-C still interrupts,
            # and a hard cap so a dead service cannot hang the notebook.
            if attempt == max_attempts:
                return 'NOT SPECIFIED'
            time.sleep(2)

    region = 'NOT SPECIFIED'
    address = getattr(location, 'address', None)
    if isinstance(address, str):
        # Drop components containing digits (e.g. postcodes), then read the
        # component just before the last one.
        components = [part for part in address.split(',') if not re.search(r'\d', part)]
        if len(components) >= 2:
            region = uk_region_mapping.get(components[-2], components[-2])

    checked_locations[place_full_name] = region
    return region


def uk_splitter(place_full_name):
    """Map a 'GB' place name to a UK region label.

    Parameters
    ----------
    place_full_name : str
        Place name such as "Manchester, England" or "England, United Kingdom".

    Returns
    -------
    str
        'England', 'Scotland', 'Wales', 'Northern Ireland',
        'Bailiwick of Jersey', 'Bailiwick of Guernsey', 'IM' (Isle of Man),
        whatever the geocoder fallback yields for names without a comma, or
        'NOT SPECIFIED' when the name cannot be classified.
    """
    # Missing values (None/NaN) and blank strings cannot be classified.
    if not isinstance(place_full_name, str) or not place_full_name.strip():
        return 'NOT SPECIFIED'

    parts = place_full_name.split(',')
    if len(parts) < 2:
        # No "<place>, <area>" structure: ask the geocoder.
        return _geocode_uk_region(place_full_name)

    place, country = parts[0], parts[1]

    if country == ' United Kingdom':
        if place == 'Isle of Man':
            return 'IM'
        return place if place in _UK_REGION_PLACES else 'NOT SPECIFIED'

    if country == ' Isle of Man':
        return 'IM'
    if country in _UK_REGION_SUFFIXES:
        return country.lstrip()
    # Any one of these suffixes is enough (the old test combined two of them
    # with `and`, which can never be true for a single string).
    if country in _ENGLAND_SUFFIXES:
        return 'England'

    # Explicit sentinel instead of an implicit None for unrecognised areas.
    return 'NOT SPECIFIED'

def code_to_country(code):
    """Translate an ISO 3166-1 alpha-2 code into a country name.

    "XK" is special-cased to "Kosovo". Any value that pycountry cannot resolve
    (for example a region label such as "England") is returned unchanged.
    """
    if code == "XK":
        return "Kosovo"
    try:
        match = pycountry.countries.get(alpha_2=code)
    except Exception:
        # Best effort: a failed lookup falls back to the input label.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        return code
    return match.name if match is not None else code

# glob.glob returns paths in arbitrary order; sort them so files are processed
# (and logged) in a stable order.
pkl_files = sorted(glob.glob(os.path.join(pkl_folder, "*.pkl.gz")))

# date string taken from the file name -> Series of counts per country label
data_dict = {}

for pkl_file in pkl_files:
    filename = os.path.basename(pkl_file)
    # Text after the last '-' and before the first '.', e.g. "...-20180614.pkl.gz".
    date_part = filename.split("-")[-1].split(".")[0]

    # NOTE(review): unpickling executes arbitrary code; only load trusted files.
    df = pd.read_pickle(pkl_file, compression='gzip')

    # Keep only rows that carry a country code.
    countries_df = df[df['place_country'].notnull()].copy()

    # Replace the 'GB' code with the UK region derived from the place name.
    mask = countries_df['place_country'] == 'GB'
    countries_df.loc[mask, 'place_country'] = countries_df.loc[mask, 'place_full_name'].apply(uk_splitter)

    country_counts = countries_df['place_country'].value_counts()
    country_counts.index = country_counts.index.map(code_to_country)

    data_dict[date_part] = country_counts

    print(pkl_file + ' DONE')

# One row per country label, one column per date; missing combinations are 0.
final_df = pd.DataFrame(data_dict).fillna(0).astype(int)
# The plotting cells read the columns left to right as a time series, so keep
# them in ascending date order regardless of how the files were discovered.
final_df = final_df.sort_index(axis=1)





In [None]:
# Work on a copy: a plain alias would let later in-place column assignments on
# DF silently modify final_df as well.
DF = final_df.copy()

In [None]:
# Row totals over the per-date columns only. A "Total" column left over from a
# previous run of this cell is excluded, so re-running does not inflate it.
row_totals = DF.drop(columns="Total", errors="ignore").sum(axis=1)

# assign() returns a new frame instead of mutating the one DF currently names.
DF = (
    DF.assign(Total=row_totals)
      .sort_values(by="Total", ascending=False)
      .rename_axis("place_country")
)

# Drop the bucket of places that could not be attributed to a region.
DF = DF[DF.index != 'NOT SPECIFIED']




In [None]:
# The 32 row labels to keep, spelled exactly as they appear in DF's index.
countries_to_keep = [
    "Argentina",
    "Australia",
    "Belgium",
    "Brazil",
    "Colombia",
    "Costa Rica",
    "Croatia",
    "Denmark",
    "England",
    "Egypt",
    "France",
    "Germany",
    "Iceland",
    "Iran, Islamic Republic of",
    "Japan",
    "Korea, Republic of",
    "Mexico",
    "Morocco",
    "Nigeria",
    "Panama",
    "Peru",
    "Poland",
    "Portugal",
    "Russian Federation",
    "Saudi Arabia",
    "Senegal",
    "Serbia",
    "Spain",
    "Sweden",
    "Switzerland",
    "Tunisia",
    "Uruguay",
]

# Keep only those rows and turn the index back into an ordinary column.
DF32 = DF.loc[DF.index.isin(countries_to_keep)].reset_index()

In [None]:

# England's row without its first and last columns; the remaining column
# labels are parsed as YYYYMMDD dates.
england_data = DF32.loc[DF32["place_country"] == "England"].iloc[:, 1:-1]
dates = pd.to_datetime(england_data.columns, format='%Y%m%d')
tweet_counts = england_data.values.flatten()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dates, tweet_counts, marker='o', linestyle='-', label="England Tweet Count", color='red')

ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")
ax.set_title("Tweet Progression for England during World Cup 2018")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# Brazil's row without its first and last columns; the remaining column
# labels are parsed as YYYYMMDD dates.
brazil_data = DF32.loc[DF32["place_country"] == "Brazil"].iloc[:, 1:-1]
dates = pd.to_datetime(brazil_data.columns, format='%Y%m%d')
tweet_counts = brazil_data.values.flatten()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dates, tweet_counts, marker='o', linestyle='-', label="Brazil Tweet Count", color='green')

ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")
ax.set_title("Tweet Progression for Brazil during World Cup 2018")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()

plt.show()

In [None]:
# Match label -> match date, drawn as vertical markers on the chart.
argentina_matches = {
    'Argentina vs Iceland': '2018-06-16',
    'Argentina vs Croatia': '2018-06-21',
    'Argentina vs Nigeria': '2018-06-26',
    'Argentina vs France': '2018-06-30',
}

# Argentina's row without its first and last columns; the remaining column
# labels are parsed as YYYYMMDD dates.
argentina_data = DF32.loc[DF32["place_country"] == "Argentina"].iloc[:, 1:-1]
dates = pd.to_datetime(argentina_data.columns, format='%Y%m%d')
tweet_counts = argentina_data.values.flatten()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dates, tweet_counts, marker='o', linestyle='-', label="Argentina Tweet Count", color='blue')

ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")
ax.set_title("Tweet Progression for Argentina during World Cup 2018")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()

# One dashed line per match, labelled just below the current top of the y-axis.
for label, date_str in argentina_matches.items():
    date = pd.to_datetime(date_str)
    ax.axvline(date, color='gray', linestyle='--', alpha=0.7)
    ax.text(date, ax.get_ylim()[1] * 0.95, label, rotation=90, verticalalignment='top', fontsize=8)

plt.show()

In [None]:
# Match label -> match date, drawn as vertical markers on the chart.
france_matches = {
    'France vs Australia': '2018-06-16',
    'France vs Peru': '2018-06-21',
    'France vs Denmark': '2018-06-26',
    'France vs Argentina': '2018-06-30',
    'France vs Uruguay': '2018-07-06',
    'France vs Belgium': '2018-07-10',
    'France vs Croatia': '2018-07-15',
}

# France's row without its first and last columns; the remaining column
# labels are parsed as YYYYMMDD dates.
france_data = DF32.loc[DF32["place_country"] == "France"].iloc[:, 1:-1]
dates = pd.to_datetime(france_data.columns, format='%Y%m%d')
tweet_counts = france_data.values.flatten()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dates, tweet_counts, marker='o', linestyle='-', label="France Tweet Count", color='navy')

ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")
ax.set_title("Tweet Progression for France during World Cup 2018")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()

# One dashed line per match, labelled just below the current top of the y-axis.
for label, date_str in france_matches.items():
    date = pd.to_datetime(date_str)
    ax.axvline(date, color='gray', linestyle='--', alpha=0.7)
    ax.text(date, ax.get_ylim()[1] * 0.95, label, rotation=90, verticalalignment='top', fontsize=8)

plt.show()

In [None]:
# Croatia's row without its first and last columns; the remaining column
# labels are parsed as YYYYMMDD dates.
croatia_data = DF32.loc[DF32["place_country"] == "Croatia"].iloc[:, 1:-1]
dates = pd.to_datetime(croatia_data.columns, format='%Y%m%d')
tweet_counts = croatia_data.values.flatten()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(dates, tweet_counts, marker='o', linestyle='-', label="Croatia Tweet Count", color='navy')

ax.set_xlabel("Date")
ax.set_ylabel("Number of Tweets")
ax.set_title("Tweet Progression for Croatia during World Cup 2018")
plt.xticks(rotation=45)
ax.grid(True)
ax.legend()

plt.show()

In [None]:

# Overlay of the five countries, each expressed as the share of that country's
# own summed count that falls on each date. All series are drawn against the
# dates parsed from england_data's columns.
dates = pd.to_datetime(england_data.columns, format='%Y%m%d')
fig, ax = plt.subplots(figsize=(12, 6))

tweet_counts_england = england_data.values.flatten()
total_england = tweet_counts_england.sum()
percentage_england = (tweet_counts_england / total_england) * 100
ax.plot(dates, percentage_england, marker='o', linestyle='-',
        label="England % of Total Tweets", color='red')

tweet_counts_brazil = brazil_data.values.flatten()
total_brazil = tweet_counts_brazil.sum()
percentage_brazil = (tweet_counts_brazil / total_brazil) * 100
ax.plot(dates, percentage_brazil, marker='o', linestyle='-',
        label="Brazil % of Total Tweets", color='green')

tweet_counts_argentina = argentina_data.values.flatten()
total_argentina = tweet_counts_argentina.sum()
percentage_argentina = (tweet_counts_argentina / total_argentina) * 100
ax.plot(dates, percentage_argentina, marker='o', linestyle='-',
        label="Argentina % of Total Tweets", color='blue')

tweet_counts_france = france_data.values.flatten()
total_france = tweet_counts_france.sum()
percentage_france = (tweet_counts_france / total_france) * 100
ax.plot(dates, percentage_france, marker='o', linestyle='-',
        label="France % of Total Tweets", color='navy')

tweet_counts_croatia = croatia_data.values.flatten()
total_croatia = tweet_counts_croatia.sum()
percentage_croatia = (tweet_counts_croatia / total_croatia) * 100
ax.plot(dates, percentage_croatia, marker='o', linestyle='-',
        label="Croatia % of Total Tweets", color='black')

# One major tick per day, formatted as an ISO date.
ax.xaxis.set_major_locator(mdates.DayLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d'))

plt.xticks(rotation=45)
ax.set_xlabel("Date")
ax.set_ylabel("Percentage of Total Tweets (%)")
ax.set_title("Daily Tweet Percentage during World Cup 2018")

ax.grid(True, which='both', axis='x', linestyle='--', alpha=0.7)
ax.legend()
plt.tight_layout()
plt.show()
