In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import unicodedata
import json

# ============================
# 1. LOAD RAW DATA
# ============================
# Adjust paths to your docker/airflow layout
staging = Path("../data/staging")


cou = pd.read_csv(staging /"Olympics_Country.csv")
nd = pd.read_csv(staging /"02_normalize_date.csv")
print(len(cou))
print(len(nd['Country'].drop_duplicates()))

235
231


In [3]:
in_countries = nd[nd['Country'].str.lower().isin(cou['country'].str.lower())]
len(in_countries['Country'].drop_duplicates())


186

In [4]:
not_in_countries = nd[
    ~nd['Country'].str.lower().isin(cou['country'].str.lower())
]
len(not_in_countries['Country'].drop_duplicates())

45

In [6]:
print(not_in_countries['Country'].drop_duplicates())

2                                                   Gambia
9                                      Yemen Arab Republic
10                                           Guinea-Bissau
15                                           Côte d’Ivoire
21                        Bolivia (Plurinational State of)
28                                                 Réunion
32                                                 Tokelau
35                                                   China
61                                United States of America
63                              Iran (Islamic Republic of)
104                     Venezuela (Bolivarian Republic of)
126      United Kingdom of Great Britain and Northern I...
132                             Taiwan (Province of China)
134                                               Viet Nam
185         China, Hong Kong Special Administrative Region
256                                         Azores Islands
352                                       French Polynes

In [7]:
import re
import unicodedata
import pandas as pd

def normalize_country(name):
    if pd.isna(name):
        return None

    # Lowercase
    name = name.lower()

    # Remove accents
    name = unicodedata.normalize('NFKD', name)
    name = ''.join(c for c in name if not unicodedata.combining(c))

    # Replace separators with spaces
    name = re.sub(r'[-_/]', ' ', name)

    # Remove other punctuation
    name = re.sub(r'[^\w\s]', '', name)

    # Normalize whitespace
    name = re.sub(r'\s+', ' ', name).strip()

    return name


In [8]:
nd['country_norm'] = nd['Country'].apply(normalize_country)
cou['country_norm'] = cou['country'].apply(normalize_country)

not_in = nd[~nd['country_norm'].isin(cou['country_norm'])]
print(not_in['Country'].drop_duplicates())

28                          Réunion
32                          Tokelau
256                  Azores Islands
352                French Polynesia
630                      Guadeloupe
835                      Montserrat
836                      Martinique
1195                           Niue
1230       Turks and Caicos Islands
2652      Wallis and Futuna Islands
4215                  French Guiana
5467                 Canary Islands
5512                  New Caledonia
5933                       Anguilla
7348                   Saint Helena
8566       Northern Mariana Islands
12976                       Mayotte
18084              Saint Barthélemy
18090    Saint Martin (French Part)
18092     Sint Maarten (Dutch part)
18331                       Curaçao
Name: Country, dtype: object


In [9]:
print(len(not_in))

4672


In [10]:
print(len(not_in_countries))

4782


In [29]:
STOPWORDS = {'republic', 'of', 'the', 'and', 'is', 'democratic','arab','united','states','saint','islands','kingdom','new','peoples'}  # add more if needed

# Function to split into words and remove punctuation
def words_set(name):
    if pd.isna(name):
        return set()
    name = name.lower()
    name = re.sub(r'[^\w\s]', '', name)  # remove punctuation
    return set(name.split())

# Prepare sets
cou_words = cou['country'].apply(words_set)
not_in_words = not_in_countries['Country'].apply(words_set)

matches = []

for i, ni_words in enumerate(not_in_words):
    ni_name = not_in_countries.iloc[i]['Country']
    for j, c_words in enumerate(cou_words):
        c_name = cou.iloc[j]['country']
        shared = ni_words & c_words
        # ignore matches that are only stopwords
        filtered_shared = shared - STOPWORDS
        if filtered_shared:  # only keep if something meaningful remains
            matches.append((ni_name, c_name, filtered_shared))

# Convert to DataFrame
matches_df = pd.DataFrame(matches, columns=['NotIn', 'InCou', 'SharedWords'])


In [30]:
# Convert SharedWords set to a sorted tuple (or string)
matches_df['SharedWordsStr'] = matches_df['SharedWords'].apply(lambda x: ', '.join(sorted(x)))

# Drop duplicates based on all columns or specific columns
matches_df = matches_df.drop_duplicates(subset=['NotIn', 'InCou', 'SharedWordsStr'])


In [35]:
# Keep rows where SharedWordsStr is unique (appears only once)
unique_matches = matches_df[
    ~matches_df['SharedWordsStr'].duplicated(keep=False)
]
unique_matches
correction_map = dict(zip(unique_matches['NotIn'], unique_matches['InCou']))
correction_map

{'Gambia': 'THE GAMBIA',
 'Côte d’Ivoire': "CÔTE D'IVOIRE",
 'Bolivia (Plurinational State of)': 'BOLIVIA',
 'Iran (Islamic Republic of)': 'ISLAMIC REPUBLIC OF IRAN',
 'Venezuela (Bolivarian Republic of)': 'VENEZUELA',
 'United Kingdom of Great Britain and Northern Ireland': 'IRELAND',
 'China, Hong Kong Special Administrative Region': 'HONG KONG, CHINA',
 'Micronesia (Federated States of)': 'FEDERATED STATES OF MICRONESIA',
 'Bahamas': 'THE BAHAMAS',
 'Saudi Arabia': 'KINGDOM OF SAUDI ARABIA',
 'Serbia Montenegro': 'SERBIA AND MONTENEGRO',
 'State of Palestine': 'PALESTINE'}

In [36]:
nd['Country'] = nd['Country'].apply(lambda x: correction_map.get(x, x))

In [37]:
nd['country_norm'] = nd['Country'].apply(normalize_country)
cou['country_norm'] = cou['country'].apply(normalize_country)

not_in = nd[~nd['country_norm'].isin(cou['country_norm'])]
print(not_in['Country'].drop_duplicates())

9                               Yemen Arab Republic
28                                          Réunion
32                                          Tokelau
35                                            China
61                         United States of America
132                      Taiwan (Province of China)
134                                        Viet Nam
256                                  Azores Islands
352                                French Polynesia
427                        Germany Federal Republic
630                                      Guadeloupe
804           People's Democratic Republic of Yemen
835                                      Montserrat
836                                      Martinique
1089                     German Democratic Republic
1195                                           Niue
1230                       Turks and Caicos Islands
1406                   Netherlands (Kingdom of the)
2652                      Wallis and Futuna Islands
2923     Chi

In [56]:
STOPWORDS = {'republic', 'of', 'the', 'and', 'is', 'democratic','arab','united','states','saint','islands','kingdom','new','peoples'}  # add more if needed

# Function to split into words and remove punctuation
def words_set(name):
    if pd.isna(name):
        return set()
    name = name.lower()
    name = re.sub(r'[^\w\s]', '', name)  # remove punctuation
    return set(name.split())

# Prepare sets
cou_words = cou['country'].apply(words_set)
not_in_words = not_in['Country'].apply(words_set)

matches = []

for i, ni_words in enumerate(not_in_words):
    ni_name = not_in.iloc[i]['Country']
    iso = not_in.iloc[i]['ISO']
    for j, c_words in enumerate(cou_words):
        c_name = cou.iloc[j]['country']
        noc = cou.iloc[j]['noc']
        shared = ni_words & c_words
        # ignore matches that are only stopwords
        filtered_shared = shared - STOPWORDS
        if filtered_shared:  # only keep if something meaningful remains
            matches.append((ni_name, c_name, filtered_shared,iso,noc))

# Convert to DataFrame
matches_df = pd.DataFrame(matches, columns=['NotIn', 'InCou', 'SharedWords','ISO','noc'])


In [57]:
# Convert SharedWords set to a sorted tuple (or string)
matches_df['SharedWordsStr'] = matches_df['SharedWords'].apply(lambda x: ', '.join(sorted(x)))

# Now drop duplicates based on all columns or specific columns
matches_df = matches_df.drop_duplicates(subset=['NotIn', 'InCou', 'SharedWordsStr','noc','ISO'])


In [58]:
matches_df

Unnamed: 0,NotIn,InCou,SharedWords,ISO,noc,SharedWordsStr
0,Yemen Arab Republic,NORTH YEMEN,{yemen},YMN,YAR,yemen
1,Yemen Arab Republic,SOUTH YEMEN,{yemen},YMN,YMD,yemen
2,Yemen Arab Republic,YEMEN,{yemen},YMN,YEM,yemen
3,China,"HONG KONG, CHINA",{china},CHN,HKG,china
4,China,PEOPLE'S REPUBLIC OF CHINA,{china},CHN,CHN,china
33,Taiwan (Province of China),"HONG KONG, CHINA",{china},TWN,HKG,china
34,Taiwan (Province of China),PEOPLE'S REPUBLIC OF CHINA,{china},TWN,CHN,china
73,Germany Federal Republic,EAST GERMANY,{germany},DFR,GDR,germany
74,Germany Federal Republic,GERMANY,{germany},DFR,GER,germany
75,Germany Federal Republic,WEST GERMANY,{germany},DFR,FRG,germany


In [50]:
nd[nd["Country"]=="Yemen Arab Republic"]["ISO"]

9       YMN
1061    YMN
Name: ISO, dtype: object

In [51]:
nd[nd["Country"]=="People's Democratic Republic of Yemen"]["ISO"]

804    YMD
Name: ISO, dtype: object

In [27]:
CANONICAL_COUNTRY_MAP_COU = {
    # Yemen
    "NORTH YEMEN": "YEMEN",
    "SOUTH YEMEN": "YEMEN",

    # Germany
    "EAST GERMANY": "GERMANY",
    "WEST GERMANY": "GERMANY",
    "germany":"GERMANY",

    # Vietnam
    "SOUTH VIETNAM": "VIETNAM",
    
}
CANONICAL_NOC_MAP_COU = {
    # Yemen
    "YAR": "YEM",
    "YMD": "YEM",

    # Germany
    "GDR": "GER",
    "FRG": "GER",

    # Vietnam
    "VNM": "VIE",
}

CANONICAL_COUNTRY_MAP_ND = {
    # Germany
    "Germany Federal Republic": "GERMANY",
    "German Democratic Republic": "GERMANY",
    
    # Yemen
    "Yemen Arab Republic": "YEMEN",
    "People's Democratic Republic of Yemen": "YEMEN",

    # China
    "Taiwan (Province of China)": "PEOPLE'S REPUBLIC OF CHINA",
    "China, Macao Special Administrative Region": "PEOPLE'S REPUBLIC OF CHINA",
    "China": "PEOPLE'S REPUBLIC OF CHINA",

    # Netherlands
    "Netherlands (Kingdom of the)": "NETHERLANDS",

    
    # USA
    "United States of America": "UNITED STATES",

    # Vietnam
    "Viet Nam": "VIETNAM",

    #Guineau bissau
    "Guinea-Bissau":"GUINEA BISSAU",

    
    
}
CANONICAL_ISO_MAP_ND = {
    # Yemen
    "YMN": "YEM",
    "YMD": "YEM",

    # China
    "TWN": "CHN",
    "MAC": "CHN",
}

In [19]:
test=staging = Path("../test/cleaned")
matches_df.to_csv(test / "test.csv", index=False)

In [46]:


bio = pd.read_csv(staging /"Olympic_Athlete_Bio.csv")
res = pd.read_csv(staging /"Olympic_Athlete_Event_Results.csv")


In [104]:
bio['country_noc'] = bio['country_noc'].replace(CANONICAL_NOC_MAP_COU)
res['country_noc'] = res['country_noc'].replace(CANONICAL_NOC_MAP_COU)


In [105]:
cou['noc'] = cou['noc'].replace(CANONICAL_NOC_MAP_COU)
cou['country'] = cou['country'].replace(CANONICAL_COUNTRY_MAP_COU)


In [106]:
nd['Country'] = nd['Country'].replace(CANONICAL_COUNTRY_MAP_ND)
nd['ISO'] = nd['ISO'].replace(CANONICAL_ISO_MAP_ND)


In [107]:
nd[nd['Country']=="Germany Federal Republic"]

Unnamed: 0,DisNo.,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Event Name,ISO,Country,Subregion,Region,Location,Origin,Magnitude,Magnitude Scale,Latitude,Longitude,Start Date,End Date,country_norm


In [108]:
nd['country_norm'] = nd['Country'].apply(normalize_country)
cou['country_norm'] = cou['country'].apply(normalize_country)

not_in = nd[~nd['country_norm'].isin(cou['country_norm'])]
print(not_in[['Country','ISO']].drop_duplicates())

                          Country  ISO
28                        Réunion  REU
32                        Tokelau  TKL
256                Azores Islands  AZO
352              French Polynesia  PYF
630                    Guadeloupe  GLP
835                    Montserrat  MSR
836                    Martinique  MTQ
1195                         Niue  NIU
1230     Turks and Caicos Islands  TCA
2652    Wallis and Futuna Islands  WLF
4215                French Guiana  GUF
5467               Canary Islands  SPI
5512                New Caledonia  NCL
5933                     Anguilla  AIA
7348                 Saint Helena  SHN
8566     Northern Mariana Islands  MNP
12976                     Mayotte  MYT
18084            Saint Barthélemy  BLM
18090  Saint Martin (French Part)  MAF
18092   Sint Maarten (Dutch part)  SXM
18331                     Curaçao  CUW


In [109]:
print(len(bio))
print(len(bio[bio['country_noc'].isin(cou['noc'])]))
print(len(res))
print(len(res[res['country_noc'].isin(cou['noc'])]))


155861
155861
316834
316834


In [99]:
STOPWORDS = {'republic', 'of', 'the', 'and', 'is', 'democratic','arab','united','states','saint','islands','kingdom','new','peoples'}  # add more if needed

# Function to split into words and remove punctuation
def words_set(name):
    if pd.isna(name):
        return set()
    name = name.lower()
    name = re.sub(r'[^\w\s]', '', name)  # remove punctuation
    return set(name.split())

# Prepare sets
cou_words = cou['country'].apply(words_set)
not_in_words = not_in['Country'].apply(words_set)

matches = []

for i, ni_words in enumerate(not_in_words):
    ni_name = not_in.iloc[i]['Country']
    iso = not_in.iloc[i]['ISO']
    for j, c_words in enumerate(cou_words):
        c_name = cou.iloc[j]['country']
        noc = cou.iloc[j]['noc']
        shared = ni_words & c_words
        # ignore matches that are only stopwords
        filtered_shared = shared - STOPWORDS
        if filtered_shared:  # only keep if something meaningful remains
            matches.append((ni_name, c_name, filtered_shared,iso,noc))

# Convert to DataFrame
matches_df = pd.DataFrame(matches, columns=['NotIn', 'InCou', 'SharedWords','ISO','noc'])


In [100]:
# Convert SharedWords set to a sorted tuple (or string)
matches_df['SharedWordsStr'] = matches_df['SharedWords'].apply(lambda x: ', '.join(sorted(x)))

# Now drop duplicates based on all columns or specific columns
matches_df = matches_df.drop_duplicates(subset=['NotIn', 'InCou', 'SharedWordsStr','noc','ISO'])


In [101]:
matches_df

Unnamed: 0,NotIn,InCou,SharedWords,ISO,noc,SharedWordsStr


In [62]:
print("NORTH YEMEN")
print(bio[bio["country_noc"]=="YAR"])
print("==========================================")
print(res[res["country_noc"]=="YAR"])

NORTH YEMEN
Empty DataFrame
Columns: [athlete_id, name, sex, country_noc]
Index: []
Empty DataFrame
Columns: [edition, edition_id, country_noc, sport, event, result_id, athlete_id, medal]
Index: []


In [63]:
print("SOUTH YEMEN")
print(bio[bio["country_noc"]=="YMD"])
print("==========================================")
print(res[res["country_noc"]=="YMD"])

SOUTH YEMEN
Empty DataFrame
Columns: [athlete_id, name, sex, country_noc]
Index: []
Empty DataFrame
Columns: [edition, edition_id, country_noc, sport, event, result_id, athlete_id, medal]
Index: []


In [64]:
print("YEMEN")
print(bio[bio["country_noc"]=="YEM"])
print("==========================================")
print(res[res["country_noc"]=="YEM"])

YEMEN
        athlete_id                    name     sex country_noc
1619         79345      Farouk Ahmed Sayed    Male         YEM
2843         33393      Mansour Al-Soraihi    Male         YEM
7003        120747  Fatima Suleiman Dahman  Female         YEM
11125        33390        Salah Al-Humaidi    Male         YEM
11536       112818            Ali Khousrof    Male         YEM
12952       147605          Ahmed Al-Yaari    Male         YEM
15137       120748          Nabil Al-Garbi    Male         YEM
15426       115890    Abdulsalam Al-Gadabi    Male         YEM
19442       115321      Mohammed Al-Yafaee    Male         YEM
21979        79339        Mohamed Al-Saadi    Male         YEM
24594        34099          Mohamed Moslih    Male         YEM
28746        60836       Abdullah Al-Ghrbi    Male         YEM
29471        60837       Abdullah Al-Izani    Male         YEM
30757       120746        Tameem Al-Kubati    Male         YEM
31120        33939         Mohamed Kohsrof    Mal

In [None]:
print("NORTH YEMEN")

In [111]:
staging = Path("../data/staging")
cou = pd.read_csv(staging /"Olympics_Country.csv")
nd = pd.read_csv(staging /"02_normalize_date.csv")

In [112]:
cou['noc'] = cou['noc'].replace(CANONICAL_NOC_MAP_COU)
cou['country'] = cou['country'].replace(CANONICAL_COUNTRY_MAP_COU)
nd['Country'] = nd['Country'].replace(CANONICAL_COUNTRY_MAP_ND)
nd['ISO'] = nd['ISO'].replace(CANONICAL_ISO_MAP_ND)


In [123]:
nd['country_norm'] = nd['Country'].apply(normalize_country)
cou['country_norm'] = cou['country'].apply(normalize_country)

not_in = nd[~nd['country_norm'].isin(cou['country_norm'])]
print(not_in[['Country','ISO']].drop_duplicates())

                          Country  ISO
28                        Réunion  REU
32                        Tokelau  TKL
256                Azores Islands  AZO
352              French Polynesia  PYF
630                    Guadeloupe  GLP
835                    Montserrat  MSR
836                    Martinique  MTQ
1195                         Niue  NIU
1230     Turks and Caicos Islands  TCA
2652    Wallis and Futuna Islands  WLF
4215                French Guiana  GUF
5467               Canary Islands  SPI
5512                New Caledonia  NCL
5933                     Anguilla  AIA
7348                 Saint Helena  SHN
8566     Northern Mariana Islands  MNP
12976                     Mayotte  MYT
18084            Saint Barthélemy  BLM
18090  Saint Martin (French Part)  MAF
18092   Sint Maarten (Dutch part)  SXM
18331                     Curaçao  CUW


In [120]:
STOPWORDS = {'republic', 'of', 'the', 'and', 'is', 'democratic','arab','united','states','saint','islands','kingdom','new','peoples'}  # add more if needed

# Function to split into words and remove punctuation
def words_set(name):
    if pd.isna(name):
        return set()
    name = name.lower()
    name = re.sub(r'[^\w\s]', '', name)  # remove punctuation
    return set(name.split())

# Prepare sets
cou_words = cou['country'].apply(words_set)
not_in_words = not_in['Country'].apply(words_set)

matches = []

for i, ni_words in enumerate(not_in_words):
    ni_name = not_in.iloc[i]['Country']
    iso = not_in.iloc[i]['ISO']
    for j, c_words in enumerate(cou_words):
        c_name = cou.iloc[j]['country']
        noc = cou.iloc[j]['noc']
        shared = ni_words & c_words
        # ignore matches that are only stopwords
        filtered_shared = shared - STOPWORDS
        if filtered_shared:  # only keep if something meaningful remains
            matches.append((ni_name, c_name, filtered_shared,iso,noc))

# Convert to DataFrame
matches_df = pd.DataFrame(matches, columns=['NotIn', 'InCou', 'SharedWords','ISO','noc'])


In [121]:
# Convert SharedWords set to a sorted tuple (or string)
matches_df['SharedWordsStr'] = matches_df['SharedWords'].apply(lambda x: ', '.join(sorted(x)))

# Now drop duplicates based on all columns or specific columns
matches_df = matches_df.drop_duplicates(subset=['NotIn', 'InCou', 'SharedWordsStr','noc','ISO'])


In [122]:
matches_df

Unnamed: 0,NotIn,InCou,SharedWords,ISO,noc,SharedWordsStr


In [117]:
# Keep rows where SharedWordsStr is unique (appears only once)
unique_matches = matches_df[
    ~matches_df['SharedWordsStr'].duplicated(keep=False)
]
unique_matches
correction_map = dict(zip(unique_matches['NotIn'], unique_matches['InCou']))
correction_map

{'Gambia': 'THE GAMBIA',
 'Bolivia (Plurinational State of)': 'BOLIVIA',
 'Iran (Islamic Republic of)': 'ISLAMIC REPUBLIC OF IRAN',
 'Venezuela (Bolivarian Republic of)': 'VENEZUELA',
 'United Kingdom of Great Britain and Northern Ireland': 'IRELAND',
 'China, Hong Kong Special Administrative Region': "PEOPLE'S REPUBLIC OF CHINA",
 'Micronesia (Federated States of)': 'FEDERATED STATES OF MICRONESIA',
 'Bahamas': 'THE BAHAMAS',
 'Saudi Arabia': 'KINGDOM OF SAUDI ARABIA',
 'Serbia Montenegro': 'SERBIA AND MONTENEGRO',
 'State of Palestine': 'PALESTINE'}

In [118]:
nd['Country'] = nd['Country'].apply(lambda x: correction_map.get(x, x))

In [124]:
nd.isna().sum()

DisNo.                   0
Disaster Group           0
Disaster Subgroup        0
Disaster Type            0
Disaster Subtype         0
Event Name           15340
ISO                      0
Country                  0
Subregion                0
Region                   0
Location              1537
Origin               17970
Magnitude            18251
Magnitude Scale       8906
Latitude             20516
Longitude            20516
Start Date               0
End Date                 0
country_norm             0
dtype: int64

In [130]:
len(cou)

230

In [129]:
cou=cou.drop_duplicates()

In [2]:
import pandas as pd
from pathlib import Path
staging = Path("../data/staging")


cou = pd.read_csv(staging /"Olympics_Country.csv")
nd = pd.read_csv(staging /"02_normalize_date.csv")
bio = pd.read_csv(staging /"Olympic_Athlete_Bio.csv")
res = pd.read_csv(staging /"Olympic_Athlete_Event_Results.csv")


In [9]:
not_in_countries = nd[
    ~nd['Country'].str.lower().isin(cou['country'].str.lower())
]
len(not_in_countries['Country'].drop_duplicates())

24

In [5]:
not_in_countries['Country'].drop_duplicates()

10                    Guinea-Bissau
15                    Côte d’Ivoire
28                          Réunion
32                          Tokelau
256                  Azores Islands
352                French Polynesia
630                      Guadeloupe
835                      Montserrat
836                      Martinique
844           Sao Tome and Principe
1195                           Niue
1230       Turks and Caicos Islands
2652      Wallis and Futuna Islands
4215                  French Guiana
5467                 Canary Islands
5512                  New Caledonia
5933                       Anguilla
7348                   Saint Helena
8566       Northern Mariana Islands
12976                       Mayotte
18084              Saint Barthélemy
18090    Saint Martin (French Part)
18092     Sint Maarten (Dutch part)
18331                       Curaçao
Name: Country, dtype: object

In [10]:
nd['country_norm'] = nd['Country'].apply(normalize_country)
not_in_countries['country_norm'] = not_in_countries['Country'].apply(normalize_country)
cou['country_norm'] = cou['country'].apply(normalize_country)

not_in = nd[~nd['country_norm'].isin(cou['country_norm'])]
is_now_in = not_in_countries[not_in_countries['country_norm'].isin(cou['country_norm'])]

print(is_now_in['Country'].drop_duplicates())

10             Guinea-Bissau
15             Côte d’Ivoire
844    Sao Tome and Principe
Name: Country, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  not_in_countries['country_norm'] = not_in_countries['Country'].apply(normalize_country)


In [19]:
STOPWORDS = {'republic', 'of', 'the', 'and', 'is', 'democratic','arab','united','states','saint','islands','kingdom','new','peoples'}  # add more if needed

# Function to split into words and remove punctuation
def words_set(name):
            
    name = normalize_country(name)
    return set(name.split())

# Prepare sets
cou_words = cou['country'].apply(words_set)
not_in_words = not_in_countries['Country'].apply(words_set)

matches = []

for i, ni_words in enumerate(not_in_words):
    ni_name = not_in_countries.iloc[i]['Country']
    iso = not_in_countries.iloc[i]['ISO']
    for j, c_words in enumerate(cou_words):
        c_name = cou.iloc[j]['country']
        noc = cou.iloc[j]['noc']
        shared = ni_words & c_words
        # ignore matches that are only stopwords
        filtered_shared = shared - STOPWORDS
        if filtered_shared:  # only keep if something meaningful remains
            matches.append((ni_name, c_name, filtered_shared,iso,noc))

# Convert to DataFrame
matches_df = pd.DataFrame(matches, columns=['NotIn', 'InCou', 'SharedWords','ISO','noc'])


In [22]:
# Convert SharedWords set to a sorted tuple (or string)
matches_df['SharedWordsStr'] = matches_df['SharedWords'].apply(lambda x: ', '.join(sorted(x)))

# Now drop duplicates based on all columns or specific columns
matches_df = matches_df.drop_duplicates(subset=['NotIn', 'InCou', 'SharedWordsStr','noc','ISO'])


TypeError: unhashable type: 'set'

In [23]:
 not_in_countries

Unnamed: 0,DisNo.,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Event Name,ISO,Country,Subregion,Region,Location,Origin,Magnitude,Magnitude Scale,Latitude,Longitude,Start Date,End Date,country_norm
10,1987-0015-GNB,Natural,Biological,Infestation,Grasshopper infestation,Grasshopper,GNB,Guinea-Bissau,Sub-Saharan Africa,Africa,,,,,,,1987-01-01,1987-01-01,guinea bissau
15,1987-0040-CIV,Technological,Transport,Air,Air,Boeing 707,CIV,Côte d’Ivoire,Sub-Saharan Africa,Africa,Near Abidjan,,,,,,1987-01-03,1987-01-03,cote divoire
28,1987-0060-REU,Natural,Meteorological,Storm,Tropical cyclone,,REU,Réunion,Sub-Saharan Africa,Africa,Saint Denis,,,Kph,,,1987-02-13,1987-02-13,reunion
32,1987-0064-TKL,Natural,Meteorological,Storm,Storm (General),,TKL,Tokelau,Polynesia,Oceania,"Fakaofo, Nukumono, Atafu Atolls",,,Kph,,,1987-02-27,1987-02-27,tokelau
73,1987-0118-GNB,Natural,Meteorological,Storm,Tropical cyclone,,GNB,Guinea-Bissau,Sub-Saharan Africa,Africa,Tombali,,,Kph,,,1987-07-01,1987-07-01,guinea bissau
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22474,2025-0387-CIV,Natural,Hydrological,Mass movement (wet),Landslide (wet),,CIV,Côte d’Ivoire,Sub-Saharan Africa,Africa,Abidjan district,Heavy rain,,,,,2025-05-20,2025-05-21,cote divoire
22506,2025-0438-CIV,Technological,Transport,Road,Road,,CIV,Côte d’Ivoire,Sub-Saharan Africa,Africa,Near Agboville,,,,,,2025-06-14,2025-06-14,cote divoire
22575,2025-0617-CIV,Technological,Transport,Road,Road,,CIV,Côte d’Ivoire,Sub-Saharan Africa,Africa,Between Katiola and Niakara,,,,,,2025-07-27,2025-07-28,cote divoire
22636,2025-0776-CIV,Technological,Transport,Water,Water,,CIV,Côte d’Ivoire,Sub-Saharan Africa,Africa,"On river Sassandra, near Buyo",,,,,,2025-09-05,2025-09-05,cote divoire


In [26]:
matches_df

Unnamed: 0,NotIn,InCou,SharedWords,ISO,noc,SharedWordsStr
0,Guinea-Bissau,EQUATORIAL GUINEA,{guinea},GNB,GEQ,guinea
1,Guinea-Bissau,GUINEA,{guinea},GNB,GUI,guinea
2,Guinea-Bissau,GUINEA BISSAU,"{bissau, guinea}",GNB,GBS,"bissau, guinea"
3,Guinea-Bissau,PAPUA NEW GUINEA,{guinea},GNB,PNG,guinea
4,Côte d’Ivoire,CÔTE D'IVOIRE,"{cote, divoire}",CIV,CIV,"cote, divoire"
17,Sao Tome and Principe,SÃO TOMÉ AND PRÍNCIPE,"{sao, tome, principe}",STP,STP,"principe, sao, tome"


In [25]:
# Keep rows where SharedWordsStr is unique (appears only once)
unique_matches = matches_df[
    ~matches_df['SharedWordsStr'].duplicated(keep=False)
]
unique_matches
correction_map = dict(zip(unique_matches['NotIn'], unique_matches['InCou']))
correction_map

{'Guinea-Bissau': 'GUINEA BISSAU',
 'Côte d’Ivoire': "CÔTE D'IVOIRE",
 'Sao Tome and Principe': 'SÃO TOMÉ AND PRÍNCIPE'}

In [29]:
print(len(bio))
print(len(bio[bio['country_noc'].isin(cou['noc'])]))
print(len(res))
print(len(res[res['country_noc'].isin(cou['noc'])]))


155861
155861
316834
316834


In [30]:
games = pd.read_csv(staging /"Olympics_Games.csv")
print(len(games))
print(len(games[games['country_noc'].isin(cou['noc'])]))


56
55


In [31]:
games[~games['country_noc'].isin(cou['noc'])]

Unnamed: 0,edition_id,country_noc,competition_start_date,competition_end_date
16,18,FRG,1972-08-26,1972-09-11


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import unicodedata
import json

# ============================
# 1. LOAD RAW DATA
# ============================
# Adjust paths to your docker/airflow layout
staging = Path("../data/staging")


games = pd.read_csv(staging /"Olympic_Athlete_Event_Results.csv")

games.isna().sum()


edition             0
edition_id          0
country_noc         0
sport               0
event               0
result_id           0
athlete_id          0
medal          272147
dtype: int64

In [3]:
games['medal'].unique()

array([nan, 'Bronze', 'Silver', 'Gold'], dtype=object)

In [37]:
import pandas as pd
import numpy as np
from pathlib import Path
import unicodedata
import json

# ============================
# 1. LOAD RAW DATA
# ============================
# Adjust paths to your docker/airflow layout
staging = Path("../data/landing")


res = pd.read_csv(staging /"Olympic_Athlete_Event_Results.csv")

res.isna().sum()

edition             0
edition_id          0
country_noc         0
sport               0
event               0
result_id           0
athlete             0
athlete_id          0
pos                 0
medal          272147
isTeamSport         0
dtype: int64

In [38]:
len(res)

316834

In [39]:
res.nunique()

edition            55
edition_id         55
country_noc       231
sport             112
event             964
result_id        7397
athlete        154213
athlete_id     155867
pos              2449
medal               3
isTeamSport         2
dtype: int64

In [40]:
len(res.drop_duplicates())

315626

In [41]:
res=res.drop_duplicates()

In [42]:
cols = ['athlete_id', 'edition','edition_id', 'sport', 'event','result_id','country_noc','medal']

duplicates = res[res.duplicated(subset=cols, keep=False)]
duplicates

Unnamed: 0,edition,edition_id,country_noc,sport,event,result_id,athlete,athlete_id,pos,medal,isTeamSport
37884,1936 Summer Olympics,11,BEL,Art Competitions,"Sculpturing, Medals, Open",920057,Willy Kreitz,88615,AC,,False
37889,1936 Summer Olympics,11,BEL,Art Competitions,"Sculpturing, Medals, Open",920057,Willy Kreitz,88615,HC,,False
53895,1948 Summer Olympics,12,DEN,Art Competitions,"Sculpturing, Statues, Open",920073,Knud Nellemose,920234,HM,,False
53897,1948 Summer Olympics,12,DEN,Art Competitions,"Sculpturing, Statues, Open",920073,Knud Nellemose,920234,AC,,False
57474,1936 Summer Olympics,11,AUT,Art Competitions,"Sculpturing, Statues, Open",920056,Josef Humplik,920032,HM,,False
...,...,...,...,...,...,...,...,...,...,...,...
316829,2022 Winter Olympics,62,NED,Bobsleigh,"Monobob, Women",19019671,Karlien Sleper,148662,16,,False
316830,2022 Winter Olympics,62,SVK,Bobsleigh,"Monobob, Women",19019671,Viktória Čerňanská,138683,17,,False
316831,2022 Winter Olympics,62,KOR,Bobsleigh,"Monobob, Women",19019671,Kim Yu-Ran,137542,18,,False
316832,2022 Winter Olympics,62,JAM,Bobsleigh,"Monobob, Women",19019671,Jazmine Fenlator-Victorian,128682,19,,False


In [9]:
bio[['name','sex','country_noc']].drop_duplicates()

Unnamed: 0,name,sex,country_noc
0,Ivanka Bonova,FEMALE,BUL
1,Nataliya Uryadova,FEMALE,RUS
2,Essa Ismail Rashed,MALE,QAT
3,Péter Boros,MALE,HUN
4,Rudolf Piowatý,MALE,TCH
...,...,...,...
155856,Todd Makler,MALE,USA
155857,Géza Hollósi,MALE,HUN
155858,József Keresztessy,MALE,HUN
155859,Alexander Thieme,MALE,GER


In [44]:
import pandas as pd
import numpy as np
from pathlib import Path
import unicodedata
import json

# ============================
# 1. LOAD RAW DATA
# ============================
# Adjust paths to your docker/airflow layout
staging = Path("../data/staging")



res = pd.read_csv(staging /"02_normalize_date.csv")
nd.isna().sum()

DisNo.                   0
Disaster Group           0
Disaster Subgroup        0
Disaster Type            0
Disaster Subtype         0
Event Name           15340
ISO                      0
Country                  0
Subregion                0
Region                   0
Location              1537
Origin               17970
Magnitude            18251
Magnitude Scale       8906
Latitude             20516
Longitude            20516
Start Date               0
End Date                 0
dtype: int64

In [45]:
nd.nunique()

DisNo.               22710
Disaster Group           2
Disaster Subgroup        9
Disaster Type           31
Disaster Subtype        66
Event Name            3118
ISO                    227
Country                226
Subregion               17
Region                   5
Location             18470
Origin                 870
Magnitude             1863
Magnitude Scale          6
Latitude              1940
Longitude             1935
Start Date           10213
End Date             10252
dtype: int64

In [46]:
len(nd)

22710

In [47]:
res = pd.read_csv(staging /"Olympic_Athlete_Event_Results.csv")

In [48]:
res.isna().sum()

edition             0
edition_id          0
country_noc         0
sport               0
event               0
result_id           0
athlete_id          0
medal          270876
dtype: int64

In [50]:
len(res['medal'].notnull())

315559

In [52]:
res[res['medal'].notnull()]

Unnamed: 0,edition,edition_id,country_noc,sport,event,result_id,athlete_id,medal
20,1908 Summer Olympics,5,ANZ,Athletics,"3,500 metres Race Walk, Men",56421,64719,Bronze
30,1908 Summer Olympics,5,ANZ,Boxing,"Middleweight, Men",21263,45153,Silver
32,1908 Summer Olympics,5,ANZ,Rugby,"Rugby, Men",31505,11237,Gold
33,1908 Summer Olympics,5,ANZ,Rugby,"Rugby, Men",31505,11239,Gold
34,1908 Summer Olympics,5,ANZ,Rugby,"Rugby, Men",31505,11240,Gold
...,...,...,...,...,...,...,...,...
315546,2022 Winter Olympics,62,USA,Speed Skating,"Team Pursuit (8 laps), Men",19020410,149164,Bronze
315547,2022 Winter Olympics,62,USA,Speed Skating,"Team Pursuit (8 laps), Men",19020410,128783,Bronze
315548,2022 Winter Olympics,62,USA,Speed Skating,"Team Pursuit (8 laps), Men",19020410,128784,Bronze
315549,2022 Winter Olympics,62,USA,Speed Skating,"500 metres, Women",19020424,138378,Gold


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import unicodedata
import json

# ============================
# 1. LOAD RAW DATA
# ============================
# Adjust paths to your docker/airflow layout
staging = Path("../data/staging")



nd = pd.read_csv(staging /"02_normalize_date.csv")

nd.isna().sum()

DisNo.                   0
Disaster Group           0
Disaster Subgroup        0
Disaster Type            0
Disaster Subtype         0
Event Name           15340
ISO                      0
Country                  0
Subregion                0
Region                   0
Location              1537
Origin               17970
Magnitude            18251
Magnitude Scale       8906
Latitude             20516
Longitude            20516
Start Date               0
End Date                 0
dtype: int64

In [2]:
nd.nunique()

DisNo.               22710
Disaster Group           2
Disaster Subgroup        9
Disaster Type           31
Disaster Subtype        66
Event Name            3118
ISO                    227
Country                226
Subregion               17
Region                   5
Location             18470
Origin                 870
Magnitude             1863
Magnitude Scale          6
Latitude              1940
Longitude             1935
Start Date           10213
End Date             10252
dtype: int64

In [3]:
len(nd)

22710

In [9]:
nd['Magnitude Scale'].drop_duplicates()

0                  Km2
1                  NaN
16                 Kph
23          Vaccinated
26    Moment Magnitude
35                  m3
65                  °C
Name: Magnitude Scale, dtype: object

In [13]:
nd[nd['Magnitude Scale']=='Moment Magnitude']

Unnamed: 0,DisNo.,Disaster Group,Disaster Subgroup,Disaster Type,Disaster Subtype,Event Name,ISO,Country,Subregion,Region,Location,Origin,Magnitude,Magnitude Scale,Latitude,Longitude,Start Date,End Date
26,1987-0058-PNG,Natural,Geophysical,Earthquake,Ground movement,,PNG,Papua New Guinea,Melanesia,Oceania,"Umboi Isl., Finschafen area of Huon peninsula",,7.4,Moment Magnitude,-6.088,147.689,1987-02-09,1987-02-09
34,1987-0068-NZL,Natural,Geophysical,Earthquake,Ground movement,,NZL,New Zealand,Australia and New Zealand,Oceania,Edgecumbe (Bay of Plenty),,6.5,Moment Magnitude,-37.965,176.765,1987-03-02,1987-03-02
36,1987-0070-ECU,Natural,Geophysical,Earthquake,Ground movement,,ECU,Ecuador,Latin America and the Caribbean,Americas,"Carchi, Imbabura, Pastaza, Napo provinces",,7.2,Moment Magnitude,0.151,-77.785,1987-03-05,1987-03-05
48,1987-0086-IDN,Natural,Geophysical,Earthquake,Ground movement,,IDN,Indonesia,South-eastern Asia,Asia,Tarutung (North Sumatra),,6.4,Moment Magnitude,2.244,98.866,1987-04-26,1987-04-26
63,1987-0101-IRN,Natural,Geophysical,Earthquake,Ground movement,,IRN,ISLAMIC REPUBLIC OF IRAN,Southern Asia,Asia,NAHAVAND-HAMADEN-TUYSARKAN (Southwest Teheran),,4.6,Moment Magnitude,34.076,48.266,1987-05-29,1987-05-29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22663,2025-0844-PHL,Natural,Geophysical,Earthquake,Ground movement,,PHL,Philippines,South-eastern Asia,Asia,Bogo (Cebu Isl.),,6.9,Moment Magnitude,11.136,124.127,2025-10-01,2025-10-01
22672,2025-0867-ETH,Natural,Geophysical,Earthquake,Ground movement,,ETH,Ethiopia,Sub-Saharan Africa,Africa,Afar and Tigray regions,,5.7,Moment Magnitude,,,2025-10-11,2025-10-11
22674,2025-0870-PHL,Natural,Geophysical,Earthquake,Ground movement,,PHL,Philippines,South-eastern Asia,Asia,"Davao Oriental, Davao de Oro (Mindanao Island)",,6.7,Moment Magnitude,7.172,126.755,2025-10-10,2025-10-10
22681,2025-0882-IDN,Natural,Geophysical,Earthquake,Ground movement,,IDN,Indonesia,South-eastern Asia,Asia,Sarmi region (Papua),,6.5,Moment Magnitude,-2.168,138.938,2025-10-16,2025-10-16
