<h1 style="font-size: xxx-large">SMDA PROJECT:
DATA COLLECTION AND PREPROCESSING</h1>


<h1 style="font-size:-large">Loading Packages</h1>

In [1]:
import requests
import pandas as pd
from datetime import datetime
import calendar
from matplotlib import pyplot as plt
from datetime import timedelta
import ast



<h1 style="font-size:-large">Retrieving List of Wikipedia Articles</h1>

In [None]:
API_URL = "https://en.wikipedia.org/w/api.php"

HEADERS = {
    "User-Agent": "AviationResearchBot/1.0 (-)"
}


def get_category_members(category, cmtype="page", limit=500):
    """
    Get members of a Wikipedia category.
    cmtype = "page" (articles), "subcat" (subcategories), or "file"
    """
    members = []
    cmcontinue = None

    while True:
        params = {
            "action": "query",
            "list": "categorymembers",
            "cmtitle": f"Category:{category}",
            "cmlimit": limit,
            "cmtype": cmtype,
            "format": "json"
        }
        if cmcontinue:
            params["cmcontinue"] = cmcontinue

        r = requests.get(API_URL, params=params, headers=HEADERS)

        if r.status_code != 200:
            print(f"HTTP error {r.status_code} for category {category}")
            break
        try:
            data = r.json()
        except Exception:
            print("JSON decode error. Raw response:")
            print(r.text[:500])
            break

        members.extend(data["query"]["categorymembers"])

        if "continue" in data:
            cmcontinue = data["continue"]["cmcontinue"]
            time.sleep(0.1) 
        else:
            break

    return members


def get_all_articles_from_category(category, depth=1, max_depth=2):
    """
    Recursively get all articles from a category, up to max_depth levels.
    depth=1 means root category.
    """
    print(f"{'  ' * (depth-1)}Fetching {category} (depth {depth})")

    articles = []
    articles.extend(get_category_members(category, cmtype="page"))

    if depth >= max_depth:
        return articles

    subcats = get_category_members(category, cmtype="subcat")

    for subcat in subcats:
        subcat_title = subcat["title"].replace("Category:", "")
        articles.extend(get_all_articles_from_category(subcat_title, depth+1, max_depth))

    return articles


articles_country = get_all_articles_from_category("Genocide remembrance days ", max_depth=0)
print(f"Found {len(articles_country)} articles")

all_articles = {a["pageid"]: a["title"] for a in (articles_country)}
print(f"Total unique articles collected: {len(all_articles)}")

df_articles = pd.DataFrame(list(all_articles.items()), columns=["pageid", "title"])
#df_articles.to_csv("Full_list_rememberance_days.csv", index=False)


Fetching Genocide remembrance days  (depth 1)
Found 11 articles
Total unique articles collected: 11


<h1 style="font-size:-large">Retrieving Hyperlinks of Wikipedia Articles</h1>

In [None]:
def get_wikipedia_links(pageid):
    url = "https://en.wikipedia.org/w/api.php"
    params = {
        "action": "query",
        "prop": "links",
        "pageids": pageid,
        "format": "json",
        "pllimit": "max"
    }
    headers = {
        "User-Agent": "MyWikipediaScraper/1.0 (-)"
    }

    links = []
    while True:
        response = requests.get(url, params=params, headers=headers)

        if response.status_code != 200:
            print(f"Error {response.status_code} for pageid {pageid}")
            return []

        try:
            data = response.json()
        except Exception as e:
            print(f"Could not decode JSON for pageid {pageid}: {e}")
            print("Response text:", response.text[:500])  
            return []

        pages = data.get("query", {}).get("pages", {})
        if str(pageid) in pages:
            page = pages[str(pageid)]
            if "links" in page:
                for link in page["links"]:
                    links.append(link["title"])

        if "continue" in data:
            params.update(data["continue"])
        else:
            break

    return links

df_articles["links"] = df_articles["pageid"].apply(get_wikipedia_links)

print(df_articles.head())


     pageid                                  title  \
0   4575104      Armenian Genocide Remembrance Day   
1  25966474                       Black Ribbon Day   
2  17539056             Circassian Day of Mourning   
3  53453368  Genocide Remembrance Day (Bangladesh)   
4  69380022                 Holodomor Memorial Day   

                                               links  
0  [100th anniversary of the Armenian Genocide, 1...  
1  [2020 Belarusian protests, 2022 Russian invasi...  
2  [Abzakhs, Ademey, Adygea, Adyghe Xabze, Adyghe...  
3  [1958 Pakistani coup d'état, 1969 Mass uprisin...  
4  [Bridget A. Brink, Canada, Causes of the Holod...  


In [26]:
articles = [title.replace(" ", "_") for title in all_articles.values()]

In [27]:
articles

['Armenian_Genocide_Remembrance_Day',
 'Black_Ribbon_Day',
 'Circassian_Day_of_Mourning',
 'Genocide_Remembrance_Day_(Bangladesh)',
 'Holodomor_Memorial_Day',
 'International_Day_of_Reflection_on_the_1994_Rwanda_Genocide',
 'Martyred_Intellectuals_Day',
 'National_Day_for_Truth_and_Reconciliation',
 'National_Day_of_Remembrance_(Cambodia)',
 'National_Sorry_Day',
 'White_Armband_Day']

<h1 style="font-size:-large">Retrieving daily Pageviews of Wikipedia Articles</h1>

In [370]:
def get_pageviews(article, start, end):
    """
    Get daily Wikipedia pageviews for one article between start and end dates.
    start, end format: YYYYMMDD
    """
    url = (
        f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/"
        f"en.wikipedia/all-access/all-agents/{article}/daily/{start}/{end}"
    )
    headers = {"User-Agent": "MyResearchBot/1.0 (your_email@example.com)"}

    r = requests.get(url, headers=headers)
    if r.status_code != 200:
        print(f"Error {r.status_code}: Could not fetch {article}")
        return pd.DataFrame()

    data = r.json()
    if "items" not in data:
        return pd.DataFrame()

    views = [
        {"date": item["timestamp"][:8], "views": item["views"]}
        for item in data["items"]
    ]

    df = pd.DataFrame(views)
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
    df["article"] = article
    return df

In [371]:
# Define  start and end dates
start_date = "20160101"
end_date = "20250101"

# Apply the function to all articles and concatenate the results
page_views = pd.concat([get_pageviews(article, start_date, end_date) for article in articles], ignore_index=True)

# Display the combined DataFrame
print(page_views)

Error 404: Could not fetch Day_of_Salvation_and_Liberation
            date  views         article
0     2016-01-01    205  Liberation_Day
1     2016-01-02    111  Liberation_Day
2     2016-01-03     82  Liberation_Day
3     2016-01-04    115  Liberation_Day
4     2016-01-05    130  Liberation_Day
...          ...    ...             ...
87907 2024-12-28     10    Waterloo_Day
87908 2024-12-29      7    Waterloo_Day
87909 2024-12-30      9    Waterloo_Day
87910 2024-12-31      8    Waterloo_Day
87911 2025-01-01      2    Waterloo_Day

[87912 rows x 3 columns]


In [372]:
page_views

Unnamed: 0,date,views,article
0,2016-01-01,205,Liberation_Day
1,2016-01-02,111,Liberation_Day
2,2016-01-03,82,Liberation_Day
3,2016-01-04,115,Liberation_Day
4,2016-01-05,130,Liberation_Day
...,...,...,...
87907,2024-12-28,10,Waterloo_Day
87908,2024-12-29,7,Waterloo_Day
87909,2024-12-30,9,Waterloo_Day
87910,2024-12-31,8,Waterloo_Day


In [373]:
page_views["date"] = pd.to_datetime(page_views["date"])

start_date = pd.to_datetime("2015-01-01")
end_date   = pd.to_datetime("2025-01-01")
full_range = pd.date_range(start=start_date, end=end_date, freq="D")

# Count number of days per article
coverage = (
    page_views.groupby("article")["date"]
    .nunique()
    .reset_index(name="n_days")
)



In [374]:
coverage

Unnamed: 0,article,n_days
0,Armed_Forces_Day_(Poland),3289
1,Day_of_Victory_in_the_Great_Fatherland_Liberat...,3289
2,Independence_Day_(Abkhazia),2534
3,Kargil_Vijay_Diwas,3289
4,Liberation_Day,3289
...,...,...
25,Victory_in_Europe_Day,3289
26,Victory_in_Europe_Day_(Israel),2456
27,Victory_over_Japan_Day,3289
28,Võidupüha,3204


In [375]:
coverage["n_days"].unique()

array([3289, 2534, 3287, 1380, 1679, 2469, 3276, 3139, 1700, 1490, 3285,
       2265, 3132, 2456, 3204, 3283])

In [151]:
expected_days

3654

<h1 style="font-size:-large">Retrieving Metadata of Wikipedia Articles</h1>

In [38]:
API_URL = "https://en.wikipedia.org/w/api.php"
WIKIDATA_API = "https://www.wikidata.org/w/api.php"
HEADERS = {"User-Agent": "RemembranceMetadataFetcher/1.0"}

# Helper: Wikidata label fetch
def get_label_from_wikidata(qid, language="en"):
    params = {
        "action": "wbgetentities",
        "ids": qid,
        "format": "json",
        "languages": language
    }
    r = requests.get(WIKIDATA_API, params=params, headers=HEADERS)
    data = r.json()
    try:
        return data["entities"][qid]["labels"][language]["value"]
    except:
        return None

def get_claims(qid):
    params = {"action": "wbgetclaims", "entity": qid, "format": "json"}
    r = requests.get(WIKIDATA_API, params=params, headers=HEADERS)
    return r.json().get("claims", {})

# Article categories + subcategories
def get_article_categories(title):
    params = {
        "action": "query",
        "titles": title,
        "prop": "categories",
        "cllimit": "max",
        "format": "json"
    }
    r = requests.get(API_URL, params=params, headers=HEADERS)
    data = r.json()
    pages = data.get("query", {}).get("pages", {})
    categories = []
    for _, page in pages.items():
        cats = page.get("categories", [])
        categories = [c["title"] for c in cats]
    return categories

def get_subcategories(category_title, depth=1):
    subcats = []
    params = {
        "action": "query",
        "list": "categorymembers",
        "cmtitle": category_title,
        "cmtype": "subcat",
        "cmlimit": "max",
        "format": "json"
    }
    r = requests.get(API_URL, params=params, headers=HEADERS)
    data = r.json()
    members = data.get("query", {}).get("categorymembers", [])
    subcats = [m["title"] for m in members]
    if depth > 1:
        for sc in subcats:
            subcats.extend(get_subcategories(sc, depth=depth-1))
    return subcats

# Wikidata metadata
def get_wikidata_metadata(title):
    # Step 1: Get Wikidata item ID
    params = {
        "action": "query",
        "titles": title,
        "prop": "pageprops",
        "format": "json"
    }
    r = requests.get(API_URL, params=params, headers=HEADERS)
    data = r.json()
    pages = data.get("query", {}).get("pages", {})
    wikidata_id = None
    for _, page in pages.items():
        wikidata_id = page.get("pageprops", {}).get("wikibase_item")
    if not wikidata_id:
        return {}

    claims = get_claims(wikidata_id)
    metadata = {}

    # Event date
    for pid in ["P585", "P729", "P837"]:  # point in time, start time, day of year
        if pid in claims:
            try:
                if pid == "P837":  # day of year stored as QID
                    qid = claims[pid][0]["mainsnak"]["datavalue"]["value"]["id"]
                    metadata["event_date"] = get_label_from_wikidata(qid)
                else:
                    metadata["event_date"] = claims[pid][0]["mainsnak"]["datavalue"]["value"]["time"]
                break
            except:
                pass

    # Country + continent
    if "P17" in claims:  # country
        try:
            country_qid = claims["P17"][0]["mainsnak"]["datavalue"]["value"]["id"]
            metadata["country"] = get_label_from_wikidata(country_qid)
            # Look up continent from country
            country_claims = get_claims(country_qid)
            if "P30" in country_claims:
                cont_qid = country_claims["P30"][0]["mainsnak"]["datavalue"]["value"]["id"]
                metadata["continent"] = get_label_from_wikidata(cont_qid)
        except:
            pass

    return metadata

# Article creation date
def get_article_creation_date(title):
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvprop": "timestamp",
        "rvlimit": "1",
        "format": "json"
    }
    r = requests.get(API_URL, params=params, headers=HEADERS)
    pages = r.json().get("query", {}).get("pages", {})
    for _, page in pages.items():
        revs = page.get("revisions", [])
        if revs:
            return revs[0]["timestamp"]
    return None

# Main function: fetch all metadata
def fetch_article_metadata(title, depth=1):
    meta = {
        "article": title,
        "creation_date": get_article_creation_date(title),
    }

    # Categories + subcategories
    cats = get_article_categories(title)
    subcats = []
    for c in cats:
        subcats.extend(get_subcategories(c, depth=depth))
    meta["categories"] = cats
    meta["subcategories"] = subcats

    # Wikidata metadata
    wd_meta = get_wikidata_metadata(title)
    meta.update(wd_meta)

    return meta

# Example usage
metadata = []
for title in df_articles["title"]:  # your list of articles
    print(f"Fetching metadata for {title}...")
    try:
        info = fetch_article_metadata(title, depth=1)
        metadata.append(info)
        time.sleep(0.2)
    except Exception as e:
        print(f"Error fetching {title}: {e}")

df_meta = pd.DataFrame(metadata)
print(df_meta.head())


Fetching metadata for Armenian Genocide Remembrance Day...
Fetching metadata for Black Ribbon Day...
Fetching metadata for Circassian Day of Mourning...
Fetching metadata for Genocide Remembrance Day (Bangladesh)...
Fetching metadata for Holodomor Memorial Day...
Fetching metadata for International Day of Reflection on the 1994 Rwanda Genocide...
Fetching metadata for Martyred Intellectuals Day...
Fetching metadata for National Day for Truth and Reconciliation...
Fetching metadata for National Day of Remembrance (Cambodia)...
Fetching metadata for National Sorry Day...
Fetching metadata for White Armband Day...
                                 article         creation_date  \
0      Armenian Genocide Remembrance Day  2025-06-25T04:50:21Z   
1                       Black Ribbon Day  2025-09-16T06:47:57Z   
2             Circassian Day of Mourning  2025-04-09T16:55:18Z   
3  Genocide Remembrance Day (Bangladesh)  2025-07-07T22:15:07Z   
4                 Holodomor Memorial Day  2024-10-3

In [39]:
df_meta

Unnamed: 0,article,creation_date,categories,subcategories,event_date,country,continent
0,Armenian Genocide Remembrance Day,2025-06-25T04:50:21Z,"[Category:April observances, Category:Armenian...","[Category:April Fools' Day, Category:Earth Day...",April 24,,
1,Black Ribbon Day,2025-09-16T06:47:57Z,"[Category:1986 establishments in Canada, Categ...","[Category:1986 establishments in Alberta, Cate...",August 23,,
2,Circassian Day of Mourning,2025-04-09T16:55:18Z,"[Category:1990 establishments in Russia, Categ...",[Category:Railway stations in Russia opened in...,,,
3,Genocide Remembrance Day (Bangladesh),2025-07-07T22:15:07Z,"[Category:1971 Bangladesh genocide, Category:A...",[Category:1971 Bangladesh genocide perpetrator...,March 25,,
4,Holodomor Memorial Day,2024-10-31T01:04:31Z,"[Category:1998 establishments in Ukraine, Cate...",[Category:Ukrainian companies established in 1...,fourth Saturday in November,Ukraine,Europe
5,International Day of Reflection on the 1994 Rw...,2023-05-11T10:32:53Z,"[Category:April observances, Category:Genocide...","[Category:April Fools' Day, Category:Earth Day...",April 7,,
6,Martyred Intellectuals Day,2025-07-11T01:11:00Z,[Category:1971 killing of Bengali intellectual...,[Category:Short description is different from ...,December 14,,
7,National Day for Truth and Reconciliation,2025-09-11T13:04:05Z,"[Category:2013 establishments in Canada, Categ...",[Category:Canadian companies established in 20...,September 30,Canada,North America
8,National Day of Remembrance (Cambodia),2025-08-06T09:22:46Z,"[Category:Aftermath of the Cambodian genocide,...","[Category:Cambodian–Vietnamese War, Category:K...",May 20,Cambodia,Asia
9,National Sorry Day,2025-08-11T06:41:50Z,[Category:All Wikipedia articles written in Au...,[Category:Short description is different from ...,May 26,Australia,Oceania


In [41]:
df_meta = df_meta[['article', 'event_date']]

In [430]:
df_meta = pd.read_csv("df_meta.csv")

In [431]:
df_articles = df_articles.rename(columns = { "title" : "article"})

In [422]:
df_meta= pd.merge(df_meta, df_articles, how = "right", on= ["article"])

In [417]:
################
df_meta = df_articles.merge(
    df_meta,
    on="article",  
    how="left"     
)

In [435]:
cols_to_add = ["article", "country", "continent", "categories", "subcategories", "event_date", "country", "continent"]
df_merged = df_articles.merge(
    df_meta[cols_to_add],
    on="article",
    how="left"
)


In [436]:
df_merged

Unnamed: 0,pageid,article,Categorie,categories_x,subcategories_x,event_date_x,country_x,continent_x,event_windows_x,categories_y,...,country_y,continent_y,event_windows_y,country,continent,categories,subcategories,event_date,country.1,continent.1
0,4575104,Armenian Genocide Remembrance Day,Genocide,"['Category:April observances', 'Category:Armen...","[""Category:April Fools' Day"", 'Category:Earth ...",April 24,,,"{2015: ['2015-04-21', '2015-04-27'], 2016: ['2...","['Category:April observances', 'Category:Armen...",...,,,"{2015: ['2015-04-21', '2015-04-27'], 2016: ['2...",,,"['Category:April observances', 'Category:Armen...","[""Category:April Fools' Day"", 'Category:Earth ...",April 24,,
1,25966474,Black Ribbon Day,Genocide,"['Category:1986 establishments in Canada', 'Ca...","['Category:1986 establishments in Alberta', 'C...",August 23,,,"{2015: ['2015-08-20', '2015-08-26'], 2016: ['2...","['Category:1986 establishments in Canada', 'Ca...",...,,,"{2015: ['2015-08-20', '2015-08-26'], 2016: ['2...",,,"['Category:1986 establishments in Canada', 'Ca...","['Category:1986 establishments in Alberta', 'C...",August 23,,
2,17539056,Circassian Day of Mourning,Genocide,,,,,,,,...,,,,,,"['Category:1990 establishments in Russia', 'Ca...",['Category:Railway stations in Russia opened i...,,,
3,53453368,Genocide Remembrance Day (Bangladesh),Genocide,"['Category:1971 Bangladesh genocide', 'Categor...",['Category:1971 Bangladesh genocide perpetrato...,March 25,,,"{2015: ['2015-03-22', '2015-03-28'], 2016: ['2...","['Category:1971 Bangladesh genocide', 'Categor...",...,,,"{2015: ['2015-03-22', '2015-03-28'], 2016: ['2...",,,"['Category:1971 Bangladesh genocide', 'Categor...",['Category:1971 Bangladesh genocide perpetrato...,March 25,,
4,69380022,Holodomor Memorial Day,Genocide,"['Category:1998 establishments in Ukraine', 'C...",['Category:Ukrainian companies established in ...,fourth Saturday in November,Ukraine,Europe,,"['Category:1998 establishments in Ukraine', 'C...",...,Ukraine,Europe,,Ukraine,Europe,"['Category:1998 establishments in Ukraine', 'C...",['Category:Ukrainian companies established in ...,fourth Saturday in November,Ukraine,Europe
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,44244013,Victory Day (Maldives),Independence days,,,,,,,,...,,,,,,"['Category:Annual events in the Maldives', 'Ca...",['Category:Short description is different from...,,,
95,215257,Victory in Europe Day,Independence days,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",+1945-05-08T00:00:00Z,France,Europe,,"['Category:1945 in Europe', 'Category:All arti...",...,France,Europe,,France,Europe,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",+1945-05-08T00:00:00Z,France,Europe
96,57115772,Victory in Europe Day (Israel),Independence days,,,,,,,,...,,,,Israel,Asia,['Category:All articles with unsourced stateme...,['Category:Articles containing Biblical Hebrew...,,Israel,Asia
97,17678597,Võidupüha,Independence days,['Category:All articles needing additional ref...,['Category:New York City public transportation...,June 23,Estonia,Europe,"{2015: ['2015-06-20', '2015-06-26'], 2016: ['2...",['Category:All articles needing additional ref...,...,Estonia,Europe,"{2015: ['2015-06-20', '2015-06-26'], 2016: ['2...",Estonia,Europe,['Category:All articles needing additional ref...,['Category:New York City public transportation...,June 23,Estonia,Europe


In [390]:
df_meta = df_meta.drop('creation_date', axis=1)


In [437]:
df_meta = df_meta.dropna(subset=['event_date'])

In [43]:
df_meta

Unnamed: 0,article,event_date,event_windows
0,Armenian Genocide Remembrance Day,April 24,"{2016: ['2016-04-21', '2016-04-27'], 2017: ['2..."
1,Black Ribbon Day,August 23,"{2016: ['2016-08-20', '2016-08-26'], 2017: ['2..."
2,Circassian Day of Mourning,,
3,Genocide Remembrance Day (Bangladesh),March 25,"{2016: ['2016-03-22', '2016-03-28'], 2017: ['2..."
4,Holodomor Memorial Day,fourth Saturday in November,
5,International Day of Reflection on the 1994 Rw...,April 7,"{2016: ['2016-04-04', '2016-04-10'], 2017: ['2..."
6,Martyred Intellectuals Day,December 14,"{2016: ['2016-12-11', '2016-12-17'], 2017: ['2..."
7,National Day for Truth and Reconciliation,September 30,"{2016: ['2016-09-27', '2016-10-03'], 2017: ['2..."
8,National Day of Remembrance (Cambodia),May 20,"{2016: ['2016-05-17', '2016-05-23'], 2017: ['2..."
9,National Sorry Day,May 26,"{2016: ['2016-05-23', '2016-05-29'], 2017: ['2..."


In [438]:
page_views["article"] = page_views["article"].str.replace('_', ' ')

<h1 style="font-size:-large">Creating a 7-day Window for the Event</h1>

In [42]:
def make_window(event_date_str, years, window=3):
    try:
        # Try parsing as fixed day/month (e.g., "April 24")
        base = pd.to_datetime(event_date_str + " 2000", format="%B %d %Y")
        windows = {
            year: [
                (base.replace(year=year) - timedelta(days=window)).strftime("%Y-%m-%d"),
                (base.replace(year=year) + timedelta(days=window)).strftime("%Y-%m-%d"),
            ]
            for year in years
        }
        return windows
    except Exception:
        # Handle "movable" holidays separately later
        return None

# Example: generate windows for 2015–2025
years = range(2016, 2026)
df_meta["event_windows"] = df_meta["event_date"].apply(lambda d: make_window(d, years))

print(df_meta[["article", "event_date", "event_windows"]])

                                              article  \
0                   Armenian Genocide Remembrance Day   
1                                    Black Ribbon Day   
2                          Circassian Day of Mourning   
3               Genocide Remembrance Day (Bangladesh)   
4                              Holodomor Memorial Day   
5   International Day of Reflection on the 1994 Rw...   
6                          Martyred Intellectuals Day   
7           National Day for Truth and Reconciliation   
8              National Day of Remembrance (Cambodia)   
9                                  National Sorry Day   
10                                  White Armband Day   

                     event_date  \
0                      April 24   
1                     August 23   
2                           NaN   
3                      March 25   
4   fourth Saturday in November   
5                       April 7   
6                   December 14   
7                  September 30   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_meta["event_windows"] = df_meta["event_date"].apply(lambda d: make_window(d, years))


<h1 style="font-size:-large">Merging Dataframes</h1>

In [441]:
merged = page_views.merge(
    df_meta,
    on="article",  
    how="left"      # keep all page_views rows
)

In [443]:
#######################
#different try 
merged = page_views.merge(
    df_merged,
    on="article",  
    how="left"      # keep all page_views rows
)

In [444]:
merged

Unnamed: 0,date,views,article,pageid,Categorie,categories_x,subcategories_x,event_date_x,country_x,continent_x,...,country_y,continent_y,event_windows_y,country,continent,categories,subcategories,event_date,country.1,continent.1
0,2016-01-01,205,Liberation Day,1106409,Independence days,,,,,,...,,,,,,"['Category:April observances', 'Category:Artic...","[""Category:April Fools' Day"", 'Category:Earth ...",,,
1,2016-01-01,205,Liberation Day,1106409,Independence days,,,,,,...,,,,,,"['Category:April observances', 'Category:Artic...","[""Category:April Fools' Day"", 'Category:Earth ...",,,
2,2016-01-02,111,Liberation Day,1106409,Independence days,,,,,,...,,,,,,"['Category:April observances', 'Category:Artic...","[""Category:April Fools' Day"", 'Category:Earth ...",,,
3,2016-01-02,111,Liberation Day,1106409,Independence days,,,,,,...,,,,,,"['Category:April observances', 'Category:Artic...","[""Category:April Fools' Day"", 'Category:Earth ...",,,
4,2016-01-03,82,Liberation Day,1106409,Independence days,,,,,,...,,,,,,"['Category:April observances', 'Category:Artic...","[""Category:April Fools' Day"", 'Category:Earth ...",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179106,2024-12-30,9,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,...,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2...",United Kingdom of the Netherlands,Europe,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe
179107,2024-12-31,8,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,...,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2...",United Kingdom of the Netherlands,Europe,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe
179108,2024-12-31,8,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,...,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2...",United Kingdom of the Netherlands,Europe,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe
179109,2025-01-01,2,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,...,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2...",United Kingdom of the Netherlands,Europe,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe


In [404]:
merged

Unnamed: 0,date,views,article,pageid,Categorie,categories,subcategories,event_date,country,continent,event_windows
0,2016-01-01,205,Liberation Day,1106409,Independence days,,,,,,
1,2016-01-01,205,Liberation Day,1106409,Independence days,,,,,,
2,2016-01-02,111,Liberation Day,1106409,Independence days,,,,,,
3,2016-01-02,111,Liberation Day,1106409,Independence days,,,,,,
4,2016-01-03,82,Liberation Day,1106409,Independence days,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
179106,2024-12-30,9,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2..."
179107,2024-12-31,8,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2..."
179108,2024-12-31,8,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2..."
179109,2025-01-01,2,Waterloo Day,1640108,Independence days,"['Category:All stub articles', 'Category:Annua...","['Category:Long stubs with short prose', 'Cate...",June 18,United Kingdom of the Netherlands,Europe,"{2015: ['2015-06-15', '2015-06-21'], 2016: ['2..."


In [182]:
page_views.duplicated(subset=["article", "date"]).sum()


np.int64(0)

<h1 style="font-size:-large">Creating Boolean on whether Pageview is in event window or not</h1>

In [53]:
merged.to_csv("final_df.csv")

In [905]:
merged = pd.read_csv("final_df.csv")

<h1 style="font-size:-large">Setting threshold (original paper kept only top 11 sources)</h1>

In [226]:
merged[merged["article"] == "Armenian Genocide Remembrance Day"].shape[0]


3289

In [405]:
# Count number of rows per article
counts = merged["article"].value_counts()

# Get articles with exactly 3289 rows
valid_articles = counts[counts == 3289].index

# Filter the dataframe
merged = merged[merged["article"].isin(valid_articles)]


In [406]:
merged

Unnamed: 0,date,views,article,pageid,Categorie,categories,subcategories,event_date,country,continent,event_windows


In [243]:
# Calculate average daily views per article
avg_views = merged.groupby('article')['views'].mean()

threshold = avg_views.quantile(0.3)  
high_traffic_articles = avg_views[avg_views >= threshold].index

# Filter  dataset
df_filtered = merged[merged['article'].isin(high_traffic_articles)]

In [264]:
# Compute average views per article
avg_views = merged.groupby("article")["views"].mean()

# Keep only articles with avg >= 100
articles_to_keep = avg_views[avg_views >= 100].index
df_filtered = merged[merged["article"].isin(articles_to_keep)]


In [275]:
exclude_articles = [
    'List of countries that have gained independence from the United Kingdom',
    'List of national independence days', 'Liberation Day'
]

df_filtered = df_filtered[~df_filtered["article"].isin(exclude_articles)]

In [276]:
df_filtered

Unnamed: 0,date,views,article,categories,subcategories,event_date,country,continent,pageid,links,event_windows
9853,2016-01-01,1363,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9854,2016-01-02,1010,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9855,2016-01-03,1066,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9856,2016-01-04,1193,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9857,2016-01-05,1251,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
...,...,...,...,...,...,...,...,...,...,...,...
747128,2024-12-28,1178,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."
747129,2024-12-29,1113,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."
747130,2024-12-30,1205,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."
747131,2024-12-31,1083,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."


In [239]:
pd.set_option('display.max_rows', 60)    # Reset to default (or any number)


In [250]:
pd.set_option('display.max_rows', 10)    # Reset to default (or any number)


<h1 style="font-size:-large">Creating Source Target Pairs</h1>

In [251]:
# Create source-target pairs
source_events = df_filtered['article'].unique()  # All memorial days
for source in source_events:
    targets = [day for day in source_events if day != source]  # All others except self


In [252]:
pairs = [(source, target) for source in source_events 
                            for target in source_events if target != source]

In [277]:
df_filtered

Unnamed: 0,date,views,article,categories,subcategories,event_date,country,continent,pageid,links,event_windows
9853,2016-01-01,1363,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9854,2016-01-02,1010,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9855,2016-01-03,1066,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9856,2016-01-04,1193,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
9857,2016-01-05,1251,Cinco de Mayo,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,26482586.0,"[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-05-02', '2015-05-08'], 2016: ['2..."
...,...,...,...,...,...,...,...,...,...,...,...
747128,2024-12-28,1178,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."
747129,2024-12-29,1113,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."
747130,2024-12-30,1205,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."
747131,2024-12-31,1083,Victory in Europe Day,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,215257.0,"[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-05', '2015-05-11'], 2016: ['2..."


In [44]:
df_filtered = df_meta

<h1 style="font-size:-large">Mapping missing event days</h1>

In [53]:
df_filtered.loc[df_filtered["article"] == "Nakba Day", "event_date"] = "May 15"
df_filtered.loc[df_filtered["article"] == "Celebration of the Greek Revolution", "event_date"] = "March 25"
df_filtered.loc[df_filtered["article"] == "Grito de Lares", "event_date"] = "September 23"
df_filtered.loc[df_filtered["article"] == "Victory over Japan Day", "event_date"] = "August 15"
df_filtered.loc[df_filtered["article"] == "Victory in Europe Day", "event_date"] = "May 8"
df_filtered.loc[df_filtered["article"] == "Remembrance Sunday", "event_date"] = "November 9"
df_filtered.loc[df_filtered["article"] == "Victory Day", "event_date"] = "May 9"
df_filtered.loc[df_filtered["article"] == "Circassian Day of Mourning", "event_date"] = "May 21"
df_filtered.loc[df_filtered["article"] == "Circassian Day of Mourning", "event_date"] = "May 21"








In [None]:
#print(df_filtered[df_filtered['event_windows'].isnull()])
#df_filtered = df_filtered.dropna(subset = ['event_windows'], inplace=True)


        Unnamed: 0                article         creation_date  \
317163      317163            Yom HaShoah  2004-03-19T07:08:56Z   
317164      317164            Yom HaShoah  2004-03-19T07:08:56Z   
317165      317165            Yom HaShoah  2004-03-19T07:08:56Z   
317166      317166            Yom HaShoah  2004-03-19T07:08:56Z   
317167      317167            Yom HaShoah  2004-03-19T07:08:56Z   
...            ...                    ...                   ...   
781117      781117  Victory in Europe Day  2003-04-24T03:52:57Z   
781118      781118  Victory in Europe Day  2003-04-24T03:52:57Z   
781119      781119  Victory in Europe Day  2003-04-24T03:52:57Z   
781120      781120  Victory in Europe Day  2003-04-24T03:52:57Z   
781121      781121  Victory in Europe Day  2003-04-24T03:52:57Z   

                                               categories  \
317163  ['Category:Articles containing Hebrew-language...   
317164  ['Category:Articles containing Hebrew-language...   
317165  ['Ca

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered = df_filtered.dropna(subset = ['event_windows'], inplace=True)


In [54]:
def make_window(event_date_str, years, window=3):
    try:
        # Try parsing as fixed day/month (e.g., "April 24")
        base = pd.to_datetime(event_date_str + " 2000", format="%B %d %Y")
        windows = {
            year: [
                (base.replace(year=year) - timedelta(days=window)).strftime("%Y-%m-%d"),
                (base.replace(year=year) + timedelta(days=window)).strftime("%Y-%m-%d"),
            ]
            for year in years
        }
        return windows
    except Exception:
        # Handle "movable" holidays separately later
        return None

# Example: generate windows for 2015–2025
years = range(2015, 2025)
df_filtered["event_windows"] = df_filtered["event_date"].apply(lambda d: make_window(d, years))

print(df_filtered[["article", "event_date", "event_windows"]])

                                              article  \
0                   Armenian Genocide Remembrance Day   
1                                    Black Ribbon Day   
2                          Circassian Day of Mourning   
3               Genocide Remembrance Day (Bangladesh)   
4                              Holodomor Memorial Day   
5   International Day of Reflection on the 1994 Rw...   
6                          Martyred Intellectuals Day   
7           National Day for Truth and Reconciliation   
8              National Day of Remembrance (Cambodia)   
9                                  National Sorry Day   
10                                  White Armband Day   

                     event_date  \
0                      April 24   
1                     August 23   
2                        May 21   
3                      March 25   
4   fourth Saturday in November   
5                       April 7   
6                   December 14   
7                  September 30   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered["event_windows"] = df_filtered["event_date"].apply(lambda d: make_window(d, years))


In [55]:
jerusalem_day_windows = {
    2015: ['2015-05-21', '2015-05-21'],
    2016: ['2016-06-05', '2016-06-05'],
    2017: ['2017-05-29', '2017-05-29'],
    2018: ['2018-05-20', '2018-05-20'],
    2019: ['2019-05-12', '2019-05-12'],
    2020: ['2020-05-28', '2020-05-28'],
    2021: ['2021-05-10', '2021-05-10'],
    2022: ['2022-05-29', '2022-05-29'],
    2023: ['2023-05-18', '2023-05-18'],
    2024: ['2024-06-04', '2024-06-04'],
    2025: ['2025-05-27', '2025-05-27']
}

# Yom HaShoah (27 Nisan)
yom_hashoah_windows = {
    2015: ['2015-04-20', '2015-04-26'],
    2016: ['2016-05-04', '2016-05-10'],
    2017: ['2017-04-24', '2017-04-30'],
    2018: ['2018-04-11', '2018-04-17'],
    2019: ['2019-05-01', '2019-05-07'],
    2020: ['2020-04-19', '2020-04-25'],
    2021: ['2021-04-07', '2021-04-13'],
    2022: ['2022-04-27', '2022-05-03'],
    2023: ['2023-04-16', '2023-04-22'],
    2024: ['2024-05-05', '2024-05-11'],
    2025: ['2025-04-22', '2025-04-28']
}

# Memorial Day (last Monday in May)
memorial_day_windows = {
    2015: ['2015-05-25', '2015-05-25'],
    2016: ['2016-05-30', '2016-05-30'],
    2017: ['2017-05-29', '2017-05-29'],
    2018: ['2018-05-28', '2018-05-28'],
    2019: ['2019-05-27', '2019-05-27'],
    2020: ['2020-05-25', '2020-05-25'],
    2021: ['2021-05-31', '2021-05-31'],
    2022: ['2022-05-30', '2022-05-30'],
    2023: ['2023-05-29', '2023-05-29'],
    2024: ['2024-05-27', '2024-05-27'],
    2025: ['2025-05-26', '2025-05-26']
}

# Independence Day (Israel, 5 Iyar)
independence_day_windows = {
    2015: ['2015-04-22', '2015-04-22'],
    2016: ['2016-05-12', '2016-05-12'],
    2017: ['2017-04-25', '2017-04-25'],
    2018: ['2018-04-19', '2018-04-19'],
    2019: ['2019-04-29', '2019-04-29'],
    2020: ['2020-04-28', '2020-04-28'],
    2021: ['2021-04-15', '2021-04-15'],
    2022: ['2022-04-26', '2022-04-26'],
    2023: ['2023-04-26', '2023-04-26'],
    2024: ['2024-05-14', '2024-05-14'],
    2025: ['2025-04-22', '2025-04-22']
}

# Holodomor Memorial Day - fourth Saturday in November
holodomor_memorial_windows = {
    2015: ['2015-11-28', '2015-11-28'],
    2016: ['2016-11-26', '2016-11-26'],
    2017: ['2017-11-25', '2017-11-25'],
    2018: ['2018-11-24', '2018-11-24'],
    2019: ['2019-11-23', '2019-11-23'],
    2020: ['2020-11-28', '2020-11-28'],
    2021: ['2021-11-27', '2021-11-27'],
    2022: ['2022-11-26', '2022-11-26'],
    2023: ['2023-11-25', '2023-11-25'],
    2024: ['2024-11-23', '2024-11-23'],
    2025: ['2025-11-22', '2025-11-22']
}
# Combine into a dictionary for easy looping
manual_windows = {
    "Jerusalem Day": jerusalem_day_windows,
    "Yom HaShoah": yom_hashoah_windows,
    "Memorial Day": memorial_day_windows, 
    "Independence Day (Israel)": independence_day_windows,
    "Holodomor Memorial Day" : holodomor_memorial_windows

}




for article, window_dict in manual_windows.items():
    mask = df_filtered["article"] == article
    df_filtered.loc[mask, "event_windows"] = [window_dict] * mask.sum()


print(df_filtered)

                                              article  \
0                   Armenian Genocide Remembrance Day   
1                                    Black Ribbon Day   
2                          Circassian Day of Mourning   
3               Genocide Remembrance Day (Bangladesh)   
4                              Holodomor Memorial Day   
5   International Day of Reflection on the 1994 Rw...   
6                          Martyred Intellectuals Day   
7           National Day for Truth and Reconciliation   
8              National Day of Remembrance (Cambodia)   
9                                  National Sorry Day   
10                                  White Armband Day   

                     event_date  \
0                      April 24   
1                     August 23   
2                        May 21   
3                      March 25   
4   fourth Saturday in November   
5                       April 7   
6                   December 14   
7                  September 30   


In [60]:
df_meta

Unnamed: 0,article,event_date,event_windows
0,Armenian Genocide Remembrance Day,April 24,"{2015: ['2015-04-21', '2015-04-27'], 2016: ['2..."
1,Black Ribbon Day,August 23,"{2015: ['2015-08-20', '2015-08-26'], 2016: ['2..."
2,Circassian Day of Mourning,May 21,"{2015: ['2015-05-18', '2015-05-24'], 2016: ['2..."
3,Genocide Remembrance Day (Bangladesh),March 25,"{2015: ['2015-03-22', '2015-03-28'], 2016: ['2..."
4,Holodomor Memorial Day,fourth Saturday in November,"{2015: ['2015-11-28', '2015-11-28'], 2016: ['2..."
5,International Day of Reflection on the 1994 Rw...,April 7,"{2015: ['2015-04-04', '2015-04-10'], 2016: ['2..."
6,Martyred Intellectuals Day,December 14,"{2015: ['2015-12-11', '2015-12-17'], 2016: ['2..."
7,National Day for Truth and Reconciliation,September 30,"{2015: ['2015-09-27', '2015-10-03'], 2016: ['2..."
8,National Day of Remembrance (Cambodia),May 20,"{2015: ['2015-05-17', '2015-05-23'], 2016: ['2..."
9,National Sorry Day,May 26,"{2015: ['2015-05-23', '2015-05-29'], 2016: ['2..."


In [58]:
df_filtered.to_csv("newnew_try.csv", index = False)

Pairing Source and Target Articles with event window


In [283]:
# Prepare articles with all needed columns
articles = df_filtered[[
    "article", "event_windows", "subcategories", "categories", "links"
]].drop_duplicates(subset="article")

# Build all source-target pairs (excluding self-pairs)
pairs = pd.DataFrame([
    (src, tgt)
    for src in articles["article"]
    for tgt in articles["article"]
    if src != tgt
], columns=["source", "target"])

# Merge source info
pairs = pairs.merge(
    articles.rename(columns={
        "article": "source",
        "event_windows": "source_window",
        "subcategories": "source_subcategory",
        "categories": "source_category",
        "links": "source_links"
    }),
    on="source",
    how="left"
)

# Merge target info
pairs = pairs.merge(
    articles.rename(columns={
        "article": "target",
        "event_windows": "target_window",
        "subcategories": "target_subcategory",
        "categories": "target_category",
        "links": "target_links"
    }),
    on="target",
    how="left"
)

print(pairs.head())




          source                      target  \
0  Cinco de Mayo  Freedom Day (South Africa)   
1  Cinco de Mayo               Jerusalem Day   
2  Cinco de Mayo                   Nakba Day   
3  Cinco de Mayo        National Women's Day   
4  Cinco de Mayo       Republic Day (Turkey)   

                                       source_window  \
0  {2015: ['2015-05-02', '2015-05-08'], 2016: ['2...   
1  {2015: ['2015-05-02', '2015-05-08'], 2016: ['2...   
2  {2015: ['2015-05-02', '2015-05-08'], 2016: ['2...   
3  {2015: ['2015-05-02', '2015-05-08'], 2016: ['2...   
4  {2015: ['2015-05-02', '2015-05-08'], 2016: ['2...   

                                  source_subcategory  \
0  ['Category:Short description is different from...   
1  ['Category:Short description is different from...   
2  ['Category:Short description is different from...   
3  ['Category:Short description is different from...   
4  ['Category:Short description is different from...   

                                     

In [284]:
def days_between_windows(source_window, target_window, year):
    """Compute min difference in days between two windows for a given year."""
    s_start = datetime.strptime(source_window[year][0], "%Y-%m-%d")
    s_end = datetime.strptime(source_window[year][1], "%Y-%m-%d")
    t_start = datetime.strptime(target_window[year][0], "%Y-%m-%d")
    t_end = datetime.strptime(target_window[year][1], "%Y-%m-%d")
    
    # All possible combinations of start/end dates
    diffs = [
        abs((s_start - t_start).days),
        abs((s_start - t_end).days),
        abs((s_end - t_start).days),
        abs((s_end - t_end).days)
    ]
    return min(diffs)

def min_days_between_all_years(source_window, target_window):
    years = set(source_window.keys()) & set(target_window.keys())
    min_diff = float('inf')
    for y in years:
        diff = days_between_windows(source_window, target_window, y)
        if diff < min_diff:
            min_diff = diff
    return min_diff

pairs["min_day_diff"] = pairs.apply(
    lambda row: min_days_between_all_years(row["source_window"], row["target_window"]),
    axis=1
)

pairs_filtered = pairs[pairs["min_day_diff"] > 10]


In [285]:
pairs_filtered

Unnamed: 0,source,target,source_window,source_subcategory,source_category,source_links,target_window,target_subcategory,target_category,target_links,min_day_diff
3,Cinco de Mayo,National Women's Day,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-08-06', '2015-08-12'], 2016: ['2...",['Category:Short description is different from...,['Category:All Wikipedia articles written in S...,"[Afrikaans language, Apartheid era, Boxing Day...",90
4,Cinco de Mayo,Republic Day (Turkey),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-10-25', '2015-10-31'], 2016: ['2...",['Category:Short description is different from...,['Category:All articles with unsourced stateme...,"[100th Anniversary of the Republic of Turkey, ...",170
5,Cinco de Mayo,Waitangi Day,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-02-03', '2015-02-09'], 2016: ['2...",['Category:New Zealand companies established i...,['Category:1934 establishments in New Zealand'...,"[Air New Zealand, Anzac Day, Araucaria araucan...",82
6,Cinco de Mayo,National Day of the People's Republic of China,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-09-28', '2015-10-04'], 2016: ['2...",['Category:Articles containing simplified Chin...,['Category:Articles containing Chinese-languag...,[100th Anniversary of the Chinese Communist Pa...,143
7,Cinco de Mayo,National Day of the Republic of China,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-10-07', '2015-10-13'], 2016: ['2...","['Category:People of the 1911 Revolution', 'Ca...","['Category:1911 Revolution', 'Category:All art...","[100th Anniversary of the Republic of China, 1...",152
...,...,...,...,...,...,...,...,...,...,...,...
2062,Victory in Europe Day,Great Union Day,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-11-28', '2015-12-04'], 2016: ['2...",['Category:Railway stations in Romania opened ...,"['Category:1990 establishments in Romania', 'C...","[1918 Romanian National Assembly election, 194...",201
2063,Victory in Europe Day,Unity Day (Russia),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-11-01', '2015-11-07'], 2016: ['2...",['Category:Short description is different from...,['Category:Articles containing Russian-languag...,"[1612 (film), Christmas in Russia, Communist P...",174
2065,Victory in Europe Day,Veterans Day,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-11-08', '2015-11-14'], 2016: ['2...","['Category:1919 establishments in Alabama', 'C...",['Category:1919 establishments in the United S...,"[420 (cannabis culture), Advent Sunday, Alabam...",181
2066,Victory in Europe Day,Kargil Vijay Diwas,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-07-23', '2015-07-29'], 2016: ['2...","['Category:Annual sporting events in India', '...",['Category:All Wikipedia articles written in I...,"[Allahabad, Amar Jawan Jyoti, ISBN (identifier...",73


In [286]:
def get_total_pageviews(start, end):
    url = (
        f"https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/"
        f"en.wikipedia/all-access/all-agents/daily/{start}/{end}"
    )
    headers = {"User-Agent": "MyResearchBot/1.0 (your_email@example.com)"}
    r = requests.get(url, headers=headers)
    data = r.json()
    views = [
        {"date": item["timestamp"][:8], "total_views": item["views"]}
        for item in data["items"]
    ]
    df = pd.DataFrame(views)
    df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
    return df




In [287]:
# get total Wikipedia traffic for the same period
total_df = get_total_pageviews("20150701", "20250109")

In [944]:
# Convert df_filtered['date'] to datetime
df_filtered['date'] = pd.to_datetime(df_filtered['date'])

# Make sure total_df also has datetime
total_df['date'] = pd.to_datetime(total_df['date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['date'] = pd.to_datetime(df_filtered['date'])


In [945]:
df_filtered

Unnamed: 0.1,Unnamed: 0,article,creation_date,categories,subcategories,event_date,country,continent,event_windows,date,views,pageid,links,is_event_window
10421,10421,Cinco de Mayo,2001-07-24T05:20:12Z,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",2015-07-01,2278.0,26482586,"['420 (cannabis culture)', 'Abbotsford, Britis...",False
10422,10422,Cinco de Mayo,2001-07-24T05:20:12Z,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",2015-07-02,3179.0,26482586,"['420 (cannabis culture)', 'Abbotsford, Britis...",False
10423,10423,Cinco de Mayo,2001-07-24T05:20:12Z,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",2015-07-03,3860.0,26482586,"['420 (cannabis culture)', 'Abbotsford, Britis...",False
10424,10424,Cinco de Mayo,2001-07-24T05:20:12Z,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",2015-07-04,6243.0,26482586,"['420 (cannabis culture)', 'Abbotsford, Britis...",False
10425,10425,Cinco de Mayo,2001-07-24T05:20:12Z,"['Category:1862 establishments in Mexico', 'Ca...",['Category:Short description is different from...,May 5,Mexico,North America,"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",2015-07-05,4723.0,26482586,"['420 (cannabis culture)', 'Abbotsford, Britis...",False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781117,781117,Victory in Europe Day,2003-04-24T03:52:57Z,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...",2025-01-03,1067.0,215257,"['1985 Warsaw Victory Day Parade', '2005 Mosco...",False
781118,781118,Victory in Europe Day,2003-04-24T03:52:57Z,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...",2025-01-04,1077.0,215257,"['1985 Warsaw Victory Day Parade', '2005 Mosco...",False
781119,781119,Victory in Europe Day,2003-04-24T03:52:57Z,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...",2025-01-05,1225.0,215257,"['1985 Warsaw Victory Day Parade', '2005 Mosco...",False
781120,781120,Victory in Europe Day,2003-04-24T03:52:57Z,"['Category:1945 in Europe', 'Category:All arti...","['Category:1945 crimes in Europe', 'Category:1...",May 8,France,Europe,"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...",2025-01-06,1115.0,215257,"['1985 Warsaw Victory Day Parade', '2005 Mosco...",False


In [288]:
# merge on date
df_filtered = df_filtered.merge(total_df, on="date", how="left")

# normalize
df_filtered["normalized_views"] = df_filtered["views"] / df_filtered["total_views"]



In [289]:
df_filtered["date"] = pd.to_datetime(df_filtered["date"])

# Extract year
df_filtered["year"] = df_filtered["date"].dt.year

In [298]:
#target_views = df_filtered[['article', 'views', 'date', 'normalized_views', 'baseline_norm']]
target_views = df_filtered[['article', 'views', 'date', 'normalized_views', ]]


In [299]:
target_views = target_views.rename(columns = {"article" : "target"})

In [None]:
#pairs_df_new = pairs_filtered.merge(
    target_views[['target', 'views', 'date', 'normalized_views', 'baseline_norm']],
    on='target',
    how='left'
)

In [300]:
pairs_df_new = pairs.merge(
    target_views,
    on="target",
    how="left"
)

In [294]:
pairs_df_new = pairs_df_new.rename(columns = {"event_windows" : "source_event_windows" })

In [295]:
pairs_df_new = pd.read_csv("complete_pairs_df.csv")

In [517]:
pairs_df_new = pd.read_csv("complete_pairs_df.csv")

In [301]:
pairs_df_new


Unnamed: 0,source,target,source_window,source_subcategory,source_category,source_links,target_window,target_subcategory,target_category,target_links,min_day_diff,views,date,normalized_views
0,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,77,2016-01-01,2.858294e-07
1,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,46,2016-01-02,1.570376e-07
2,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,39,2016-01-03,1.302175e-07
3,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,48,2016-01-04,1.569148e-07
4,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,48,2016-01-05,1.598415e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6808225,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,477,2024-12-28,1.256349e-06
6808226,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,455,2024-12-29,1.146698e-06
6808227,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,474,2024-12-30,1.205486e-06
6808228,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,450,2024-12-31,1.224486e-06


In [575]:
test_row = pairs_df_new[:10002]

In [302]:
pairs_df_new

Unnamed: 0,source,target,source_window,source_subcategory,source_category,source_links,target_window,target_subcategory,target_category,target_links,min_day_diff,views,date,normalized_views
0,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,77,2016-01-01,2.858294e-07
1,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,46,2016-01-02,1.570376e-07
2,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,39,2016-01-03,1.302175e-07
3,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,48,2016-01-04,1.569148e-07
4,Cinco de Mayo,Freedom Day (South Africa),"{2015: ['2015-05-02', '2015-05-08'], 2016: ['2...",['Category:Short description is different from...,"['Category:1862 establishments in Mexico', 'Ca...","[420 (cannabis culture), Abbotsford, British C...","{2015: ['2015-04-24', '2015-04-30'], 2016: ['2...",['Category:South African companies established...,['Category:1994 establishments in South Africa...,"[1994 South African general election, Aparthei...",2,48,2016-01-05,1.598415e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6808225,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,477,2024-12-28,1.256349e-06
6808226,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,455,2024-12-29,1.146698e-06
6808227,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,474,2024-12-30,1.205486e-06
6808228,Victory in Europe Day,Victory Day (9 May),"{2015: ['2015-05-05', '2015-05-11'], 2016: ['2...","['Category:1945 crimes in Europe', 'Category:1...","['Category:1945 in Europe', 'Category:All arti...","[1985 Warsaw Victory Day Parade, 2005 Moscow V...","{2015: ['2015-05-06', '2015-05-12'], 2016: ['2...","['Category:Amerika Häuser', 'Category:Ethnic c...","['Category:2015 disestablishments in Ukraine',...","[150th Rifle Division, 154th Preobrazhensky In...",1,450,2024-12-31,1.224486e-06


In [None]:
pairs_df_new["in_source_window"] = window_df.groupby("index")["in_source_window"].any().reindex(pairs_df_new.index, fill_value=False)

In [964]:
pairs_df_new["in_source_window"].value_counts()


in_source_window
False    6686320
True      124565
Name: count, dtype: int64

In [985]:
view_flow.to_csv("view_flow_final.csv", index = False)

filter to event windows only

In [17]:
view_flow = pd.read_csv("view_flow_final.csv")

In [20]:
view_flow["source"].unique()

array(['Anzac Day', 'Armistice Day', 'Cinco de Mayo',
       'Constitution Day (Norway)', 'Festa della Repubblica',
       'Freedom Day (South Africa)', 'German Unity Day',
       'Great Union Day', 'Grito de Lares',
       'Independence Day (Bangladesh)', 'Independence Day (Brazil)',
       'Independence Day (Finland)', 'Independence Day (India)',
       'Independence Day (Indonesia)', 'Independence Day (Israel)',
       'Independence Day (Malaysia)', 'Independence Day (Nigeria)',
       'Independence Day (Pakistan)', 'Independence Day (Philippines)',
       'Independence Day (Sri Lanka)', 'Independence Day (United States)',
       'Independence Day of Ukraine', 'Jerusalem Day', 'Memorial Day',
       'National Day (Singapore)',
       'National Day for Truth and Reconciliation',
       "National Day of the People's Republic of China",
       'National Day of the Republic of China',
       'National Foundation Day (Japan)',
       'National Independence Day (Poland)',
       'National

In [971]:
in_source_window_true = pairs_df_new.loc[pairs_df_new.in_source_window]


In [974]:
pairs_df_new["year"] = pd.to_datetime(pairs_df_new["date"]).dt.year


Flag whether each row is inside the source event window

In [854]:
def in_window(row, window_col):
    year = row["year"]
    window = row[window_col]
    if pd.isna(window) or not isinstance(window, dict) or year not in window:
        return False
    start, end = pd.to_datetime(window[year])
    return start <= row["date"] <= end

pairs_df_new["in_source_window"] = pairs_df_new.apply(
    lambda row: in_window(row, "source_window"), axis=1
)


Compute the baseline

In [976]:
baseline = (
    pairs_df_new[~pairs_df_new["in_source_window"]]
    .groupby(["target", "year"])["normalized_views"]
    .mean()
    .reset_index()
    .rename(columns={"normalized_views": "baseline_norm"})
)


Attach baseline back to the datafram

In [977]:
pairs_df_new = pairs_df_new.merge(baseline, on=["target", "year"], how="left")


In [None]:
view_flow = (
    pairs_df_new[pairs_df_new["in_source_window"]]
    .groupby(["source", "target", "year"])
    .agg(
        observed_views=("normalized_views", "sum"),
        baseline_mean=("baseline_norm", "first"),  # daily baseline
        window_days=("normalized_views", "count")
    )
    .reset_index()
)

view_flow["baseline_total"] = view_flow["baseline_mean"] * view_flow["window_days"]
view_flow["view_flow"] = view_flow["observed_views"] - view_flow["baseline_total"]
