# Notebook 1

This is the first notebook of the project: it performs the initial data cleaning and retrieves article information from Wikipedia.

### Imports

In [1]:
import os
import time
import json
import requests
import urllib.parse
import pandas as pd
from typing import Any, Dict, List, Optional

In [15]:
import wiki_requests as wr

## Data Loading and Cleaning

### Load the provided politicians and population datasets

In [2]:
# Read both raw inputs; the file names are resolved relative to the notebook's working directory.
politicians_df, population_df = (
    pd.read_csv(csv_name)
    for csv_name in ('politicians_by_country_AUG2024.csv',
                     'population_by_country_AUG2024.csv')
)

Display the first few rows of each dataframe to inspect structure

In [3]:
# Peek at the first five politician rows (name, url, country).
politicians_df.head(5)

Unnamed: 0,name,url,country
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan


In [4]:
# Peek at the first five population rows (Geography, Population).
population_df.head(n=5)

Unnamed: 0,Geography,Population
0,WORLD,8009.0
1,AFRICA,1453.0
2,NORTHERN AFRICA,256.0
3,Algeria,46.8
4,Egypt,105.2


### Clean & Standardize the data
We need to clean and standardize the country names in both datasets to ensure the entries are consistent.

We will also exclude cumulative regional rows (like "AFRICA", "NORTHERN AFRICA") from the country-specific analysis but retain them in a separate dataframe for regional-level analysis.

In [5]:
# Clean the population dataset.
# Aggregate rows (e.g. "WORLD", "NORTHERN AFRICA") are written entirely in upper case,
# while individual countries are not, so a single boolean mask splits the two groups.
is_region_row = population_df['Geography'].str.isupper()

# Rename columns to a consistent snake_case format. `rename` returns a new frame,
# so the raw `population_df` is left untouched.
countries_df = population_df[~is_region_row].rename(
    columns={'Geography': 'country', 'Population': 'population_millions'})
regions_df = population_df[is_region_row].rename(
    columns={'Geography': 'region', 'Population': 'population_millions'})

# The politicians dataset already names its column `country`, so it needs no renaming
# to line up with `countries_df` for the merge.

In [6]:
# Persist the interim country-level and region-level population frames as CSV files.
for interim_frame, csv_name in ((countries_df, 'population_by_country.csv'),
                                (regions_df, 'population_by_region.csv')):
    interim_frame.to_csv(csv_name, index=False)

Let’s view the intermediate cleaned dataframes

In [7]:
# Distinct country labels used by the politicians dataset, in order of first appearance.
pd.unique(politicians_df['country'])

array(['Afghanistan', 'Albania', 'Algeria', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Austria',
       'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados',
       'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan', 'Guinea-Bissau',
       'Bolivia', 'Bosnia Herzegovina', 'Botswana', 'Brazil', 'Bulgaria',
       'Burkina Faso', 'Myanmar', 'Burundi', 'Cambodia', 'Cameroon',
       'Cape Verde', 'Central African Republic', 'Chad', 'Chile', 'China',
       'Colombia', 'Comoros', 'Congo', 'Congo DR', 'Costa Rica',
       'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Djibouti',
       'Dominican Republic', 'Timor Leste', 'Ecuador', 'Egypt',
       'United Arab Emirates', 'Equatorial Guinea', 'Eritrea', 'Estonia',
       'Ethiopia', 'Finland', 'France', 'Gabon', 'Gambia', 'Germany',
       'Ghana', 'Greece', 'Grenada', 'Guatemala', 'Guinea', 'Guyana',
       'Haiti', 'Honduras', 'Hungary', 'India', 'Indonesia', 'Iran',
       'Iraq', 'Israel', 'Italy', "Cote d'Ivoire

In [8]:
# First five country-level population rows after the region rows were split out.
countries_df.head(5)

Unnamed: 0,country,population_millions
3,Algeria,46.8
4,Egypt,105.2
5,Libya,6.9
6,Morocco,37.0
7,Sudan,48.1


In [9]:
# Distinct country labels in the population dataset, for comparison with the politicians' labels.
pd.unique(countries_df["country"])

array(['Algeria', 'Egypt', 'Libya', 'Morocco', 'Sudan', 'Tunisia',
       'Western Sahara', 'Benin', 'Burkina Faso', 'Cape Verde',
       "Cote d'Ivoire", 'Gambia', 'Ghana', 'Guinea', 'GuineaBissau',
       'Liberia', 'Mali', 'Mauritania', 'Niger', 'Nigeria', 'Senegal',
       'Sierra Leone', 'Togo', 'Burundi', 'Comoros', 'Djibouti',
       'Eritrea', 'Ethiopia', 'Kenya', 'Madagascar', 'Malawi',
       'Mauritius', 'Mayotte', 'Mozambique', 'Reunion', 'Rwanda',
       'Seychelles', 'Somalia', 'South Sudan', 'Tanzania', 'Uganda',
       'Zambia', 'Zimbabwe', 'Angola', 'Cameroon',
       'Central African Republic', 'Chad', 'Congo', 'Congo DR',
       'Equatorial Guinea', 'Gabon', 'Sao Tome and Principe', 'Botswana',
       'eSwatini', 'Lesotho', 'Namibia', 'South Africa', 'Canada',
       'United States', 'Belize', 'Costa Rica', 'El Salvador',
       'Guatemala', 'Honduras', 'Mexico', 'Nicaragua', 'Panama',
       'Antigua and Barbuda', 'Bahamas', 'Barbados', 'Cuba', 'Curacao',
       'Do

One thing I'm noticing is that the politicians dataframe contains a country called 'Korean'. I'm not sure whether that means Korea (North), given the 'n' at the end of it. That might be something to keep an eye on.

In [10]:
# First five region-level (aggregate) population rows.
regions_df.head(5)

Unnamed: 0,region,population_millions
0,WORLD,8009.0
1,AFRICA,1453.0
2,NORTHERN AFRICA,256.0
10,WESTERN AFRICA,442.0
27,EASTERN AFRICA,483.0


In [11]:
# Every region label that was separated from the country rows.
pd.unique(regions_df["region"])

array(['WORLD', 'AFRICA', 'NORTHERN AFRICA', 'WESTERN AFRICA',
       'EASTERN AFRICA', 'MIDDLE AFRICA', 'SOUTHERN AFRICA',
       'NORTHERN AMERICA', 'LATIN AMERICA AND THE CARIBBEAN',
       'CENTRAL AMERICA', 'CARIBBEAN', 'SOUTH AMERICA', 'ASIA',
       'WESTERN ASIA', 'CENTRAL ASIA', 'SOUTH ASIA', 'SOUTHEAST ASIA',
       'EAST ASIA', 'EUROPE', 'NORTHERN EUROPE', 'WESTERN EUROPE',
       'EASTERN EUROPE', 'SOUTHERN EUROPE', 'OCEANIA'], dtype=object)

### Matching and merging the politicians & population datasets
We will now merge the politicians and population datasets, and log any countries from the politicians dataset that don't have a corresponding population entry.

In [12]:
# Merge the politicians dataset with the country population dataset.
# A left join keeps every politician even when the country has no population match.
pol_pop_df = pd.merge(politicians_df,
                      countries_df,
                      how='left', on='country')

# Countries from the politicians dataset without a population match end up with a
# missing `population_millions` value after the left join.
no_match_countries = pol_pop_df.loc[pol_pop_df['population_millions'].isna(), 'country'].unique()

# Save the no-match countries, one per line. The path is relative (like the other
# outputs of this notebook) so the cell runs on any machine, and the encoding is
# explicit so non-ASCII country names are written consistently across platforms.
no_match_file_path = 'wp_countries-no_match.txt'
with open(no_match_file_path, 'w', encoding='utf-8') as file:
    file.writelines(f"{country}\n" for country in no_match_countries)

# Save the cleaned and merged dataset as a CSV
pol_pop_df.to_csv('wp_politicians_by_country.csv', index=False)

Lets look at the merged dataframe

In [13]:
# Display the merged frame: one row per politician with the matched `population_millions`.
pol_pop_df

Unnamed: 0,name,url,country,population_millions
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,42.4
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,42.4
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,42.4
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,42.4
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,42.4
...,...,...,...,...
7150,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,16.7
7151,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,16.7
7152,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,16.7
7153,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,16.7


It looks like the 'Korean' entry that I was hesitant about in the politicians dataframe was logged as a no-match, so that takes care of that.

If that were a particularly important entry, then we could lookup the correct information and manually adjust it, but I think this will be fine for this project

## Retrieve Information from WIKI

### Prep

Lets start by prepping to make requests. 

We know that our article titles need to be URL encoded, so let's take care of that now by adding a column to the dataframe called `article_title` that holds the URL-encoded name.

In [16]:
# Derive the request-ready article title for every politician name.
pol_pop_df['article_title'] = pol_pop_df['name'].map(wr.prep_article_title)

In [17]:
# Display the frame again to check the newly added `article_title` column.
pol_pop_df

Unnamed: 0,name,url,country,population_millions,article_title
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,42.4,Majah_Ha_Adrif
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,42.4,Haroon_al-Afghani
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,42.4,Tayyab_Agha
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,42.4,Khadija_Zahra_Ahmadi
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,42.4,Aziza_Ahmadyar
...,...,...,...,...,...
7150,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,16.7,Josiah_Tongogara
7151,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,16.7,Langton_Towungana
7152,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,16.7,Sengezo_Tshabangu
7153,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,16.7,Herbert_Ushewokunze


### Get Page Info (and revision id)

#### Single Article

Lets look at a single article to see how the page info is structured

In [19]:
# Use the article title in the first row (index label 0) as a worked example.
example_article = pol_pop_df.loc[0, 'article_title']
example_article

'Majah_Ha_Adrif'

In [20]:
# Request the page info for the single example article and display the raw response.
# The article's latest revision id is found under query -> pages -> <pageid> -> lastrevid.
wr.request_pageinfo_per_article(example_article)

{'batchcomplete': '',
 'query': {'normalized': [{'from': 'Majah_Ha_Adrif', 'to': 'Majah Ha Adrif'}],
  'pages': {'10483286': {'pageid': 10483286,
    'ns': 0,
    'title': 'Majah Ha Adrif',
    'contentmodel': 'wikitext',
    'pagelanguage': 'en',
    'pagelanguagehtmlcode': 'en',
    'pagelanguagedir': 'ltr',
    'touched': '2024-09-30T14:32:18Z',
    'lastrevid': 1233202991,
    'length': 3188,
    'talkid': 13330265,
    'fullurl': 'https://en.wikipedia.org/wiki/Majah_Ha_Adrif',
    'editurl': 'https://en.wikipedia.org/w/index.php?title=Majah_Ha_Adrif&action=edit',
    'canonicalurl': 'https://en.wikipedia.org/wiki/Majah_Ha_Adrif'}}}}

You could loop over the call above to get the page information one article at a time, **but** that would take **~1-2 hours to run**.

Instead I would suggest running as a batch, as is shown here. 

#### Multiple Articles

You can adjust the batch size to get this to run faster but **do not exceed a batch size of 50**

In [22]:
# Collect every encoded article title as a plain Python list for the batch request.
all_article_titles = pol_pop_df["article_title"].to_list()

In [23]:
# Fetch the page info for all articles in batches.
# The batch size may be raised to speed this up, but must not exceed 50.
page_info_options = {"batch_size": 10, "output_file": "page_info_output.json"}
all_pages_info = wr.get_page_info_for_multiple_articles(all_article_titles,
                                                        **page_info_options)

Fetching page info data for: Majah_Ha_Adrif|Haroon_al-Afghani|Tayyab_Agha|Khadija_Zahra_Ahmadi|Aziza_Ahmadyar|Muqadasa_Ahmadzai|Mohammad_Sarwar_Ahmedzai|Amir_Muhammad_Akhundzada|Nasrullah_Baryalai_Arsalai|Abdul_Rahim_Ayoubi
Page info saved to page_info_output.json
Fetching page info data for: Ismael_Balkhi|Abdul_Baqi_Turkistani|Mohammad_Ghous_Bashiri|Jan_Baz|Bashir_Ahmad_Bezan|Rafiullah_Bidar|Mohammad_Siddiq_Chakari|Cheragh_Ali_Cheragh|Nasir_Ahmad_Durrani|Muhammad_Hashim_Esmatullahi
Page info saved to page_info_output.json
Fetching page info data for: Ezatullah_%28Nangarhar%29|Aimal_Faizi|Gajinder_Singh_Safri|Sharif_Ghalib|Hashmat_Ghani_Ahmadzai|Abdul_Ghani_Ghani|Ghulam_Ghaus|Ghulam_Muhammad_Ghobar|Mohammad_Gul_%28Helmand_Council%29|Sayed_Yousuf_Halim
Page info saved to page_info_output.json
Fetching page info data for: Rangina_Hamidi|Sayed_Zafar_Hashemi|Qutbuddin_Hilal|Mahboba_Hoqomal|Musa_Hotak|Mirza_Muhammad_Ismail|Sayed_Jalal|Said_Tayeb_Jawad|Sayed_Jalal_Karim|Hafizullah_Shabaz_Kha

In [25]:
## Let's take a look at the output!
# `all_pages_info` maps page-id strings to page-info dictionaries and holds thousands
# of entries, so preview only the first few instead of dumping the whole mapping.
PAGE_INFO_PREVIEW_COUNT = 3
dict(list(all_pages_info.items())[:PAGE_INFO_PREVIEW_COUNT])

{'44482763': {'pageid': 44482763,
  'ns': 0,
  'title': 'Abdul Rahim Ayoubi',
  'contentmodel': 'wikitext',
  'pagelanguage': 'en',
  'pagelanguagehtmlcode': 'en',
  'pagelanguagedir': 'ltr',
  'touched': '2024-10-14T14:00:56Z',
  'lastrevid': 1226326055,
  'length': 7313,
  'talkid': 44489967,
  'fullurl': 'https://en.wikipedia.org/wiki/Abdul_Rahim_Ayoubi',
  'editurl': 'https://en.wikipedia.org/w/index.php?title=Abdul_Rahim_Ayoubi&action=edit',
  'canonicalurl': 'https://en.wikipedia.org/wiki/Abdul_Rahim_Ayoubi'},
 '12084570': {'pageid': 12084570,
  'ns': 0,
  'title': 'Amir Muhammad Akhundzada',
  'contentmodel': 'wikitext',
  'pagelanguage': 'en',
  'pagelanguagehtmlcode': 'en',
  'pagelanguagedir': 'ltr',
  'touched': '2024-10-05T14:27:31Z',
  'lastrevid': 1247931713,
  'length': 8865,
  'talkid': 23510494,
  'fullurl': 'https://en.wikipedia.org/wiki/Amir_Muhammad_Akhundzada',
  'editurl': 'https://en.wikipedia.org/w/index.php?title=Amir_Muhammad_Akhundzada&action=edit',
  'canoni

In [26]:
# First we extract the article title and revision id from the page information dictionary.
# The helper returns a DataFrame with the columns `politician_name` and `revision_id`.
all_revision_ids = wr.extract_revision_ids_from_page_info(all_pages_info)
all_revision_ids

Unnamed: 0,politician_name,revision_id
0,Abdul Rahim Ayoubi,1226326055
1,Amir Muhammad Akhundzada,1247931713
2,Aziza Ahmadyar,1195651393
3,Haroon al-Afghani,1230459615
4,Khadija Zahra Ahmadi,1234741562
...,...,...
5413,Denis Walker,1247902630
5414,Herbert Ushewokunze,959111842
5415,Josiah Tongogara,1203429435
5416,Langton Towungana,1246280093


In [27]:
# The revision ids took a long time to collect, so keep a copy on disk as a CSV.
all_revision_ids.to_csv(path_or_buf='revision_ids_by_politician.csv', index=False)

#### Get the predicted ORES article quality score

Now lets get the article quality for a given revision id

Again lets look at 1 to get a sense of the schema

In [28]:
# Inspect the ORES response for a single article revision to learn its structure.

# Pick the revision id stored under index label 7 as the example.
example_revID = all_revision_ids.loc[7, "revision_id"]

# int(...) turns the pandas integer into a plain Python int before making the request.
wr.request_ores_score_per_article(int(example_revID))

{'enwiki': {'models': {'articlequality': {'version': '0.9.2'}},
  'scores': {'1235521766': {'articlequality': {'score': {'prediction': 'Start',
      'probability': {'B': 0.05357764010649692,
       'C': 0.1420153651045843,
       'FA': 0.004558064005040723,
       'GA': 0.017818853559952434,
       'Start': 0.7365463681382165,
       'Stub': 0.04548370908570939}}}}}}}

#### Prep the revision_ids to look up ORES prediction scores

pandas DataFrames typically store integers as NumPy `int64` (or `int32`) values, so we build a list of plain Python `int`s to pass into the function that gets the ORES prediction score for each revision id.

In [30]:
# Build the list of revision ids as plain Python ints for the ORES requests.
rev_ids_list = [int(rev_id) for rev_id in all_revision_ids["revision_id"].tolist()]

Alright we're all set up, lets run this!

**This function takes a very long time to run!**

In [29]:
# Request the ORES quality prediction for every revision id (long-running).
# NOTE(review): the output recorded for this cell is a NameError for the bare name
# `request_all_ores_scores`, which does not match this `wr.`-qualified call, and the
# cell's execution count precedes the cell that builds `rev_ids_list`. Re-run the
# notebook top to bottom so the saved output reflects this code.
ores_scores = wr.request_all_ores_scores(rev_ids_list)

NameError: name 'request_all_ores_scores' is not defined

In [51]:
# Display the ORES results: one row per revision with its predicted quality class.
ores_scores

Unnamed: 0,Revision ID,ORES Prediction
0,1226326055,Start
1,1247931713,Start
2,1195651393,Start
3,1230459615,B
4,1234741562,Stub
...,...,...
5413,1247902630,C
5414,959111842,Stub
5415,1203429435,C
5416,1246280093,Stub


Okay, let's clean up some of the dataframes' column names so we can merge them.

In [55]:
# Give the ORES columns the same snake_case names used by the other frames.
# Reassigning (instead of `inplace=True`) keeps the cell chain-friendly; re-running it
# is harmless because `rename` ignores labels that are no longer present.
ores_scores = ores_scores.rename(columns={'Revision ID': 'revision_id',
                                          'ORES Prediction': 'quality_prediction'})

In [56]:
# Build `rev_ids_df` from the revision ids extracted earlier, so this cell no longer
# depends on a variable that only existed in a previous kernel session.
# `rename` ignores labels that are absent, so the mapping is a no-op when the columns
# are already called `politician_name` / `revision_id`.
rev_ids_df = all_revision_ids.rename(columns={'revid': 'revision_id',
                                              'title': 'politician_name'})

In [57]:
# Attach the ORES quality prediction to each politician's revision id
# (inner join on the shared `revision_id` column).
rev_ores_df = rev_ids_df.merge(ores_scores, on='revision_id')
rev_ores_df

Unnamed: 0,politician_name,revision_id,quality_prediction
0,Abdul Rahim Ayoubi,1226326055,Start
1,Amir Muhammad Akhundzada,1247931713,Start
2,Aziza Ahmadyar,1195651393,Start
3,Haroon al-Afghani,1230459615,B
4,Khadija Zahra Ahmadi,1234741562,Stub
...,...,...,...
5413,Denis Walker,1247902630,C
5414,Herbert Ushewokunze,959111842,Stub
5415,Josiah Tongogara,1203429435,C
5416,Langton Towungana,1246280093,Stub


In [58]:
# Merge the politician/population frame (`pol_pop_df`) with `rev_ores_df`, matching the
# politician's `name` against the article title stored in `politician_name`.
# The default inner join keeps only politicians that have a revision id and prediction.
# A name that appears more than once in `pol_pop_df` yields one merged row per appearance.
full_merged_df = pd.merge(pol_pop_df, rev_ores_df, left_on='name', right_on='politician_name')
full_merged_df

Unnamed: 0,name,url,country,population_millions,article_title,politician_name,revision_id,quality_prediction
0,Majah Ha Adrif,https://en.wikipedia.org/wiki/Majah_Ha_Adrif,Afghanistan,42.4,Majah_Ha_Adrif,Majah Ha Adrif,1233202991,Start
1,Haroon al-Afghani,https://en.wikipedia.org/wiki/Haroon_al-Afghani,Afghanistan,42.4,Haroon_al-Afghani,Haroon al-Afghani,1230459615,B
2,Tayyab Agha,https://en.wikipedia.org/wiki/Tayyab_Agha,Afghanistan,42.4,Tayyab_Agha,Tayyab Agha,1225661708,Start
3,Khadija Zahra Ahmadi,https://en.wikipedia.org/wiki/Khadija_Zahra_Ah...,Afghanistan,42.4,Khadija_Zahra_Ahmadi,Khadija Zahra Ahmadi,1234741562,Stub
4,Aziza Ahmadyar,https://en.wikipedia.org/wiki/Aziza_Ahmadyar,Afghanistan,42.4,Aziza_Ahmadyar,Aziza Ahmadyar,1195651393,Start
...,...,...,...,...,...,...,...,...
5440,Josiah Tongogara,https://en.wikipedia.org/wiki/Josiah_Tongogara,Zimbabwe,16.7,Josiah_Tongogara,Josiah Tongogara,1203429435,C
5441,Langton Towungana,https://en.wikipedia.org/wiki/Langton_Towungana,Zimbabwe,16.7,Langton_Towungana,Langton Towungana,1246280093,Stub
5442,Sengezo Tshabangu,https://en.wikipedia.org/wiki/Sengezo_Tshabangu,Zimbabwe,16.7,Sengezo_Tshabangu,Sengezo Tshabangu,1228478288,Start
5443,Herbert Ushewokunze,https://en.wikipedia.org/wiki/Herbert_Ushewokunze,Zimbabwe,16.7,Herbert_Ushewokunze,Herbert Ushewokunze,959111842,Stub


In [None]:
# Preview the merged frame with repeated articles removed (the first occurrence is kept).
# The result is only displayed, not assigned, so `full_merged_df` itself is unchanged.
# NOTE(review): `article_title` is presumably the intended de-duplication key - confirm.
full_merged_df.drop_duplicates(subset=['article_title'])

In [59]:
# List every politician row whose article title occurs more than once
# (`keep=False` marks all occurrences, not just the repeats).
pol_pop_df[pol_pop_df.duplicated(subset=['article_title'], keep=False)]

Unnamed: 0,name,url,country,population_millions,article_title
151,Visar Ymeri,https://en.wikipedia.org/wiki/Visar_Ymeri,Albania,2.7,Visar_Ymeri
393,Hrant Maloyan,https://en.wikipedia.org/wiki/Hrant_Maloyan,Armenia,3.0,Hrant_Maloyan
424,Count Václav Antonín Chotek of Chotkov and Vojnín,https://en.wikipedia.org/wiki/Count_Václav_Ant...,Austria,9.2,Count_V%C3%A1clav_Anton%C3%ADn_Chotek_of_Chotk...
438,Eduard Hedvicek,https://en.wikipedia.org/wiki/Eduard_Hedvicek,Austria,9.2,Eduard_Hedvicek
485,"Leopold, Count von Thun und Hohenstein","https://en.wikipedia.org/wiki/Leopold,_Count_v...",Austria,9.2,Leopold%2C_Count_von_Thun_und_Hohenstein
...,...,...,...,...,...
6504,Torokul Dzhanuzakov,https://en.wikipedia.org/wiki/Torokul_Dzhanuzakov,Tajikistan,10.2,Torokul_Dzhanuzakov
6591,Yat Hwaidi,https://en.wikipedia.org/wiki/Yat_Hwaidi,Thailand,66.0,Yat_Hwaidi
6815,Sergey Abisov,https://en.wikipedia.org/wiki/Sergey_Abisov,Ukraine,36.7,Sergey_Abisov
6827,Moshe Gutman,https://en.wikipedia.org/wiki/Moshe_Gutman,Ukraine,36.7,Moshe_Gutman


In [None]:
# NOTE(review): `df` is not defined anywhere in the visible notebook and this cell has no
# execution count, so it raises NameError on a fresh run. Point it at the intended frame
# or delete the cell.
df.rename(columns={'politician_name': 'article_title'}, inplace=True)

In [None]:
# Add the revision IDs to the politician/population frame.
# `Series.map` needs a name -> revision id lookup rather than a whole DataFrame, so build
# a Series indexed by `politician_name`; repeated names are dropped first because the
# lookup index must be unique.
revision_id_by_name = (
    all_revision_ids
    .drop_duplicates(subset='politician_name')
    .set_index('politician_name')['revision_id']
)

# Write to a new frame so `pol_pop_df` is not mutated and the cell can be re-run safely.
pol_pop_rev_df = pol_pop_df.assign(revision_id=pol_pop_df['name'].map(revision_id_by_name))

# Show only the politicians for which a revision id was found.
pol_pop_rev_df[pol_pop_rev_df["revision_id"].notnull()]