# Notebook 2: Analysis

Here is where we do the analysis portion of the homework

## Imports

In [1]:
import pandas as pd

## Data Loading

### Lets Load all the intermediate dataframes that will assist our analysis

In [2]:
## Geographic data
country_df = pd.read_csv('population_by_country.csv')
region_df = pd.read_csv('population_by_region.csv')
country_region_map_df = pd.read_csv('prb_org_regions_and_countries.csv') # described in the readme
pol_pop_df = pd.read_csv('wp_politicians_by_country.csv')

# article scores data
article_quality_df = pd.read_csv('predicted_scores.csv')
revision_ids_df = pd.read_csv('revision_ids_by_politician.csv')

#### Now we merge them into a single dataframe

 Lets deal with the article scores data first because theres less of it

In [3]:
# cleanup the column names so they match
article_quality_df.rename(columns={'Revision ID': 'revision_id',
                           'ORES Prediction': 'quality_prediction'}, inplace=True)

# Merge rev_ids_df with ores_scores on the Revision ID
pol_article_quality = pd.merge(revision_ids_df, article_quality_df, on = 'revision_id')
pol_article_quality.head()

Unnamed: 0,politician_name,revision_id,quality_prediction
0,Abdul Rahim Ayoubi,1226326055,Start
1,Amir Muhammad Akhundzada,1247931713,Start
2,Aziza Ahmadyar,1195651393,Start
3,Haroon al-Afghani,1230459615,B
4,Khadija Zahra Ahmadi,1234741562,Stub


Now lets merge that result with the politician and population dataframe `pol_pop_df`

In [4]:
politician_country_df = pol_article_quality.merge(pol_pop_df[['name', 'country', 'population_millions']], 
                                                    left_on='politician_name', 
                                                    right_on='name', 
                                                    how='left')

politician_country_df

Unnamed: 0,politician_name,revision_id,quality_prediction,name,country,population_millions
0,Abdul Rahim Ayoubi,1226326055,Start,Abdul Rahim Ayoubi,Afghanistan,42.4
1,Amir Muhammad Akhundzada,1247931713,Start,Amir Muhammad Akhundzada,Afghanistan,42.4
2,Aziza Ahmadyar,1195651393,Start,Aziza Ahmadyar,Afghanistan,42.4
3,Haroon al-Afghani,1230459615,B,Haroon al-Afghani,Afghanistan,42.4
4,Khadija Zahra Ahmadi,1234741562,Stub,Khadija Zahra Ahmadi,Afghanistan,42.4
...,...,...,...,...,...,...
5411,Denis Walker,1247902630,C,Denis Walker,Zimbabwe,16.7
5412,Herbert Ushewokunze,959111842,Stub,Herbert Ushewokunze,Zimbabwe,16.7
5413,Josiah Tongogara,1203429435,C,Josiah Tongogara,Zimbabwe,16.7
5414,Langton Towungana,1246280093,Stub,Langton Towungana,Zimbabwe,16.7


In [5]:
# Step 3: Merge the result with prb_org_regions_and_countries_df to get the region information
politician_full_df = politician_country_df.merge(country_region_map_df[['Country', 'Region']], 
                                                 left_on='country', 
                                                 right_on='Country', 
                                                 how='left')

politician_full_df

Unnamed: 0,politician_name,revision_id,quality_prediction,name,country,population_millions,Country,Region
0,Abdul Rahim Ayoubi,1226326055,Start,Abdul Rahim Ayoubi,Afghanistan,42.4,Afghanistan,South Asia
1,Amir Muhammad Akhundzada,1247931713,Start,Amir Muhammad Akhundzada,Afghanistan,42.4,Afghanistan,South Asia
2,Aziza Ahmadyar,1195651393,Start,Aziza Ahmadyar,Afghanistan,42.4,Afghanistan,South Asia
3,Haroon al-Afghani,1230459615,B,Haroon al-Afghani,Afghanistan,42.4,Afghanistan,South Asia
4,Khadija Zahra Ahmadi,1234741562,Stub,Khadija Zahra Ahmadi,Afghanistan,42.4,Afghanistan,South Asia
...,...,...,...,...,...,...,...,...
5411,Denis Walker,1247902630,C,Denis Walker,Zimbabwe,16.7,Zimbabwe,Eastern Africa
5412,Herbert Ushewokunze,959111842,Stub,Herbert Ushewokunze,Zimbabwe,16.7,Zimbabwe,Eastern Africa
5413,Josiah Tongogara,1203429435,C,Josiah Tongogara,Zimbabwe,16.7,Zimbabwe,Eastern Africa
5414,Langton Towungana,1246280093,Stub,Langton Towungana,Zimbabwe,16.7,Zimbabwe,Eastern Africa


In [6]:
# Lets make sure the formatting matches
politician_full_df["Region"] = politician_full_df["Region"].str.upper()

# rename some columns to be more specific 
politician_full_df.rename(columns={'revision_id': 'article_revision_id',
                           'population_millions': 'country_population_millions'}, inplace=True)

region_df.rename(columns={'population_millions': 'region_population_millions'}, inplace=True)

# Merge the final dataframe with population_by_region_df to add region population
final_df = politician_full_df.merge(region_df,
                                    left_on='Region',
                                    right_on='region',
                                    how='left')

Lets cleanup the final_df and remove redundant columns

In [8]:
final_df = final_df[["politician_name", "article_revision_id",
                     "quality_prediction",
                     "country", "country_population_millions",
                     "region", "region_population_millions"]]

# take a look at it all cleaned up

In [9]:
final_df

Unnamed: 0,politician_name,article_revision_id,quality_prediction,country,country_population_millions,region,region_population_millions
0,Abdul Rahim Ayoubi,1226326055,Start,Afghanistan,42.4,SOUTH ASIA,2029.0
1,Amir Muhammad Akhundzada,1247931713,Start,Afghanistan,42.4,SOUTH ASIA,2029.0
2,Aziza Ahmadyar,1195651393,Start,Afghanistan,42.4,SOUTH ASIA,2029.0
3,Haroon al-Afghani,1230459615,B,Afghanistan,42.4,SOUTH ASIA,2029.0
4,Khadija Zahra Ahmadi,1234741562,Stub,Afghanistan,42.4,SOUTH ASIA,2029.0
...,...,...,...,...,...,...,...
5411,Denis Walker,1247902630,C,Zimbabwe,16.7,EASTERN AFRICA,483.0
5412,Herbert Ushewokunze,959111842,Stub,Zimbabwe,16.7,EASTERN AFRICA,483.0
5413,Josiah Tongogara,1203429435,C,Zimbabwe,16.7,EASTERN AFRICA,483.0
5414,Langton Towungana,1246280093,Stub,Zimbabwe,16.7,EASTERN AFRICA,483.0


In [None]:
final_df.to_csv('population_by_region.csv', index=False)

## Analysis

Total articles per country and region:

In [11]:
total_articles_country = final_df.groupby('country')['article_revision_id'].count().reset_index(name='total_articles')
total_articles_region = final_df.groupby('region')['article_revision_id'].count().reset_index(name='total_articles')

In [26]:
# lets take a look: 
total_articles_country

Unnamed: 0,country,total_articles
0,Afghanistan,79
1,Albania,61
2,Algeria,64
3,Angola,29
4,Antigua and Barbuda,29
...,...,...
164,Venezuela,27
165,Vietnam,4
166,Yemen,32
167,Zambia,3


High-quality articles per country and region: Filter for high-quality articles (FA and GA):

In [12]:
# High-quality articles per country and region: Filter for high-quality articles (FA and GA):
high_quality_articles = final_df[final_df['quality_prediction'].isin(['FA', 'GA'])]
high_quality_country = high_quality_articles.groupby('country')['article_revision_id'].count().reset_index(name='high_quality_articles')
high_quality_region = high_quality_articles.groupby('region')['article_revision_id'].count().reset_index(name='high_quality_articles')


#### Articles per capita (Country):
Calculate total and high-quality articles per capita for countries:

In [36]:
df_country = final_df[['country', 'country_population_millions']].drop_duplicates()
# remove countries with pop of lower than 1 million, as we won't be able to do our calculation

country_data = total_articles_country.merge(high_quality_country, on='country', how='left')
country_data = country_data.merge(df_country, on='country', how='left')

## FILTERING
# Subset the data to exclude countries with zero total articles or where the pop is 0 (less than 1 million)
country_data = country_data[country_data['total_articles'] > 0]
country_data = country_data[country_data['country_population_millions'] > 0]

# Total articles per capita
country_data['articles_per_capita'] = country_data['total_articles'] / country_data['country_population_millions']

# High-quality articles per capita
country_data['high_quality_articles_per_capita'] = country_data['high_quality_articles'] / country_data['country_population_millions']

In [37]:
country_data.sort_values(by=['total_articles', 'country_population_millions'])

Unnamed: 0,country,total_articles,high_quality_articles,country_population_millions,articles_per_capita,high_quality_articles_per_capita
95,Malta,1,,0.6,1.666667,
83,Latvia,1,,1.9,0.526316,
111,Norway,1,,5.5,0.181818,
59,Grenada,2,,0.1,20.000000,
47,Equatorial Guinea,2,,1.7,1.176471,
...,...,...,...,...,...,...
68,Indonesia,107,15.0,278.7,0.383925,0.053821
76,Kenya,121,2.0,55.1,2.196007,0.036298
72,Italy,128,1.0,58.8,2.176871,0.017007
67,India,134,,1428.6,0.093798,


#### Articles per capita (Region):
Same as the country calculation but on the regional level:

In [34]:
df_region = final_df[['region', 'region_population_millions']].drop_duplicates()

region_data = total_articles_region.merge(high_quality_region, on='region', how='left')
region_data = region_data.merge(df_region, on='region', how='left')

## FILTERING
# Exclude regions with zero total articles
region_data = region_data[region_data['total_articles'] > 0]
# Exclude regions with a population of 0
region_data = region_data[region_data['region_population_millions'] > 0]


# Total articles per capita
region_data['articles_per_capita'] = region_data['total_articles'] / region_data['region_population_millions']

# High-quality articles per capita
region_data['high_quality_articles_per_capita'] = region_data['high_quality_articles'] / region_data['region_population_millions']

In [35]:
region_data

Unnamed: 0,region,total_articles,high_quality_articles,region_population_millions,articles_per_capita,high_quality_articles_per_capita
0,CARIBBEAN,145,2,44.0,3.295455,0.045455
1,CENTRAL AMERICA,90,4,182.0,0.494505,0.021978
2,CENTRAL ASIA,94,3,80.0,1.175,0.0375
3,EAST ASIA,185,12,1648.0,0.112257,0.007282
4,EASTERN AFRICA,641,16,483.0,1.327122,0.033126
5,EASTERN EUROPE,442,25,285.0,1.550877,0.087719
6,MIDDLE AFRICA,116,4,202.0,0.574257,0.019802
7,NORTHERN AFRICA,279,14,256.0,1.089844,0.054688
8,NORTHERN EUROPE,130,4,108.0,1.203704,0.037037
9,OCEANIA,58,1,45.0,1.288889,0.022222


## Display Results

In [38]:
top_10_countries_coverage = country_data.nlargest(10, 'articles_per_capita')
top_10_countries_coverage

Unnamed: 0,country,total_articles,high_quality_articles,country_population_millions,articles_per_capita,high_quality_articles_per_capita
4,Antigua and Barbuda,29,,0.1,290.0,
51,Federated States of Micronesia,13,,0.1,130.0,
96,Marshall Islands,12,,0.1,120.0,
12,Barbados,24,,0.3,80.0,
128,Seychelles,6,,0.1,60.0,
17,Bhutan,40,,0.8,50.0,
93,Maldives,30,1.0,0.6,50.0,1.666667
152,Tonga,5,,0.1,50.0,
141,St. Vincent and the Grenadines,4,,0.1,40.0,
89,Luxembourg,22,2.0,0.7,31.428571,2.857143


In [39]:
bottom_10_countries_coverage = country_data.nsmallest(10, 'articles_per_capita')
bottom_10_countries_coverage

Unnamed: 0,country,total_articles,high_quality_articles,country_population_millions,articles_per_capita,high_quality_articles_per_capita
31,China,12,,1411.3,0.008503,
165,Vietnam,4,,98.9,0.040445,
125,Saudi Arabia,2,,36.9,0.054201,
57,Ghana,3,1.0,34.1,0.087977,0.029326
67,India,134,,1428.6,0.093798,
167,Zambia,3,,20.2,0.148515,
111,Norway,1,,5.5,0.181818,
155,Turkey,16,,85.6,0.186916,
71,Israel,2,,9.8,0.204082,
37,Cote d'Ivoire,7,,30.9,0.226537,


In [41]:
top_10_countries_high_quality = country_data.nlargest(10, 'high_quality_articles_per_capita')
top_10_countries_high_quality

Unnamed: 0,country,total_articles,high_quality_articles,country_population_millions,articles_per_capita,high_quality_articles_per_capita
89,Luxembourg,22,2.0,0.7,31.428571,2.857143
1,Albania,61,6.0,2.7,22.592593,2.222222
79,Kosovo,16,3.0,1.7,9.411765,1.764706
93,Maldives,30,1.0,0.6,50.0,1.666667
63,Guyana,16,1.0,0.8,20.0,1.25
114,Palestinian Territory,57,6.0,5.5,10.363636,1.090909
38,Croatia,18,4.0,3.8,4.736842,1.052632
153,Trinidad and Tobago,16,1.0,1.4,11.428571,0.714286
10,Bahrain,40,1.0,1.6,25.0,0.625
144,Switzerland,68,5.0,8.8,7.727273,0.568182


In [42]:
bottom_10_countries_high_quality = country_data.nsmallest(10, 'high_quality_articles_per_capita')
bottom_10_countries_high_quality

Unnamed: 0,country,total_articles,high_quality_articles,country_population_millions,articles_per_capita,high_quality_articles_per_capita
11,Bangladesh,68,1.0,173.5,0.391931,0.005764
45,Egypt,31,1.0,105.2,0.294677,0.009506
50,Ethiopia,42,2.0,126.5,0.332016,0.01581
73,Japan,93,2.0,124.5,0.746988,0.016064
113,Pakistan,91,4.0,240.5,0.378378,0.016632
72,Italy,128,1.0,58.8,2.176871,0.017007
32,Colombia,30,1.0,52.2,0.574713,0.019157
35,Congo DR,45,2.0,102.3,0.439883,0.01955
158,Uganda,65,1.0,48.6,1.337449,0.020576
2,Algeria,64,1.0,46.8,1.367521,0.021368


In [43]:
regions_by_total_coverage = region_data.sort_values('articles_per_capita', ascending=False)
regions_by_total_coverage

Unnamed: 0,region,total_articles,high_quality_articles,region_population_millions,articles_per_capita,high_quality_articles_per_capita
0,CARIBBEAN,145,2,44.0,3.295455,0.045455
14,SOUTHERN EUROPE,395,24,152.0,2.598684,0.157895
17,WESTERN EUROPE,370,15,199.0,1.859296,0.075377
16,WESTERN ASIA,538,23,299.0,1.799331,0.076923
13,SOUTHERN AFRICA,117,7,70.0,1.671429,0.1
5,EASTERN EUROPE,442,25,285.0,1.550877,0.087719
4,EASTERN AFRICA,641,16,483.0,1.327122,0.033126
9,OCEANIA,58,1,45.0,1.288889,0.022222
8,NORTHERN EUROPE,130,4,108.0,1.203704,0.037037
2,CENTRAL ASIA,94,3,80.0,1.175,0.0375


In [44]:
regions_by_high_quality_coverage = region_data.sort_values('high_quality_articles_per_capita', ascending=False)
regions_by_high_quality_coverage

Unnamed: 0,region,total_articles,high_quality_articles,region_population_millions,articles_per_capita,high_quality_articles_per_capita
14,SOUTHERN EUROPE,395,24,152.0,2.598684,0.157895
13,SOUTHERN AFRICA,117,7,70.0,1.671429,0.1
5,EASTERN EUROPE,442,25,285.0,1.550877,0.087719
16,WESTERN ASIA,538,23,299.0,1.799331,0.076923
17,WESTERN EUROPE,370,15,199.0,1.859296,0.075377
7,NORTHERN AFRICA,279,14,256.0,1.089844,0.054688
0,CARIBBEAN,145,2,44.0,3.295455,0.045455
2,CENTRAL ASIA,94,3,80.0,1.175,0.0375
8,NORTHERN EUROPE,130,4,108.0,1.203704,0.037037
10,SOUTH AMERICA,307,15,426.0,0.720657,0.035211
