## 04-pandas

In [52]:
import pandas as pd
from urllib.request import urlretrieve

### Data 1 - 'countries'

In [54]:
urlretrieve('https://gist.githubusercontent.com/aakashns/28b2e504b3350afd9bdb157893f9725c/raw/994b65665757f4f8887db1c85986a897abb23d84/countries.csv', 
            'countries.csv')

df = pd.read_csv('countries.csv')

df.head(1)

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
0,Afghanistan,Asia,38928341.0,64.83,0.5,1803.987


#### Q1: How many countries does the dataframe contain?

In [10]:
df.shape[0]

210

#### Q2: Retrieve a list of continents from the dataframe?

In [16]:
# unique values
df['continent'].unique()

array(['Asia', 'Europe', 'Africa', 'North America', 'South America',
       'Oceania'], dtype=object)

#### Q3: What is the total population of all the countries listed in this dataset?

In [11]:
df['population'].sum()

7757980095.0

#### (Optional) What is the overall life expectancy across in the world?

In [31]:
x = df['life_expectancy'].sum()
y = df['life_expectancy'].count()
x/y

73.52985507246377

#### Q4: Create a dataframe containing 10 countries with the highest population.

In [30]:
df.sort_values('population', ascending=False).head(10)

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
41,China,Asia,1439324000.0,76.91,4.34,15308.712
90,India,Asia,1380004000.0,69.66,0.53,6426.674
199,United States,North America,331002600.0,78.86,2.77,54225.446
91,Indonesia,Asia,273523600.0,71.72,1.04,11188.744
145,Pakistan,Asia,220892300.0,67.27,0.6,5034.708
27,Brazil,South America,212559400.0,75.88,2.2,14103.452
141,Nigeria,Africa,206139600.0,54.69,,5338.454
15,Bangladesh,Asia,164689400.0,72.59,0.8,3523.984
157,Russia,Europe,145934500.0,72.58,8.05,24765.954
125,Mexico,North America,128932800.0,75.05,1.38,17336.469


#### Q5: Add a new column in `countries_df` to record the overall GDP per country (product of population & per capita GDP).

In [46]:
# add new column
df['countries_df'] = df['population'] * df['gdp_per_capita']
df.head(1)

Unnamed: 0,location,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,countries_df
0,Afghanistan,Asia,38928341.0,64.83,0.5,1803.987,70226220000.0


#### Q6: Create a data frame that counts the number countries in each continent?

In [47]:
df.groupby('continent')['location'].count()

continent
Africa           55
Asia             47
Europe           51
North America    36
Oceania           8
South America    13
Name: location, dtype: int64

#### Q7: Create a data frame showing the total population of each continent.

In [48]:
df.groupby('continent')['population'].sum()

continent
Africa           1.339424e+09
Asia             4.607388e+09
Europe           7.485062e+08
North America    5.912425e+08
Oceania          4.095832e+07
South America    4.304611e+08
Name: population, dtype: float64

---

### Data 2 - 'covid-countries-data'

In [59]:
urlretrieve('https://gist.githubusercontent.com/aakashns/b2a968a6cfd9fbbb0ff3d6bd0f26262b/raw/b115ed1dfa17f10fc88bf966236cd4d9032f1df8/covid-countries-data.csv', 
            'covid-countries-data.csv')

df = pd.read_csv('covid-countries-data.csv')

df.head(1)

Unnamed: 0,location,total_cases,total_deaths,total_tests
0,Afghanistan,38243.0,1409.0,


#### Q8: Count the number of countries for which the total_tests data is missing.

In [58]:
# isna for total_test
df['total_tests'].isna().sum()

122

#### Q9: Merge countries_df with covid_data_df on the location column.

In [62]:
countries_df = pd.read_csv('countries.csv')

df = merged_df = df.merge(countries_df, on='location', how='left')
df.head(1)

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita
0,Afghanistan,38243.0,1409.0,,Asia,38928341.0,64.83,0.5,1803.987


#### Q10: Add columns tests_per_million, cases_per_million and deaths_per_million into combined_df.

In [66]:
df['tests_per_million'] = df['total_tests'] / df['population'] * 1000000
df['cases_per_million'] = df['total_cases'] / df['population'] * 1000000
df['deaths_per_million'] = df['total_deaths'] / df['population'] * 1000000
df.head(1)

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,tests_per_million,cases_per_million,deaths_per_million
0,Afghanistan,38243.0,1409.0,,Asia,38928341.0,64.83,0.5,1803.987,,982.394806,36.19471


#### Q11: Create a dataframe with 10 countires that have highest number of tests per million people.

In [67]:
df.sort_values('tests_per_million', ascending=False).head(10)

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,tests_per_million,cases_per_million,deaths_per_million
198,United Arab Emirates,71540.0,387.0,7177430.0,Asia,9890400.0,77.97,1.2,67293.483,725696.635121,7233.276713,39.128852
14,Bahrain,52440.0,190.0,1118837.0,Asia,1701583.0,77.29,2.0,43290.705,657527.137965,30818.36149,111.66073
116,Luxembourg,7928.0,124.0,385820.0,Europe,625976.0,82.25,4.51,94277.965,616349.508607,12665.022301,198.090662
123,Malta,1931.0,13.0,188539.0,Europe,441539.0,82.53,4.485,36513.323,427004.183096,4373.339614,29.442473
53,Denmark,17195.0,626.0,2447911.0,Europe,5792203.0,80.9,2.5,46682.515,422621.755488,2968.645954,108.076323
97,Israel,122539.0,969.0,2353984.0,Asia,8655541.0,82.97,2.99,33132.32,271962.665303,14157.289533,111.951408
89,Iceland,2121.0,10.0,88829.0,Europe,341250.0,82.99,2.91,46482.958,260304.761905,6215.384615,29.304029
158,Russia,1005000.0,17414.0,37176827.0,Europe,145934460.0,72.58,8.05,24765.954,254750.159763,6886.653091,119.327539
200,United States,6114406.0,185744.0,83898416.0,North America,331002647.0,78.86,2.77,54225.446,253467.507769,18472.377957,561.155633
10,Australia,25923.0,663.0,6255797.0,Oceania,25499881.0,83.44,3.84,44648.71,245326.517406,1016.592979,26.000121


#### (Optional) Q: Count number of countries that feature in both the lists of "highest number of tests per million" and "highest number of cases per million".

In [73]:
df1 = df.sort_values('tests_per_million', ascending=False)
df2 = df.sort_values('cases_per_million', ascending=False).head(10)
df2

Unnamed: 0,location,total_cases,total_deaths,total_tests,continent,population,life_expectancy,hospital_beds_per_thousand,gdp_per_capita,tests_per_million,cases_per_million,deaths_per_million
156,Qatar,119206.0,199.0,634745.0,Asia,2881060.0,80.23,1.2,116935.6,220316.48074,41375.74365,69.0718
14,Bahrain,52440.0,190.0,1118837.0,Asia,1701583.0,77.29,2.0,43290.705,657527.137965,30818.36149,111.66073
148,Panama,94084.0,2030.0,336345.0,North America,4314768.0,78.51,2.3,22267.037,77952.04748,21805.112117,470.477208
40,Chile,414739.0,11344.0,2458762.0,South America,19116209.0,80.18,2.11,22767.037,128621.841287,21695.671982,593.4231
163,San Marino,735.0,42.0,,Europe,33938.0,84.97,3.8,56861.47,,21657.13949,1237.550828
9,Aruba,2211.0,12.0,,North America,106766.0,76.29,,35973.781,,20708.839893,112.395332
106,Kuwait,86478.0,535.0,621616.0,Asia,4270563.0,75.49,2.0,65530.537,145558.325682,20249.789079,125.276222
151,Peru,663437.0,29259.0,584232.0,South America,32971846.0,76.74,1.6,12236.706,17719.117092,20121.318048,887.393445
27,Brazil,3997865.0,123780.0,4797948.0,South America,212559409.0,75.88,2.2,14103.452,22572.268255,18808.224105,582.331314
200,United States,6114406.0,185744.0,83898416.0,North America,331002647.0,78.86,2.77,54225.446,253467.507769,18472.377957,561.155633
