# GDP Data Analysis 

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv(r'all_countries.csv')

## 🧾 General Data Overview

1.	What are the column names in the dataset?

In [3]:
df.columns

Index(['Country', 'Region', 'Population', 'Area (sq. mi.)',
       'Pop. Density (per sq. mi.)', 'Coastline (coast/area ratio)',
       'Net migration', 'Infant mortality (per 1000 births)',
       'GDP ($ per capita)', 'Literacy (%)', 'Phones (per 1000)', 'Arable (%)',
       'Crops (%)', 'Other (%)', 'Climate', 'Birthrate', 'Deathrate',
       'Agriculture', 'Industry', 'Service'],
      dtype='object')

In [4]:
df.columns = df.columns.str.replace(r'[(. /)]','_', regex=True).str.replace(r'[$]','dollar', regex=True).str.replace(r'[%]','pct', regex=True).str.replace(r'_+','_',regex=True)

In [5]:
df.columns

Index(['Country', 'Region', 'Population', 'Area_sq_mi_',
       'Pop_Density_per_sq_mi_', 'Coastline_coast_area_ratio_',
       'Net_migration', 'Infant_mortality_per_1000_births_',
       'GDP_dollar_per_capita_', 'Literacy_pct_', 'Phones_per_1000_',
       'Arable_pct_', 'Crops_pct_', 'Other_pct_', 'Climate', 'Birthrate',
       'Deathrate', 'Agriculture', 'Industry', 'Service'],
      dtype='object')

2.	How many rows and columns are present?

In [6]:
s=df.shape
print(f'Rows: {s[0]} ')
print(f'Columns: {s[1]} ')

Rows: 227 
Columns: 20 


3.	Are there any missing or null values in the dataset?

In [7]:
df.isna().sum()

Country                               0
Region                                0
Population                            0
Area_sq_mi_                           0
Pop_Density_per_sq_mi_                0
Coastline_coast_area_ratio_           0
Net_migration                         3
Infant_mortality_per_1000_births_     3
GDP_dollar_per_capita_                1
Literacy_pct_                        18
Phones_per_1000_                      4
Arable_pct_                           2
Crops_pct_                            2
Other_pct_                            2
Climate                              22
Birthrate                             3
Deathrate                             4
Agriculture                          15
Industry                             16
Service                              15
dtype: int64

4.	What are the data types of each column?

In [8]:
df.dtypes

Country                               object
Region                                object
Population                             int64
Area_sq_mi_                            int64
Pop_Density_per_sq_mi_                object
Coastline_coast_area_ratio_           object
Net_migration                         object
Infant_mortality_per_1000_births_     object
GDP_dollar_per_capita_               float64
Literacy_pct_                         object
Phones_per_1000_                      object
Arable_pct_                           object
Crops_pct_                            object
Other_pct_                            object
Climate                               object
Birthrate                             object
Deathrate                             object
Agriculture                           object
Industry                              object
Service                               object
dtype: object

In [9]:
numeric_cols = df.select_dtypes(include = np.number).columns.tolist()
categorical_cols = df.select_dtypes(include = 'object').columns.tolist()

In [10]:
print(f'\nNumeric columns: {numeric_cols}')
print(f'\nCategorical columns: {categorical_cols}')


Numeric columns: ['Population', 'Area_sq_mi_', 'GDP_dollar_per_capita_']

Categorical columns: ['Country', 'Region', 'Pop_Density_per_sq_mi_', 'Coastline_coast_area_ratio_', 'Net_migration', 'Infant_mortality_per_1000_births_', 'Literacy_pct_', 'Phones_per_1000_', 'Arable_pct_', 'Crops_pct_', 'Other_pct_', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture', 'Industry', 'Service']


In [11]:
to_convert = ['Pop_Density_per_sq_mi_', 'Coastline_coast_area_ratio_', 'Net_migration', 'Infant_mortality_per_1000_births_', 'Literacy_pct_',
              'Phones_per_1000_', 'Arable_pct_', 'Crops_pct_', 'Other_pct_', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture', 'Industry', 'Service']
df[to_convert] = df[to_convert].apply(lambda x: x.str.replace(',','.').astype(float))

In [12]:
numeric_cols = df.select_dtypes(include = np.number).columns.tolist()
categorical_cols = df.select_dtypes(include = 'object').columns.tolist()

print(f'\nNumeric columns: {numeric_cols}')
print(f'\nCategorical columns: {categorical_cols}')


Numeric columns: ['Population', 'Area_sq_mi_', 'Pop_Density_per_sq_mi_', 'Coastline_coast_area_ratio_', 'Net_migration', 'Infant_mortality_per_1000_births_', 'GDP_dollar_per_capita_', 'Literacy_pct_', 'Phones_per_1000_', 'Arable_pct_', 'Crops_pct_', 'Other_pct_', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture', 'Industry', 'Service']

Categorical columns: ['Country', 'Region']


In [13]:
df.dtypes

Country                               object
Region                                object
Population                             int64
Area_sq_mi_                            int64
Pop_Density_per_sq_mi_               float64
Coastline_coast_area_ratio_          float64
Net_migration                        float64
Infant_mortality_per_1000_births_    float64
GDP_dollar_per_capita_               float64
Literacy_pct_                        float64
Phones_per_1000_                     float64
Arable_pct_                          float64
Crops_pct_                           float64
Other_pct_                           float64
Climate                              float64
Birthrate                            float64
Deathrate                            float64
Agriculture                          float64
Industry                             float64
Service                              float64
dtype: object

5.	Which columns have numeric values and which are categorical?

In [14]:
numeric_cols = df.select_dtypes(include = np.number).columns.tolist()
categorical_cols = df.select_dtypes(include = 'object').columns.tolist()

print(f'\nNumeric columns: {numeric_cols}')
print(f'\nCategorical columns: {categorical_cols}')


Numeric columns: ['Population', 'Area_sq_mi_', 'Pop_Density_per_sq_mi_', 'Coastline_coast_area_ratio_', 'Net_migration', 'Infant_mortality_per_1000_births_', 'GDP_dollar_per_capita_', 'Literacy_pct_', 'Phones_per_1000_', 'Arable_pct_', 'Crops_pct_', 'Other_pct_', 'Climate', 'Birthrate', 'Deathrate', 'Agriculture', 'Industry', 'Service']

Categorical columns: ['Country', 'Region']


## 🌍 Geographic and Regional Analysis

6.	How many unique regions are represented in the dataset?

In [15]:
df.Region.nunique()

11

In [16]:
df.Region.value_counts().shape[0]

11

7.	Which countries belong to each region?

In [17]:
df.groupby('Region').Country.apply(list).reset_index()

Unnamed: 0,Region,Country
0,ASIA (EX. NEAR EAST),"[Afghanistan , Bangladesh , Bhutan , Brunei , ..."
1,BALTICS,"[Estonia , Latvia , Lithuania ]"
2,C.W. OF IND. STATES,"[Armenia , Azerbaijan , Belarus , Georgia , Ka..."
3,EASTERN EUROPE,"[Albania , Bosnia & Herzegovina , Bulgaria , C..."
4,LATIN AMER. & CARIB,"[Anguilla , Antigua & Barbuda , Argentina , Ar..."
5,NEAR EAST,"[Bahrain , Cyprus , Gaza Strip , Iraq , Israel..."
6,NORTHERN AFRICA,"[Algeria , Egypt , Libya , Morocco , Tunisia ,..."
7,NORTHERN AMERICA,"[Bermuda , Canada , Greenland , St Pierre & Mi..."
8,OCEANIA,"[American Samoa , Australia , Cook Islands , F..."
9,SUB-SAHARAN AFRICA,"[Angola , Benin , Botswana , Burkina Faso , Bu..."


8.	Which region has the highest number of countries?

In [18]:
df.Region.value_counts().idxmax().strip()

'SUB-SAHARAN AFRICA'

9.	What is the total population per region?

In [19]:
df.groupby('Region').Population.sum().apply(lambda x: f'{x/10000000:.2f} Cr').reset_index()

Unnamed: 0,Region,Population
0,ASIA (EX. NEAR EAST),368.80 Cr
1,BALTICS,0.72 Cr
2,C.W. OF IND. STATES,28.01 Cr
3,EASTERN EUROPE,11.99 Cr
4,LATIN AMER. & CARIB,56.18 Cr
5,NEAR EAST,19.51 Cr
6,NORTHERN AFRICA,16.14 Cr
7,NORTHERN AMERICA,33.17 Cr
8,OCEANIA,3.31 Cr
9,SUB-SAHARAN AFRICA,74.94 Cr


10.	Which region has the largest land area in total?

In [20]:
df.groupby('Region')['Area_sq_mi_'].sum().idxmax().strip()

'SUB-SAHARAN AFRICA'

## 🧑‍🤝‍🧑 Population and Density

11.	Which countries have the highest and lowest population?

In [54]:
df.iloc[df.Population.idxmax()].Country

'China '

In [55]:
df.groupby('Country').Population.max().idxmax()

'China '

In [41]:
df.iloc[df.Population.idxmin()].Country

'St Pierre & Miquelon '

In [56]:
df.groupby('Country').Population.min().idxmin()

'St Pierre & Miquelon '

12.	Which countries have the highest and lowest population density?

In [50]:
high_pd = df[df['Pop_Density_per_sq_mi_'] == df['Pop_Density_per_sq_mi_'].max()]
low_pd = df[df['Pop_Density_per_sq_mi_'] == df['Pop_Density_per_sq_mi_'].min()]
print('Highest population density: ')
high_pd[['Country','Pop_Density_per_sq_mi_']]

Highest population density: 


Unnamed: 0,Country,Pop_Density_per_sq_mi_
138,Monaco,16271.5


In [51]:
print('Lowest population density: ')
low_pd[['Country','Pop_Density_per_sq_mi_']]

Lowest population density: 


Unnamed: 0,Country,Pop_Density_per_sq_mi_
80,Greenland,0.0


13.	Is there any country with high population but low density?

In [60]:
df[(df.Population > df.Population.median()) & (df.Pop_Density_per_sq_mi_ < df.Pop_Density_per_sq_mi_.median())][['Country','Population','Pop_Density_per_sq_mi_']]

Unnamed: 0,Country,Population,Pop_Density_per_sq_mi_
0,Afghanistan,31056997,48.0
2,Algeria,32930091,13.8
5,Angola,12127071,9.7
8,Argentina,39921833,14.4
11,Australia,20264082,2.6
...,...,...,...
216,Uzbekistan,27307134,61.0
218,Venezuela,25730435,28.2
224,Yemen,21456188,40.6
225,Zambia,11502010,15.3


14.	Are there countries with similar populations but very different densities?

In [62]:
df.groupby('Population').filter(lambda x: len(x)>1)[['Country','Population','Pop_Density_per_sq_mi_']]

Unnamed: 0,Country,Population,Pop_Density_per_sq_mi_


## 🐣 Birth, Death, and Migration

15.	Which countries have the highest birth rates?

In [72]:
df.sort_values(by = 'Birthrate', ascending = False)[['Country','Birthrate']].head()

Unnamed: 0,Country,Birthrate
151,Niger,50.73
128,Mali,49.82
210,Uganda,47.35
0,Afghanistan,46.6
183,Sierra Leone,45.76


16.	Which countries have the highest death rates?

In [73]:
df.sort_values(by = 'Deathrate', ascending = False)[['Country','Deathrate']].head()

Unnamed: 0,Country,Deathrate
194,Swaziland,29.74
26,Botswana,29.5
116,Lesotho,28.71
5,Angola,24.2
117,Liberia,23.1


17.	Which countries have the highest net migration rates?

In [75]:
df.sort_values(by = 'Net_migration', ascending = False)[['Country','Net_migration']].head()

Unnamed: 0,Country,Net_migration
0,Afghanistan,23.06
38,Cayman Islands,18.75
166,Qatar,16.29
111,Kuwait,14.18
208,Turks & Caicos Is,11.68


18.	Are there any countries with negative net migration?

In [81]:
df[df['Net_migration']<0].sort_values(by = 'Net_migration')[['Country','Net_migration']].head()

Unnamed: 0,Country,Net_migration
136,"Micronesia, Fed. St.",-20.99
3,American Samoa,-20.71
81,Grenada,-13.92
56,Dominica,-13.87
37,Cape Verde,-12.07


## 💸 Economy and Services

19.	What is the range of GDP per capita across countries?

In [83]:
df.groupby('Country').agg({'GDP_dollar_per_capita_': ['max', 'min']})

Unnamed: 0_level_0,GDP_dollar_per_capita_,GDP_dollar_per_capita_
Unnamed: 0_level_1,max,min
Country,Unnamed: 1_level_2,Unnamed: 2_level_2
Afghanistan,700.0,700.0
Albania,4500.0,4500.0
Algeria,6000.0,6000.0
American Samoa,8000.0,8000.0
Andorra,19000.0,19000.0
...,...,...
West Bank,800.0,800.0
Western Sahara,,
Yemen,800.0,800.0
Zambia,800.0,800.0


20.	Which countries have the highest and lowest GDP per capita?

In [86]:
df.sort_values(by='GDP_dollar_per_capita_', ascending=False)[['Country','GDP_dollar_per_capita_']].head(1)

Unnamed: 0,Country,GDP_dollar_per_capita_
121,Luxembourg,55100.0


In [87]:
df.sort_values(by='GDP_dollar_per_capita_', ascending=True)[['Country','GDP_dollar_per_capita_']].head(1)

Unnamed: 0,Country,GDP_dollar_per_capita_
58,East Timor,500.0


21.	Are there countries with high GDP but low literacy?

In [95]:
df[(df['GDP_dollar_per_capita_']> df['GDP_dollar_per_capita_'].median()) & df['Literacy_pct_']< df['Literacy_pct_'].median()][['Country','GDP_dollar_per_capita_','Literacy_pct_']].head(10)

Unnamed: 0,Country,GDP_dollar_per_capita_,Literacy_pct_
0,Afghanistan,700.0,36.0
1,Albania,4500.0,86.5
2,Algeria,6000.0,70.0
3,American Samoa,8000.0,97.0
4,Andorra,19000.0,100.0
5,Angola,1900.0,42.0
6,Anguilla,8600.0,95.0
7,Antigua & Barbuda,11000.0,89.0
8,Argentina,11200.0,97.1
9,Armenia,3500.0,98.6


22.	Which countries have agriculture as the main contributor to their economy?

In [107]:
df[df['Agriculture'] > df[['Industry','Service']].max(axis=1)][['Country','Agriculture','Industry','Service']]

Unnamed: 0,Country,Agriculture,Industry,Service
32,Burma,0.564,0.082,0.353
33,Burundi,0.463,0.203,0.334
35,Cameroon,0.448,0.17,0.382
39,Central African Rep.,0.55,0.2,0.25
45,"Congo, Dem. Rep.",0.55,0.11,0.34
65,Ethiopia,0.475,0.099,0.426
87,Guinea-Bissau,0.62,0.12,0.26
113,Laos,0.455,0.287,0.258
117,Liberia,0.769,0.054,0.177
128,Mali,0.45,0.17,0.38


23.	Which countries are more service-oriented?

In [108]:
df[df['Service'] > df[['Industry','Agriculture']].max(axis=1)][['Country','Service','Industry','Agriculture']]

Unnamed: 0,Country,Service,Industry,Agriculture
1,Albania,0.579,0.188,0.232
6,Anguilla,0.780,0.180,0.040
7,Antigua & Barbuda,0.743,0.220,0.038
8,Argentina,0.547,0.358,0.095
9,Armenia,0.418,0.343,0.239
...,...,...,...,...
218,Venezuela,0.541,0.419,0.040
220,Virgin Islands,0.800,0.190,0.010
222,West Bank,0.630,0.280,0.090
225,Zambia,0.489,0.290,0.220


## 👶 Health and Literacy

24.	Which countries have the highest infant mortality rates?

In [112]:
df.sort_values(by = 'Infant_mortality_per_1000_births_', ascending=False)[['Country','Infant_mortality_per_1000_births_']].head(5)

Unnamed: 0,Country,Infant_mortality_per_1000_births_
5,Angola,191.19
0,Afghanistan,163.07
183,Sierra Leone,143.64
142,Mozambique,130.79
117,Liberia,128.87


25.	Are there countries with low GDP but high literacy rates?

In [120]:
df[(df['GDP_dollar_per_capita_'] < df['GDP_dollar_per_capita_'].median()) & (df['Literacy_pct_'] > df['Literacy_pct_'].median())][['Country','GDP_dollar_per_capita_','Literacy_pct_']].head(10)

Unnamed: 0,Country,GDP_dollar_per_capita_,Literacy_pct_
9,Armenia,3500.0,98.6
13,Azerbaijan,3400.0,97.0
20,Belize,4900.0,94.1
47,Cook Islands,5000.0,95.0
51,Cuba,2900.0,97.0
56,Dominica,5400.0,94.0
75,Georgia,2500.0,99.0
81,Grenada,5000.0,98.0
88,Guyana,4000.0,98.8
109,"Korea, North",1300.0,99.0


26.	Which countries have both high infant mortality and low literacy?

In [121]:
df[(df['Literacy_pct_'] < df['Literacy_pct_'].median()) & (df['Infant_mortality_per_1000_births_'] > df['Infant_mortality_per_1000_births_'].median())][['Country','Infant_mortality_per_1000_births_','Literacy_pct_']].head(10)

Unnamed: 0,Country,Infant_mortality_per_1000_births_,Literacy_pct_
0,Afghanistan,163.07,36.0
1,Albania,21.52,86.5
2,Algeria,31.0,70.0
5,Angola,191.19,42.0
16,Bangladesh,62.6,43.1
21,Benin,85.0,40.9
23,Bhutan,100.44,42.2
24,Bolivia,53.11,87.2
26,Botswana,54.58,79.8
27,Brazil,29.61,86.4


27.	What are the literacy rates per region?

In [126]:
pd.pivot_table(df, values = 'Literacy_pct_', index = 'Region')

Unnamed: 0_level_0,Literacy_pct_
Region,Unnamed: 1_level_1
ASIA (EX. NEAR EAST),79.553571
BALTICS,99.733333
C.W. OF IND. STATES,98.725
EASTERN EUROPE,97.088889
LATIN AMER. & CARIB,90.654545
NEAR EAST,79.521429
NORTHERN AFRICA,67.24
NORTHERN AMERICA,97.75
OCEANIA,88.835294
SUB-SAHARAN AFRICA,62.51


## 📱 Technology and Communication

28.	Which countries have the highest phone penetration per 1000 people?

In [129]:
df.sort_values(by = 'Phones_per_1000_', ascending= False)[['Country', 'Phones_per_1000_']].head()

Unnamed: 0,Country,Phones_per_1000_
138,Monaco,1035.6
214,United States,898.0
78,Gibraltar,877.7
22,Bermuda,851.4
85,Guernsey,842.4


29.	Are there any countries with very low phone penetration?

In [130]:
df.sort_values(by = 'Phones_per_1000_', ascending= True)[['Country', 'Phones_per_1000_']].head()

Unnamed: 0,Country,Phones_per_1000_
45,"Congo, Dem. Rep.",0.2
40,Chad,1.3
151,Niger,1.9
39,Central African Rep.,2.3
117,Liberia,2.3


30.	Is there a relationship between phone penetration and GDP?

In [134]:
df[['GDP_dollar_per_capita_','Phones_per_1000_']].corr()

Unnamed: 0,GDP_dollar_per_capita_,Phones_per_1000_
GDP_dollar_per_capita_,1.0,0.834499
Phones_per_1000_,0.834499,1.0


## 🌾 Land Usage and Climate

31.	Which countries have the highest percentage of arable land?

In [136]:
df.sort_values(by = 'Arable_pct_', ascending= False)[['Country', 'Arable_pct_']].head()

Unnamed: 0,Country,Arable_pct_
16,Bangladesh,62.11
211,Ukraine,56.21
137,Moldova,55.3
94,India,54.4
54,Denmark,54.02


32.	Are there any countries with 0% arable land?

In [139]:
df[df['Arable_pct_'] == 0][['Country','Arable_pct_']].head()

Unnamed: 0,Country,Arable_pct_
6,Anguilla,0.0
78,Gibraltar,0.0
80,Greenland,0.0
104,Jersey,0.0
122,Macau,0.0


33.	What climate types exist in the dataset and how many countries are in each?

In [145]:
df.Climate.value_counts().reset_index()

Unnamed: 0,index,Climate
0,2.0,111
1,3.0,48
2,1.0,29
3,1.5,8
4,4.0,6
5,2.5,3


34.	Do countries with more arable land tend to have higher birthrates?

In [146]:
df[['Arable_pct_','Birthrate']].corr()

Unnamed: 0,Arable_pct_,Birthrate
Arable_pct_,1.0,-0.186823
Birthrate,-0.186823,1.0


## 🧩 Data Quality and Consistency

35.	Are there any duplicate rows in the dataset?

In [147]:
df.duplicated().sum()

0

36.	Are there outliers in any numeric columns (e.g., very high population density)?

In [182]:
def has_outlier(df,col):
    q1= df[col].quantile(0.25)
    q3= df[col].quantile(0.75)
    iqr= q3 - q1
    lower_bound = q1 - 1.5*iqr
    upper_bound = q3 +  1.5*iqr
    return f'Number of outliers in {col}: {((df[col] < lower_bound) | (df[col] > upper_bound)).sum()}'

In [186]:
q1= df.Pop_Density_per_sq_mi_.quantile(0.25)
q3= df.Pop_Density_per_sq_mi_.quantile(0.75)
iqr= q3 - q1
lower_bound = q1 - 1.5*iqr
upper_bound = q3 +  1.5*iqr
((df.Pop_Density_per_sq_mi_ < lower_bound) | (df.Pop_Density_per_sq_mi_ > upper_bound)).sum()

21

In [185]:
has_outlier(df, 'Pop_Density_per_sq_mi_')

'Number of outliers in Pop_Density_per_sq_mi_: 21'

In [190]:
for col in numeric_cols:
    print(has_outlier(df, str(col)))

Number of outliers in Population: 28
Number of outliers in Area_sq_mi_: 28
Number of outliers in Pop_Density_per_sq_mi_: 21
Number of outliers in Coastline_coast_area_ratio_: 37
Number of outliers in Net_migration: 50
Number of outliers in Infant_mortality_per_1000_births_: 5
Number of outliers in GDP_dollar_per_capita_: 3
Number of outliers in Literacy_pct_: 2
Number of outliers in Phones_per_1000_: 1
Number of outliers in Arable_pct_: 10
Number of outliers in Crops_pct_: 29
Number of outliers in Other_pct_: 3
Number of outliers in Climate: 0
Number of outliers in Birthrate: 0
Number of outliers in Deathrate: 16
Number of outliers in Agriculture: 6
Number of outliers in Industry: 11
Number of outliers in Service: 0


37.	Are the percentage values in "Arable", "Crops", and "Other" columns adding up to 100%?

In [205]:
df['pct_sum'] = (df[['Arable_pct_','Crops_pct_','Other_pct_']].sum(axis=1))
print('Columns where percentage values in "Arable", "Crops", and "Other" columns do not add up to 100%: ')
df[df['pct_sum'] != 100.00][['Country','Arable_pct_','Crops_pct_','Other_pct_']]

Columns where percentage values in "Arable", "Crops", and "Other" columns do not add up to 100%: 


Unnamed: 0,Country,Arable_pct_,Crops_pct_,Other_pct_
27,Brazil,6.96,0.9,92.15
50,Croatia,26.09,2.27,71.65
85,Guernsey,,,
124,Madagascar,5.07,1.03,93.91
134,Mayotte,,,
155,Oman,0.0,0.14,99.74
225,Zambia,7.08,0.03,92.9


38.	Are all country names unique and correctly spelled/formatted?

In [207]:
df.shape[0] == df.Country.nunique()

True