# Load data

Our dataset contains the average GDP per capita of 227 countries from 1970 to 2017, as well as the averages of other metrics, such as infant mortality (per 1000 births) and literacy (%), for each country.

We want our model to predict a country's GDP per capita from input values for the other metrics.

In [5]:
import pandas as pd

# Path of the CSV holding one row of metrics per country
file = 'countries of the world.csv'

# decimal=',' makes pandas read a comma as the decimal mark when parsing numbers
df = pd.read_csv(
  file,
  decimal=',',
)

# Preprocessing for data analysis

In [7]:
# Fill null values with the median of the same column within the row's Region.
# (The median, not the mean, is the statistic used here.)

for col in df.columns[df.isnull().any()]:  # only columns that actually contain nulls
  # For every row, the median of `col` over that row's Region, aligned to df's index
  region_medians = df.groupby('Region')[col].transform('median')
  # Assign the filled column back to df. Writing through df[col].loc[...] is
  # chained assignment: it can modify a temporary copy and leave df untouched.
  df[col] = df[col].fillna(region_medians)

# Data Analysis and Graphs

In [8]:
# For each column after the first two (presumably Country and Region), print
# the column name, the country holding its smallest value, and that value

for column in df.columns[2:]:
  row_label = df[column].idxmin()          # index label of the row with the minimum
  country = df.loc[row_label, 'Country']   # country name stored on that row
  smallest = df[column].min()
  print(", ".join([column, country, str(smallest)]))

Population, St Pierre & Miquelon , 7026
Area (sq. mi.), Monaco , 2
Pop. Density (per sq. mi.), Greenland , 0.0
Coastline (coast/area ratio), Afghanistan , 0.0
Net migration, Micronesia, Fed. St. , -20.99
Infant mortality (per 1000 births), Singapore , 2.29
GDP ($ per capita), East Timor , 500.0
Literacy (%), Niger , 17.6
Phones (per 1000), Congo, Dem. Rep. , 0.2
Arable (%), Anguilla , 0.0
Crops (%), Andorra , 0.0
Other (%), Tonga , 33.33
Climate, Afghanistan , 1.0
Birthrate, Hong Kong , 7.29
Deathrate, N. Mariana Islands , 2.29
Agriculture, Singapore , 0.0
Industry, Jersey , 0.02
Service, Equatorial Guinea , 0.062


In [9]:
# For each column after the first two (presumably Country and Region), print
# the column name, the country holding its largest value, and that value

for column in df.columns[2:]:
  row_label = df[column].idxmax()          # index label of the row with the maximum
  country = df.loc[row_label, 'Country']   # country name stored on that row
  largest = df[column].max()
  print(", ".join([column, country, str(largest)]))

Population, China , 1313973713
Area (sq. mi.), Russia , 17075200
Pop. Density (per sq. mi.), Monaco , 16271.5
Coastline (coast/area ratio), Micronesia, Fed. St. , 870.66
Net migration, Afghanistan , 23.06
Infant mortality (per 1000 births), Angola , 191.19
GDP ($ per capita), Luxembourg , 55100.0
Literacy (%), Andorra , 100.0
Phones (per 1000), Monaco , 1035.6
Arable (%), Bangladesh , 62.11
Crops (%), Kiribati , 50.68
Other (%), Anguilla , 100.0
Climate, Armenia , 4.0
Birthrate, Niger , 50.73
Deathrate, Swaziland , 29.74
Agriculture, Liberia , 0.769
Industry, Equatorial Guinea , 0.906
Service, Cayman Islands , 0.954


In [None]:
# NOTE(review): plotly_express appears to be the legacy standalone package name;
# the same API is normally imported as plotly.express - confirm before switching
import plotly_express as px

# Correlation map of all features in the dataframe (every column after the first two).
# Coefficients with a larger magnitude indicate a stronger linear relationship
# and are a good indicator of important features in the dataset.
# For example, GDP is highly correlated with Phones per 1000 people.

corr = df[df.columns.values[2:]].corr()
fig = px.imshow(corr, text_auto=True, width=1300, height=1300)

# Show the figure explicitly: a notebook cell only renders its last expression,
# so a bare px.imshow(...) followed by another statement would not be displayed
fig.show()

In [None]:
"""
Most of the features in this dataset appear to be related to one another logarithmically rather than linearly
"""

# Preprocessing for Machine Learning

In [10]:
# One-hot encode Region: add one indicator column per region under a readable
# name, then drop the original categorical column.

# Raw region label (whitespace-stripped) -> name of the indicator column.
# Insertion order determines the order in which the columns are appended to df.
region_labels = {
  'ASIA (EX. NEAR EAST)': 'Asia',
  'EASTERN EUROPE': 'Eastern Europe',
  'NORTHERN AFRICA': 'Northern Africa',
  'OCEANIA': 'Oceania',
  'WESTERN EUROPE': 'Western Europe',
  'SUB-SAHARAN AFRICA': 'Sub-Saharan Africa',
  'LATIN AMER. & CARIB': 'Latin America and Caribbean',
  'C.W. OF IND. STATES': 'Commonwealth of Independent States',
  'NEAR EAST': 'Near East',
  'NORTHERN AMERICA': 'North America',
  'BALTICS': 'Baltics',
}

# Strip the padding around the raw Region values before encoding so the lookups
# below do not depend on the exact number of trailing spaces in the CSV
dummies = pd.get_dummies(df['Region'].str.strip())

for raw_label, column_name in region_labels.items():
  # Raises KeyError if an expected region is absent from the data
  df[column_name] = dummies[raw_label]

df.drop('Region', axis=1, inplace=True)

In [11]:
# Country cannot be used for prediction, so remove the column from df in place

df.drop(columns='Country', inplace=True)

In [12]:
# Separate the prediction target (GDP per capita) from the remaining columns,
# then split both into training and test sets

from sklearn.model_selection import train_test_split

features = df.drop(columns='GDP ($ per capita)')
target = df['GDP ($ per capita)']

# test_size=0.2 holds out 20% of the rows for testing;
# a fixed random_state keeps the split the same across runs
x_train, x_test, y_train, y_test = train_test_split(
  features,
  target,
  test_size=0.2,
  random_state=21,
)

# Decision Trees

In [None]:
# Description of model (how it works)
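# A decision tree regressor recursively splits the training data on feature thresholds and predicts the mean target value of the training samples in the leaf that a new example falls into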

# Hyperparameter tuning / feature selection

# Train model

# Model performance analysis

# Random Forests

In [None]:
# Description of model (how it works)

# Hyperparameter tuning / feature selection

# Train model

# Model performance analysis

# Stochastic Gradient Descent

In [None]:
# Description of model (how it works)

# Hyperparameter tuning / feature scaling

# Train model

# Model performance analysis