# Deforestation Data Wrangle (RC2)

## Imports.

In [0]:
# imports.
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import seaborn as sns
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [0]:
# Read the data files.
# Every file lives in the same GitHub folder; skiprows=3 skips the first
# three lines of each CSV (presumably the World Bank preamble - confirm).
_DATA_ROOT = 'https://raw.githubusercontent.com/CVanchieri/DataSets/master/WorldBankDeforestation'
_FEATURES_ROOT = _DATA_ROOT + '/features/'

# target.
forest = pd.read_csv(_DATA_ROOT + '/target/Forestarea(%25land_area).csv', skiprows=3)

# features.
mining = pd.read_csv(_FEATURES_ROOT + 'Oresandmetalsexports(%25ofmerchandiseexports).csv', skiprows=3)
livestock = pd.read_csv(_FEATURES_ROOT + 'Livestockproductionindex(2004-2006%3D100).csv', skiprows=3)
agriculture = pd.read_csv(_FEATURES_ROOT + 'Agriculturalland(sq.km).csv', skiprows=3)
population = pd.read_csv(_FEATURES_ROOT + 'UrbanPopulationTotal.csv', skiprows=3)
gdp = pd.read_csv(_FEATURES_ROOT + 'GDPpercapitagrowth(annual%20%25).csv', skiprows=3)
electricity = pd.read_csv(_FEATURES_ROOT + 'Electricpowerconsumption(kWhpercapita).csv', skiprows=3)
crops = pd.read_csv(_FEATURES_ROOT + 'Cropproductionindex(2004-2006%3D100).csv', skiprows=3)
food = pd.read_csv(_FEATURES_ROOT + 'Foodproductionindex(2004-2006%3D100).csv', skiprows=3)

## 1st Wrangle Cleaning.

In [0]:
# 1st wrangle for cleaning.
def wrangle(df):
    """Clean one raw indicator table and reshape it to long format.

    The table must have the column layout the drops below rely on: the
    first three columns are kept (``Country Code`` must be among them and
    the indicator name is read from the third), columns at positions
    3..33 are discarded, and the remaining columns must include one
    column per year labelled ``'1990'`` .. ``'2018'`` plus ``'2019'`` and
    ``'Unnamed: 64'``, which are discarded as well.

    Args:
        df: Raw indicator DataFrame. It is left unmodified.

    Returns:
        A DataFrame with the columns ``Country Code``, ``Year`` (the year
        column labels, i.e. strings) and one value column named after the
        indicator. Rows are ordered year by year, keeping the input row
        order inside each year, on a fresh unique ``RangeIndex``.

    Raises:
        KeyError: If ``'2019'``, ``'Unnamed: 64'``, ``'Country Code'`` or
            one of the year columns is missing.
    """
    # Drop by label on a new frame instead of in place, so the caller's
    # raw table survives and a second call gives the same result.
    df = df.drop(columns=df.columns[3:34])
    df = df.drop(columns=['2019', 'Unnamed: 64'])

    # Fill gaps in each numeric column with that column's median.
    df = df.fillna(df.median(numeric_only=True))

    # Anything still missing (e.g. a column that was entirely empty) is
    # taken from the neighbouring column of the same row: next one first,
    # then the previous one.
    df = df.bfill(axis=1).ffill(axis=1)

    # The value column is named after the indicator in the first row.
    feature = df.iloc[0, 2]
    years = [str(year) for year in range(1990, 2019)]

    # One melt over all year columns gives the same rows, in the same
    # order, as melting each year separately and concatenating.
    return pd.melt(df, id_vars=['Country Code'], value_vars=years,
                   var_name='Year', value_name=feature)

In [0]:
# Wrangle the target table.
forest = wrangle(forest)

# Wrangle every feature table, in the order they are merged below.
agriculture = wrangle(agriculture)
gdp = wrangle(gdp)
livestock = wrangle(livestock)
population = wrangle(population)
crops = wrangle(crops)
food = wrangle(food)
mining = wrangle(mining)
electricity = wrangle(electricity)

# Feature tables to merge; the first one is the merge base.
feature_dfs = [
    agriculture,
    gdp,
    livestock,
    population,
    crops,
    food,
    mining,
    electricity,
]

In [0]:
# merge the data files.
def merge_features(list_dfs):
    """Merge the feature tables on ``Country Code`` and ``Year``.

    The first table is the base; every following table is merged into it
    in order with pandas' default (inner) join.

    Args:
        list_dfs: Non-empty sequence of DataFrames that each contain the
            columns ``Country Code`` and ``Year``. The sequence itself is
            not modified.

    Returns:
        The merged DataFrame.

    Raises:
        IndexError: If ``list_dfs`` is empty.
    """
    # Copy the container instead of pop()-ing from it, so the caller's
    # list still holds every table after the call.
    frames = list(list_dfs)
    train = frames[0]

    for df in frames[1:]:
        train = train.merge(df, on=['Country Code', 'Year'])

    return train

In [0]:
# Combine all feature tables, then attach the forest target on the same keys.
features = merge_features(feature_dfs)
train = pd.merge(features, forest, on=['Country Code', 'Year'])

# Quick look at the result.
print(train.shape)
train.head()

(7656, 11)


Unnamed: 0,Country Code,Year,Agricultural land (sq. km),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Electric power consumption (kWh per capita),Forest area (% of land area)
0,ABW,1990,20.0,2.09,67.49,31273.0,71.69,69.36,1.1,1237.52,2.33
1,AFG,1990,380400.0,1.32,70.69,2628554.0,66.64,68.12,2.75,1237.52,2.07
2,AGO,1990,574040.0,-6.66,70.11,4400964.0,29.25,37.91,6.22,53.17,48.91
3,ALB,1990,11210.0,-11.19,57.97,1197222.0,84.36,68.73,2.75,552.25,28.79
4,AND,1990,230.0,-0.14,67.49,51627.0,71.69,69.36,2.75,1237.52,34.04


In [0]:
# Write the 1990-2018 table to a csv file and download it from Colab.
from google.colab import files
train.to_csv(path_or_buf='WorldBank_1990_2018.csv')
files.download('WorldBank_1990_2018.csv')

## 2nd Wrangle Predictions Dataframe.

In [0]:
# 2nd wrangle to make predictions data frame.
def predicitons_df(df, start_year=2019, end_year=2120):
    """Extrapolate an indicator per country with a straight-line fit.

    For every distinct ``Country Code`` a ``LinearRegression`` is fitted
    on (year -> indicator value) and evaluated for each year from
    ``start_year`` to ``end_year``.

    Args:
        df: Long-format DataFrame containing the columns ``Country Code``
            and ``Year``; its third column holds the indicator values.
            ``Year`` and the indicator values must be convertible to
            float.
        start_year: First year to predict (inclusive).
        end_year: Last year to predict (inclusive).

    Returns:
        A DataFrame with the columns ``Country Code``, ``Year`` (the year
        as a string) and the indicator column. Rows are ordered year by
        year, with countries in order of first appearance in ``df``, on a
        fresh unique ``RangeIndex``. An input without rows yields an
        empty frame with the same three columns.

    Raises:
        ValueError: If ``end_year`` is earlier than ``start_year``.
    """
    if end_year < start_year:
        raise ValueError('end_year must not be earlier than start_year')

    feature = df.columns[2]
    future_years = range(start_year, end_year + 1)
    # Column vector of the years to predict, evaluated in one call per country.
    future_x = np.array(future_years, dtype=float).reshape(-1, 1)

    # Convert explicitly to floats (the years may arrive as strings) and
    # build from plain arrays so a non-unique index on df cannot matter.
    history = pd.DataFrame({
        'code': df['Country Code'].to_numpy(),
        'year': df['Year'].astype(float).to_numpy(),
        'value': df[feature].astype(float).to_numpy(),
    })

    model = LinearRegression()
    codes = []
    forecasts = []
    # sort=False keeps the countries in order of first appearance.
    for code, group in history.groupby('code', sort=False):
        # Fit and predict on bare arrays so both calls see the same kind
        # of input (no feature-name mismatch).
        model.fit(group[['year']].to_numpy(), group['value'].to_numpy())
        codes.append(code)
        forecasts.append(model.predict(future_x))

    if not codes:
        return pd.DataFrame(columns=['Country Code', 'Year', feature])

    # Rows = countries, columns = years; transposing before flattening
    # gives the year-by-year row order.
    forecast_matrix = np.vstack(forecasts)
    year_labels = np.array([str(year) for year in future_years], dtype=object)

    return pd.DataFrame({
        'Country Code': np.tile(np.array(codes, dtype=object), len(future_years)),
        'Year': np.repeat(year_labels, len(codes)),
        feature: forecast_matrix.T.ravel(),
    })

In [0]:
# Build the prediction table for the target.
forest_pred = predicitons_df(forest)

# Build the prediction table for every feature, in the order they are merged below.
agriculture_pred = predicitons_df(agriculture)
gdp_pred = predicitons_df(gdp)
livestock_pred = predicitons_df(livestock)
population_pred = predicitons_df(population)
crops_pred = predicitons_df(crops)
food_pred = predicitons_df(food)
mining_pred = predicitons_df(mining)
electricity_pred = predicitons_df(electricity)

# Predicted feature tables to merge; the first one is the merge base.
feature_dfs_pred = [
    agriculture_pred,
    gdp_pred,
    livestock_pred,
    population_pred,
    crops_pred,
    food_pred,
    mining_pred,
    electricity_pred,
]

In [0]:
# merge the data files.
def merge_pred_features(list_dfs_pred):
    """Merge the predicted feature tables on ``Country Code`` and ``Year``.

    The first table is the base; every following table is merged into it
    in order with pandas' default (inner) join.

    Args:
        list_dfs_pred: Non-empty sequence of DataFrames that each contain
            the columns ``Country Code`` and ``Year``. The sequence
            itself is not modified.

    Returns:
        The merged DataFrame.

    Raises:
        IndexError: If ``list_dfs_pred`` is empty.
    """
    # Copy the container instead of pop()-ing from it, so the caller's
    # list still holds every table after the call.
    frames = list(list_dfs_pred)
    test = frames[0]

    for df in frames[1:]:
        test = test.merge(df, on=['Country Code', 'Year'])

    return test

In [0]:
# Combine all predicted feature tables, then attach the predicted forest target on the same keys.
features = merge_pred_features(feature_dfs_pred)
test = pd.merge(features, forest_pred, on=['Country Code', 'Year'])

# Quick look at the result.
print(test.shape)
test.head()

(26928, 11)


Unnamed: 0,Country Code,Year,Agricultural land (sq. km),GDP per capita growth (annual %),Livestock production index (2004-2006 = 100),Urban population,Crop production index (2004-2006 = 100),Food production index (2004-2006 = 100),Ores and metals exports (% of merchandise exports),Electric power consumption (kWh per capita),Forest area (% of land area)
0,ABW,2019,20.0,-0.75,123.79,48057.97,123.08,125.53,6.37,2712.14,2.33
1,AFG,2019,378959.53,3.31,108.26,9262840.16,148.75,130.08,2.05,2712.14,2.07
2,AGO,2019,591467.93,2.39,146.48,19384636.41,226.44,208.66,2.28,308.43,46.05
3,ALB,2019,11904.29,6.65,120.6,1709176.32,178.59,151.17,12.88,2608.4,28.14
4,AND,2019,184.99,0.81,123.79,75938.01,123.08,125.53,3.59,2712.14,34.04


In [0]:
# Write the 2019-2120 predictions table to a csv file and download it from Colab.
from google.colab import files
test.to_csv(path_or_buf='WorldBank_2019_2120.csv')
files.download('WorldBank_2019_2120.csv')