<div style="text-align: center; background-color: #5A96E3; font-family: 'Trebuchet MS', Arial, sans-serif; color: white; padding: 20px; font-size: 40px; font-weight: bold; border-radius: 0 0 0 0; box-shadow: 0px 6px 8px rgba(0, 0, 0, 0.2);">
  Data Modeling
</div>

In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

In [122]:
product_df = pd.read_csv('../data/processed/preprocess_QCL.csv')
population_df = pd.read_csv('../data/processed/preprocess_OA.csv')
agri_pop_df = pd.read_csv('../data/processed/preprocess_OEA.csv')
value_df = pd.read_csv('../data/processed/preprocess_QV.csv')

In [123]:
element_df = pd.read_csv('../data/external/element_code.csv')
item_df = pd.read_csv('../data/external/item_code.csv')
indicator_df = pd.read_csv('../data/external/indicator_code.csv')

### Reorganize population_df

In [124]:
population_df = population_df.merge(element_df, how= 'left', on= 'Element Code')
population_df

Unnamed: 0,Domain Code,Area Code,Element Code,Item Code,Year,Unit,Value,Flag,Note,Element
0,OA,237,511,3010,1950,1000 No,25109.200,X,,Total Population - Both sexes (1000 No)
1,OA,237,512,3010,1950,1000 No,12408.031,X,,Total Population - Male (1000 No)
2,OA,237,513,3010,1950,1000 No,12701.169,X,,Total Population - Female (1000 No)
3,OA,237,551,3010,1950,1000 No,21921.843,X,,Rural population (1000 No)
4,OA,237,561,3010,1950,1000 No,2888.063,X,,Urban population (1000 No)
...,...,...,...,...,...,...,...,...,...,...
650,OA,237,512,3010,2099,1000 No,46560.058,X,,Total Population - Male (1000 No)
651,OA,237,513,3010,2099,1000 No,44913.397,X,,Total Population - Female (1000 No)
652,OA,237,511,3010,2100,1000 No,91036.732,X,,Total Population - Both sexes (1000 No)
653,OA,237,512,3010,2100,1000 No,46342.743,X,,Total Population - Male (1000 No)


In [125]:
population_df = population_df.pivot(index= 'Year', columns= 'Element', values= 'Value').reset_index()
population_df

Element,Year,Rural population (1000 No),Total Population - Both sexes (1000 No),Total Population - Female (1000 No),Total Population - Male (1000 No),Urban population (1000 No)
0,1950,21921.843,25109.200,12701.169,12408.031,2888.063
1,1951,22341.053,25627.533,12986.858,12640.675,3023.400
2,1952,22806.351,26189.593,13294.186,12895.406,3170.487
3,1953,23316.645,26820.851,13635.315,13185.537,3329.527
4,1954,23869.466,27507.453,13996.163,13511.290,3501.236
...,...,...,...,...,...,...
146,2096,,92769.682,45571.738,47197.944,
147,2097,,92339.695,45351.840,46987.855,
148,2098,,91907.988,45132.670,46775.318,
149,2099,,91473.455,44913.397,46560.058,


### Reorganize agri_pop_df

In [126]:
agri_pop_df = agri_pop_df.merge(indicator_df, how= 'left', on= 'Indicator Code')
agri_pop_df

Unnamed: 0,Domain Code,Area Code,Indicator Code,Sex Code,Sex,Year,Element Code,Element,Source Code,Unit,Value,Flag,Note,Indicator
0,OEA,237,21144,1,Total,1991,6199,Value,3043,1000 No,24059.27,X,,"Employment in agriculture, forestry and fishin..."
1,OEA,237,21144,1,Total,1992,6199,Value,3043,1000 No,24384.94,X,,"Employment in agriculture, forestry and fishin..."
2,OEA,237,21144,1,Total,1993,6199,Value,3043,1000 No,24641.43,X,,"Employment in agriculture, forestry and fishin..."
3,OEA,237,21144,1,Total,1994,6199,Value,3043,1000 No,24875.18,X,,"Employment in agriculture, forestry and fishin..."
4,OEA,237,21144,1,Total,1995,6199,Value,3043,1000 No,25101.97,X,,"Employment in agriculture, forestry and fishin..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1091,OEA,237,21157,3,Female,2017,6121,Value,3023,%,34.74,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...
1092,OEA,237,21157,3,Female,2018,6121,Value,3023,%,35.36,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...
1093,OEA,237,21157,3,Female,2019,6121,Value,3023,%,34.69,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...
1094,OEA,237,21157,3,Female,2020,6121,Value,3023,%,34.30,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...


-   Element and Element Code is duplicate

In [127]:
agri_pop_df = agri_pop_df.drop(columns= ['Element Code', 'Element'])
agri_pop_df

Unnamed: 0,Domain Code,Area Code,Indicator Code,Sex Code,Sex,Year,Source Code,Unit,Value,Flag,Note,Indicator
0,OEA,237,21144,1,Total,1991,3043,1000 No,24059.27,X,,"Employment in agriculture, forestry and fishin..."
1,OEA,237,21144,1,Total,1992,3043,1000 No,24384.94,X,,"Employment in agriculture, forestry and fishin..."
2,OEA,237,21144,1,Total,1993,3043,1000 No,24641.43,X,,"Employment in agriculture, forestry and fishin..."
3,OEA,237,21144,1,Total,1994,3043,1000 No,24875.18,X,,"Employment in agriculture, forestry and fishin..."
4,OEA,237,21144,1,Total,1995,3043,1000 No,25101.97,X,,"Employment in agriculture, forestry and fishin..."
...,...,...,...,...,...,...,...,...,...,...,...,...
1091,OEA,237,21157,3,Female,2017,3023,%,34.74,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...
1092,OEA,237,21157,3,Female,2018,3023,%,35.36,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...
1093,OEA,237,21157,3,Female,2019,3023,%,34.69,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...
1094,OEA,237,21157,3,Female,2020,3023,%,34.30,X,Repository: ILO-STATISTICS - Micro data proces...,Share of female employees in total employees i...


In [128]:
agri_pop_df = agri_pop_df.pivot(index= ['Year', 'Sex'], columns= 'Indicator', values= 'Value').reset_index()
agri_pop_df

Indicator,Year,Sex,Agriculture value added per worker (constant 2015 US$),"Employment in agriculture, forestry and fishing - ILO modelled estimates","Employment in agriculture, forestry and fishing by age, 15 to 24","Employment in agriculture, forestry and fishing by age, 25 to 54","Employment in agriculture, forestry and fishing by age, 55 to 64","Employment in agriculture, forestry and fishing by age, 65+","Employment in agriculture, forestry and fishing by age, total (15+)","Employment in agriculture, forestry and fishing by status of employment, employees",...,"Mean weekly hours actually worked per employed person in agriculture, forestry and fishing","Mean weekly hours actually worked per employee in agriculture, forestry and fishing","Share of employees in agriculture, forestry and fishing in total employees","Share of employment in agriculture, forestry and fishing in total employment","Share of employment in agriculture, forestry and fishing in total employment - ILO Modelled Estimates","Share of employment in crop and animal production, hunting and related service activities",Share of employment in fishing and aquaculture,Share of employment in forestry and logging,"Share of female employees in total employees in agriculture, forestry and fishing","Share of females in total employment in agriculture, forestry and fishing"
0,1991,Female,,12368.90,,,,,,,...,,,,,75.9,,,,,
1,1991,Male,,11695.29,,,,,,,...,,,,,74.2,,,,,
2,1991,Total,557.60,24059.27,,,,,,,...,,,,,75.0,,,,,
3,1992,Female,,12486.15,,,,,,,...,,,,,74.9,,,,,
4,1992,Male,,11903.21,,,,,,,...,,,,,73.4,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,2020,Male,,8859.40,985.68,5410.40,1728.14,779.40,8903.61,1181.00,...,34.12,41.60,8.00,31.6,31.5,83.11,12.39,4.49,,
89,2020,Total,2129.32,17480.94,1737.49,10675.92,3405.62,1571.74,17390.78,1796.84,...,31.85,40.25,6.92,32.6,32.6,88.28,8.17,3.54,,
90,2021,Female,,7449.14,479.25,3967.86,1474.22,608.50,6529.84,639.00,...,30.48,36.35,5.79,25.6,28.7,91.92,5.34,2.73,35.68,45.88
91,2021,Male,,8152.81,771.21,4622.64,1577.48,731.95,7703.27,1152.00,...,34.75,41.36,8.00,27.7,29.3,80.99,14.96,4.04,,


-   Look like column 3

In [129]:
raw_data = product_df[product_df['Item Code'].isin([56])] # Get data of Rice and Maize
raw_data

Unnamed: 0,Domain Code,Area Code,Element Code,Item Code,Year,Unit,Value,Flag,Note
4565,QCL,237,5312,56,1961,ha,260200.00,A,
4566,QCL,237,5419,56,1961,100 g/ha,11230.00,E,
4567,QCL,237,5510,56,1961,t,292200.00,A,
4568,QCL,237,5312,56,1962,ha,262020.00,X,Unofficial figure
4569,QCL,237,5419,56,1962,100 g/ha,11987.00,X,Unofficial figure
...,...,...,...,...,...,...,...,...,...
4743,QCL,237,5419,56,2020,100 g/ha,48513.00,A,
4744,QCL,237,5510,56,2020,t,4558107.16,A,
4745,QCL,237,5312,56,2021,ha,900673.00,A,
4746,QCL,237,5419,56,2021,100 g/ha,49367.00,A,


In [130]:
area_df = raw_data[raw_data['Element Code'] == 5312]
area_df = area_df.drop(columns=['Domain Code', 'Domain Code', 'Area Code', 'Element Code', 'Item Code', 'Unit', 'Flag', 'Note'])
area_df = area_df.rename(columns={'Value' : 'Area Harvest'})
area_df

Unnamed: 0,Year,Area Harvest
4565,1961,260200.0
4568,1962,262020.0
4571,1963,272750.0
4574,1964,257000.0
4577,1965,277400.0
...,...,...
4733,2017,1099274.0
4736,2018,1032598.0
4739,2019,985162.0
4742,2020,939563.0


In [131]:
data = raw_data[raw_data['Element Code'] == 5510]
data = data.merge(population_df, how= 'left', on= 'Year')
data = data.merge(area_df, how= 'left', on= 'Year')

In [132]:
data

Unnamed: 0,Domain Code,Area Code,Element Code,Item Code,Year,Unit,Value,Flag,Note,Rural population (1000 No),Total Population - Both sexes (1000 No),Total Population - Female (1000 No),Total Population - Male (1000 No),Urban population (1000 No),Area Harvest
0,QCL,237,5510,56,1961,t,292200.00,A,,28606.192,33621.982,17086.013,16535.970,5060.580,260200.0
1,QCL,237,5510,56,1962,t,314085.00,X,Unofficial figure,29353.433,34533.889,17548.591,16985.297,5330.732,262020.0
2,QCL,237,5510,56,1963,t,253840.00,X,Unofficial figure,30108.893,35526.727,18053.403,17473.324,5613.198,272750.0
3,QCL,237,5510,56,1964,t,319600.00,X,Unofficial figure,30872.334,36509.166,18554.801,17954.365,5908.651,257000.0
4,QCL,237,5510,56,1965,t,319100.00,A,,31643.158,37466.077,19044.946,18421.131,6216.854,277400.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,QCL,237,5510,56,2017,t,5109765.58,A,,61898.302,94033.048,47620.370,46412.679,33642.498,1099274.0
57,QCL,237,5510,56,2018,t,4874054.16,A,,61832.185,94914.330,48058.062,46856.269,34658.961,1032598.0
58,QCL,237,5510,56,2019,t,4732148.60,A,,61742.331,95776.716,48488.283,47288.433,35686.730,985162.0
59,QCL,237,5510,56,2020,t,4558107.16,A,,61632.897,96648.685,48922.527,47726.158,36727.248,939563.0


### Choose features to train

In [133]:
data = data.drop(columns= ['Domain Code', 'Area Code', 'Element Code', 'Item Code', 'Year', 'Unit', 'Year', 'Flag', 'Note'])
data

Unnamed: 0,Value,Rural population (1000 No),Total Population - Both sexes (1000 No),Total Population - Female (1000 No),Total Population - Male (1000 No),Urban population (1000 No),Area Harvest
0,292200.00,28606.192,33621.982,17086.013,16535.970,5060.580,260200.0
1,314085.00,29353.433,34533.889,17548.591,16985.297,5330.732,262020.0
2,253840.00,30108.893,35526.727,18053.403,17473.324,5613.198,272750.0
3,319600.00,30872.334,36509.166,18554.801,17954.365,5908.651,257000.0
4,319100.00,31643.158,37466.077,19044.946,18421.131,6216.854,277400.0
...,...,...,...,...,...,...,...
56,5109765.58,61898.302,94033.048,47620.370,46412.679,33642.498,1099274.0
57,4874054.16,61832.185,94914.330,48058.062,46856.269,34658.961,1032598.0
58,4732148.60,61742.331,95776.716,48488.283,47288.433,35686.730,985162.0
59,4558107.16,61632.897,96648.685,48922.527,47726.158,36727.248,939563.0


In [134]:
ratio = [0.6, 0.3, 0.1] #ratio for train, validation, and test
data = data.sample(frac= 1)
# data_train, data_validation, data_test = np.split(data, [int(ratio[0]*len(data)), int(ratio[1]*len(data))])
data_train = data.iloc[:int(ratio[0]*len(data)), :]
data_validation = data.iloc[int(ratio[0]*len(data)) : (int(ratio[0]*len(data)) + int(ratio[1]*len(data))), :]
data_test =  data.iloc[(int(ratio[0]*len(data)) + int(ratio[1]*len(data))) :, :]

In [135]:
def get_data():
    X_train = data_train.iloc[:, 1:]
    y_train = data_train.iloc[:, :-5]

    X_validation = data_validation.iloc[:, 1:]
    y_validation = data_validation.iloc[:, :-5]

    X_test = data_test.iloc[:, 1:]
    y_test = data_test.iloc[:, :-5]

    return X_train, y_train, X_validation, y_validation, X_test, y_test


In [136]:
# def main_model_2():

X_train, y_train, X_validation, y_validation,  X_test, y_test = get_data()

lr = LinearRegression()
lr.fit(X_train, y_train)

print(lr.score(X_validation, y_validation))



0.9988915298385794


In [137]:
y_pred = lr.predict(X_test)

In [138]:
mae = mean_absolute_error(y_true=y_test, y_pred= y_pred)
print(mae)

132502.6994776553
