In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("us_foreign_aid_country.csv")
print(data.head(6))

  country_code country_name  region_id         Region Name income_roup  \
0          ABW        Aruba          6  Western Hemisphere         HIC   
1          ABW        Aruba          6  Western Hemisphere         HIC   
2          ABW        Aruba          6  Western Hemisphere         HIC   
3          ABW        Aruba          6  Western Hemisphere         HIC   
4          ABW        Aruba          6  Western Hemisphere         HIC   
5          ABW        Aruba          6  Western Hemisphere         HIC   

     income_group_name  transaction_type_id transaction_type_name fiscal_year  \
0  High Income Country                    2           Obligations        1999   
1  High Income Country                    2           Obligations        2000   
2  High Income Country                    2           Obligations        2004   
3  High Income Country                    2           Obligations        2005   
4  High Income Country                    2           Obligations        200

In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21973 entries, 0 to 21972
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   country_code           21914 non-null  object
 1   country_name           21973 non-null  object
 2   region_id              21973 non-null  int64 
 3   Region Name            21973 non-null  object
 4   income_roup            19764 non-null  object
 5   income_group_name      19764 non-null  object
 6   transaction_type_id    21973 non-null  int64 
 7   transaction_type_name  21973 non-null  object
 8   fiscal_year            21973 non-null  object
 9   current_amount         21973 non-null  int64 
 10  constant_amount        21973 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 1.8+ MB


In [4]:
data = data.drop(["country_code","income_group_name"],axis=1)
print(data.head(5))

  country_name  region_id         Region Name income_roup  \
0        Aruba          6  Western Hemisphere         HIC   
1        Aruba          6  Western Hemisphere         HIC   
2        Aruba          6  Western Hemisphere         HIC   
3        Aruba          6  Western Hemisphere         HIC   
4        Aruba          6  Western Hemisphere         HIC   

   transaction_type_id transaction_type_name fiscal_year  current_amount  \
0                    2           Obligations        1999           19000   
1                    2           Obligations        2000           50000   
2                    2           Obligations        2004            1000   
3                    2           Obligations        2005           29270   
4                    2           Obligations        2006            1000   

   constant_amount  
0            28303  
1            72982  
2             1346  
3            38231  
4             1266  


In [5]:
data = data.dropna()

In [6]:
data.isnull().sum()

country_name             0
region_id                0
Region Name              0
income_roup              0
transaction_type_id      0
transaction_type_name    0
fiscal_year              0
current_amount           0
constant_amount          0
dtype: int64

In [7]:
data.shape

(19764, 9)

In [8]:
data["Region Name"].value_counts()

Sub-Saharan Africa              5854
Western Hemisphere              4053
Europe and Eurasia              3680
East Asia and Oceania           2750
Middle East and North Africa    2006
South and Central Asia          1421
Name: Region Name, dtype: int64

In [9]:
data["transaction_type_name"].value_counts()

Obligations                    9737
Disbursements                  3949
Appropriated and Planned       2183
President's Budget Requests    2161
Initial Allocations            1734
Name: transaction_type_name, dtype: int64

In [10]:
Label_mapping= {"Sub-Saharan Africa":6,"Western Hemisphere":1,"Europe and Eurasia":2,"East Asia and Oceania":3,"Middle East and North Africa":4,"South and Central Asia":5}
data["Region Name"]= data["Region Name"].map(Label_mapping)

In [11]:
print(data.head(6))

  country_name  region_id  Region Name income_roup  transaction_type_id  \
0        Aruba          6            1         HIC                    2   
1        Aruba          6            1         HIC                    2   
2        Aruba          6            1         HIC                    2   
3        Aruba          6            1         HIC                    2   
4        Aruba          6            1         HIC                    2   
5        Aruba          6            1         HIC                    2   

  transaction_type_name fiscal_year  current_amount  constant_amount  
0           Obligations        1999           19000            28303  
1           Obligations        2000           50000            72982  
2           Obligations        2004            1000             1346  
3           Obligations        2005           29270            38231  
4           Obligations        2006            1000             1266  
5           Obligations        2007           97

In [12]:
data["Region Name"].value_counts().sum()

19764

In [13]:
from sklearn.preprocessing import LabelEncoder
LB = LabelEncoder()
data["transaction_type_name"] = LB.fit_transform(data["transaction_type_name"])
data["transaction_type_name"].head(6)

0    3
1    3
2    3
3    3
4    3
5    3
Name: transaction_type_name, dtype: int32

In [14]:
data["income_roup"].value_counts()

UMIC    6123
LMIC    5341
HIC     4455
LIC     3845
Name: income_roup, dtype: int64

In [15]:
data["income_roup"] = LB.fit_transform(data["income_roup"])
print(data.head(6))

  country_name  region_id  Region Name  income_roup  transaction_type_id  \
0        Aruba          6            1            0                    2   
1        Aruba          6            1            0                    2   
2        Aruba          6            1            0                    2   
3        Aruba          6            1            0                    2   
4        Aruba          6            1            0                    2   
5        Aruba          6            1            0                    2   

   transaction_type_name fiscal_year  current_amount  constant_amount  
0                      3        1999           19000            28303  
1                      3        2000           50000            72982  
2                      3        2004            1000             1346  
3                      3        2005           29270            38231  
4                      3        2006            1000             1266  
5                      3        200

In [16]:
def year_change(year):
    if year == "1976tq":
        return int(1976)
    else:
        return int(year)
    
data["fiscal_year"] = data["fiscal_year"].apply(year_change)
print(data["fiscal_year"].value_counts())

2009    832
2008    831
2011    829
2010    828
2012    821
       ... 
1946     47
1947     47
1950     44
1949     40
1948     38
Name: fiscal_year, Length: 77, dtype: int64


In [17]:
Y = data["constant_amount"]

In [18]:
X = data.drop(["constant_amount","country_name"],axis=1)

In [19]:
print(X.head(5))

   region_id  Region Name  income_roup  transaction_type_id  \
0          6            1            0                    2   
1          6            1            0                    2   
2          6            1            0                    2   
3          6            1            0                    2   
4          6            1            0                    2   

   transaction_type_name  fiscal_year  current_amount  
0                      3         1999           19000  
1                      3         2000           50000  
2                      3         2004            1000  
3                      3         2005           29270  
4                      3         2006            1000  


In [20]:
x_train,x_test,y_train,y_test = train_test_split(X,Y, test_size=0.2)

In [21]:
x_test

Unnamed: 0,region_id,Region Name,income_roup,transaction_type_id,transaction_type_name,fiscal_year,current_amount
6565,1,3,3,2,3,2005,1758144
7560,5,6,3,2,3,2013,219871
11838,5,6,1,1,0,2020,105700000
1368,5,6,1,19,2,2016,23330000
842,6,1,0,3,1,2008,106000
...,...,...,...,...,...,...,...
14960,4,5,2,2,3,2011,1947137557
16159,3,4,2,19,2,2008,217986000
5545,6,1,3,2,3,1948,600000
7280,5,6,1,18,4,2017,26160000


In [22]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19764 entries, 0 to 21972
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   region_id              19764 non-null  int64
 1   Region Name            19764 non-null  int64
 2   income_roup            19764 non-null  int32
 3   transaction_type_id    19764 non-null  int64
 4   transaction_type_name  19764 non-null  int32
 5   fiscal_year            19764 non-null  int64
 6   current_amount         19764 non-null  int64
dtypes: int32(2), int64(5)
memory usage: 1.1 MB


### **Train a Model**

In [23]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [24]:
def model_acc(model):
    model.fit(x_train,y_train)
    acc = model.score(x_test,y_test)
    print(str(model)+" : "+str(acc*100))


In [25]:
LR = LinearRegression()
lasso = Lasso()
DT = DecisionTreeRegressor()
rf = RandomForestRegressor()

model_acc(LR)
model_acc(lasso)
model_acc(DT)
model_acc(rf)

LinearRegression() : 71.5401402654487


  model = cd_fast.enet_coordinate_descent(


Lasso() : 71.54014026898209
DecisionTreeRegressor() : 99.20944750478957
RandomForestRegressor() : 98.6643512301399


In [28]:
from sklearn.model_selection import GridSearchCV
paramters = {"max_depth":[10,50,100],
             "criterion":["squared_error","absolute_error","poisson"]}


In [29]:
grid_obj = GridSearchCV(estimator=DT,param_grid=paramters)
grid_fit = grid_obj.fit(x_train,y_train)
best_model = grid_fit.best_estimator_
print(best_model)

15 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Programs\Python\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Programs\Python\lib\site-packages\sklearn\tree\_classes.py", line 1342, in fit
    super().fit(
  File "D:\Programs\Python\lib\site-packages\sklearn\tree\_classes.py", line 185, in fit
    raise ValueError(
ValueError: Some value(s) of y are negative which is not allowed for Poisson regression.

        nan        nan        nan]


DecisionTreeRegressor(criterion='absolute_error', max_depth=50)


In [30]:
accu = rf.score(x_test,y_test)
print(accu)


0.986643512301399


In [32]:
pred = best_model.predict([[6,1,0,2,3,1999,19000]])
print(pred)

[28303.]


