# Challenge 3

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import warnings

import warnings
warnings.filterwarnings('ignore')

## Loading train/test data sets

In [2]:
train_df = pd.read_csv('population_training_kaggle.csv', encoding='cp1252')
test_df  = pd.read_csv('population_testing_kaggle.csv', encoding='cp1252')

train_data = train_df
test_data = test_df

In [3]:
print(train_data.shape)
print(test_data.shape)

(40, 259)
(17, 259)


#### Drop 'Id' column from data sets

In [4]:
train_data.drop(['Id'], axis=1,inplace=True)
train_data.head()

Unnamed: 0,Aruba,Afghanistan,Angola,Albania,Andorra,Arab World,United Arab Emirates,Argentina,Armenia,American Samoa,...,Virgin Islands (U.S.),Vietnam,Vanuatu,World,Samoa,Kosovo,"Yemen, Rep.",South Africa,Zambia,Zimbabwe
0,54211,8996351,5643182,1608800,13411,92490932,92634,20619075,1874120,20013,...,32500,32670629,63699,3032160395,108646,947000,5172135,17456855,3044846,3747369
1,55438,9166764,5753024,1659800,14375,95044497,101078,20953077,1941491,20486,...,34300,33666772,65713,3073368588,112119,966000,5260501,17920673,3140264,3870756
2,56225,9345868,5866061,1711319,15370,97682294,112472,21287682,2009526,21117,...,35000,34684165,67808,3126509808,115788,994000,5351799,18401608,3240587,3999419
3,56695,9533954,5980417,1762621,16412,100411076,125566,21621840,2077575,21882,...,39800,35722091,69964,3191786431,119561,1022000,5446063,18899275,3345145,4132756
4,57032,9731361,6093321,1814135,17469,103239902,138529,21953929,2144998,22698,...,40800,36780985,72131,3257459749,123354,1050000,5543339,19412975,3452942,4269863


In [5]:
#get list of Ids (for future reference)
id_list = test_data['Id']
test_data.drop(['Id'], axis=1, inplace=True)
test_data.head()

Unnamed: 0,Aruba,Afghanistan,Angola,Albania,Andorra,Arab World,United Arab Emirates,Argentina,Armenia,American Samoa,...,Virgin Islands (U.S.),Vietnam,Vanuatu,World,Samoa,Kosovo,"Yemen, Rep.",South Africa,Zambia,Zimbabwe
0,90853,20093756,16440924,3089027,65390,283832016,3154925,37057452,3069588,57521,...,108642,80285562,185063,6121682741,174610,1700000,17874725,45728315,10531221,12222251
1,92898,20966463,16983266,3060173,67341,289850357,3326032,37471509,3050655,58175,...,108549,81139919,189290,6201340848,175566,1701154,18390135,46385006,10824125,12366165
2,94992,21979923,17572649,3051010,70049,296026575,3507232,37889370,3033897,58731,...,108510,81956496,193956,6280531847,176582,1702310,18919179,47026173,11120409,12500525
3,97017,23064851,18203369,3039616,73182,302434519,3741932,38309379,3017806,59117,...,108506,82747662,198964,6359901308,177662,1703466,19462086,47648727,11421984,12633897
4,98737,24118979,18865716,3026939,76244,309162029,4087931,38728696,3000612,59264,...,108467,83527678,204143,6439827446,178781,1704622,20017068,48247395,11731746,12777511


#### Define range of values to be used for alpha parameter of Lasso

In [6]:
alphas = 10**np.linspace(10,1,100)*0.5
alphas

array([5.00000000e+09, 4.05565415e+09, 3.28966612e+09, 2.66834962e+09,
       2.16438064e+09, 1.75559587e+09, 1.42401793e+09, 1.15506485e+09,
       9.36908711e+08, 7.59955541e+08, 6.16423370e+08, 5.00000000e+08,
       4.05565415e+08, 3.28966612e+08, 2.66834962e+08, 2.16438064e+08,
       1.75559587e+08, 1.42401793e+08, 1.15506485e+08, 9.36908711e+07,
       7.59955541e+07, 6.16423370e+07, 5.00000000e+07, 4.05565415e+07,
       3.28966612e+07, 2.66834962e+07, 2.16438064e+07, 1.75559587e+07,
       1.42401793e+07, 1.15506485e+07, 9.36908711e+06, 7.59955541e+06,
       6.16423370e+06, 5.00000000e+06, 4.05565415e+06, 3.28966612e+06,
       2.66834962e+06, 2.16438064e+06, 1.75559587e+06, 1.42401793e+06,
       1.15506485e+06, 9.36908711e+05, 7.59955541e+05, 6.16423370e+05,
       5.00000000e+05, 4.05565415e+05, 3.28966612e+05, 2.66834962e+05,
       2.16438064e+05, 1.75559587e+05, 1.42401793e+05, 1.15506485e+05,
       9.36908711e+04, 7.59955541e+04, 6.16423370e+04, 5.00000000e+04,
      

## Computation of coefficients

In [7]:
# define empty list for final coefficients
coef_array = []

#define regularization = Lasso
reg_fit = Lasso(fit_intercept=True, normalize=True)

#list for population prediction
pred_list=[]

for curr_country in range(0,258):
   
    # label for training would be current country population
    y_training = train_data[train_data.columns[curr_country]]
    
    # dropping current country from training set
    x_training = train_data.drop(train_data.columns[[curr_country]], axis=1, inplace=False)
    
    y_testing = test_data[test_data.columns[curr_country]]
    
    # dropping current country from testing set
    x_testing = test_data.drop(test_data.columns[[curr_country]], axis=1, inplace=False)
    
    # define minimum value for error (to be used in calculating best coefficients)
    minimum = float('inf')
    best_coef = 0
    #best_coef_arr = 0
    
    for a in alphas:
        # try for all values of alpha
        reg_fit.set_params(alpha = a)
        
        # fit for training data
        reg_fit.fit(x_training, y_training)
        
        # get prediction data
        prediction = reg_fit.predict(x_testing)
        
        # get error between predicted and actual value
        error = mean_squared_error(prediction,y_testing)
        
        # count number of non-zero coeff (we want 5)
        count = np.count_nonzero(reg_fit.coef_)
        
        # compute best coefficients only if 5 possible
        if count == 5:
            # check if error less than previously computed min value
            if error < minimum:
                
                # get coefficients
                best_coef = reg_fit.coef_
                
                minimum = error
                
                # predict for these coefficients
                prediction = reg_fit.predict(x_testing)
        
                # make current country coefficient to 0
                loc_arr = best_coef
                # insert 0 at current location
                best_coef_arr = np.insert(loc_arr,curr_country,0)
    
    # append best coeff and predicted values
    coef_array.append(best_coef_arr) 
    pred_list.append(prediction)    

### Getting parameters

In [8]:
coef_final = np.array(coef_array)
print(coef_final.shape)

(258, 258)


In [9]:
# get list of countries
countries = train_df.columns

In [10]:
# convert to pd datafrane
df_coef = pd.DataFrame(data=coef_array, columns=countries, index=countries)

# dump to csv
df_coef.to_csv('population_parameters.csv', encoding='cp1252')

### Getting predictions

In [11]:
pred_array = np.array(pred_list)
print(pred_array.shape)

(258, 17)


In [12]:
# convert to pd dataframe
df = pd.DataFrame(data=pred_list)

# take transpose for accurate matching
df2 = df.transpose()

# create empty df for storing final values
df_final = pd.DataFrame(id_list, columns=['Id'])

#add cols for all other countries
for i in range(len(test_df.columns)):
    colname = test_df.columns[i]
    df_final[colname] = df2[i]

print(df_final.shape)

(17, 259)


In [13]:
df_final.head(5)

Unnamed: 0,Id,Aruba,Afghanistan,Angola,Albania,Andorra,Arab World,United Arab Emirates,Argentina,Armenia,...,Virgin Islands (U.S.),Vietnam,Vanuatu,World,Samoa,Kosovo,"Yemen, Rep.",South Africa,Zambia,Zimbabwe
0,2000,91271.637609,19933540.0,16453880.0,3115216.0,64891.339481,285735100.0,3136563.0,37236950.0,3090334.0,...,108672.258829,80945920.0,185359.971676,6156359000.0,173404.89228,1572155.0,17991640.0,46085000.0,10585210.0,12345480.0
1,2001,94394.93055,20454250.0,16821930.0,3114614.0,65995.234507,290931200.0,3259063.0,37561420.0,3041739.0,...,108378.352937,81715250.0,190072.017272,6218906000.0,173805.977337,1383665.0,18507520.0,46654610.0,10811550.0,12494300.0
2,2002,97941.754381,20990360.0,17144370.0,3133303.0,67426.226403,295535700.0,3374370.0,37866060.0,3002664.0,...,109985.508359,82450070.0,195540.799805,6280341000.0,175285.135492,1299784.0,18962470.0,47187820.0,11015510.0,12636660.0
3,2003,101668.834188,21521090.0,17449930.0,3190093.0,68974.358613,301622600.0,3487015.0,38335130.0,3002223.0,...,112124.892538,83716960.0,201395.115284,6374512000.0,177360.96519,1302740.0,19429950.0,48020850.0,11272800.0,12869810.0
4,2004,105331.431918,21948090.0,17698680.0,3237291.0,70777.876071,306490200.0,3587671.0,38702260.0,2994786.0,...,114138.199006,84694070.0,207482.935092,6451403000.0,179528.776396,1304733.0,19827370.0,48686660.0,11491660.0,13053390.0


In [14]:
# dump to csv
df_final.to_csv('population_prediction.csv',encoding='cp1252', index=False)