# Challenge 3

In [22]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
import warnings

# NOTE: `warnings` is already imported a few lines above; this repeated import is harmless.
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it presumably also hides any
# warnings raised by the Lasso fits further down — confirm that is intended.
warnings.filterwarnings('ignore')

## Loading train/test data sets

In [23]:
# Read the training and testing CSV files (both stored as cp1252), in that order.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='cp1252')
    for csv_path in ('population_training_kaggle.csv',
                     'population_testing_kaggle.csv')
)

# Working names for the two frames. These are aliases, not copies: any
# in-place change made through train_data / test_data is also visible
# through train_df / test_df.
train_data, test_data = train_df, test_df

In [24]:
# Show (rows, columns) of the training frame, then of the testing frame.
print(train_data.shape, test_data.shape, sep='\n')

(40, 259)
(17, 259)


#### Drop 'Id' column from data sets

In [25]:
# Remove the 'Id' column from the training frame.
# The drop is done in place on purpose: train_df is the same object as
# train_data, and later cells read train_df.columns expecting 'Id' to be gone.
# errors='ignore' makes the cell safe to re-run (the second run would
# otherwise raise KeyError because 'Id' has already been removed).
train_data.drop(columns=['Id'], inplace=True, errors='ignore')
train_data.head()

Unnamed: 0,Aruba,Afghanistan,Angola,Albania,Andorra,Arab World,United Arab Emirates,Argentina,Armenia,American Samoa,...,Virgin Islands (U.S.),Vietnam,Vanuatu,World,Samoa,Kosovo,"Yemen, Rep.",South Africa,Zambia,Zimbabwe
0,54211,8996351,5643182,1608800,13411,92490932,92634,20619075,1874120,20013,...,32500,32670629,63699,3032160395,108646,947000,5172135,17456855,3044846,3747369
1,55438,9166764,5753024,1659800,14375,95044497,101078,20953077,1941491,20486,...,34300,33666772,65713,3073368588,112119,966000,5260501,17920673,3140264,3870756
2,56225,9345868,5866061,1711319,15370,97682294,112472,21287682,2009526,21117,...,35000,34684165,67808,3126509808,115788,994000,5351799,18401608,3240587,3999419
3,56695,9533954,5980417,1762621,16412,100411076,125566,21621840,2077575,21882,...,39800,35722091,69964,3191786431,119561,1022000,5446063,18899275,3345145,4132756
4,57032,9731361,6093321,1814135,17469,103239902,138529,21953929,2144998,22698,...,40800,36780985,72131,3257459749,123354,1050000,5543339,19412975,3452942,4269863


In [26]:
# get list of Ids (for future reference), then remove the column in place.
# The in-place drop is deliberate: test_df is the same object as test_data and
# later cells iterate over test_df.columns expecting 'Id' to be gone.
# The guard makes the cell safe to re-run: once 'Id' has been removed, the
# id_list captured by the first run is kept instead of raising KeyError.
if 'Id' in test_data.columns:
    id_list = test_data['Id']
    test_data.drop(columns=['Id'], inplace=True)
test_data.head()

Unnamed: 0,Aruba,Afghanistan,Angola,Albania,Andorra,Arab World,United Arab Emirates,Argentina,Armenia,American Samoa,...,Virgin Islands (U.S.),Vietnam,Vanuatu,World,Samoa,Kosovo,"Yemen, Rep.",South Africa,Zambia,Zimbabwe
0,90853,20093756,16440924,3089027,65390,283832016,3154925,37057452,3069588,57521,...,108642,80285562,185063,6121682741,174610,1700000,17874725,45728315,10531221,12222251
1,92898,20966463,16983266,3060173,67341,289850357,3326032,37471509,3050655,58175,...,108549,81139919,189290,6201340848,175566,1701154,18390135,46385006,10824125,12366165
2,94992,21979923,17572649,3051010,70049,296026575,3507232,37889370,3033897,58731,...,108510,81956496,193956,6280531847,176582,1702310,18919179,47026173,11120409,12500525
3,97017,23064851,18203369,3039616,73182,302434519,3741932,38309379,3017806,59117,...,108506,82747662,198964,6359901308,177662,1703466,19462086,47648727,11421984,12633897
4,98737,24118979,18865716,3026939,76244,309162029,4087931,38728696,3000612,59264,...,108467,83527678,204143,6439827446,178781,1704622,20017068,48247395,11731746,12777511


#### Define range of values to be used for alpha parameter of Lasso

In [27]:
# Candidate regularisation strengths for Lasso: 200 values spaced evenly on a
# log10 scale from 0.5 * 10**11 down to 0.5 * 10**-2 (largest first).
# A much coarser grid (e.g. exponents 10 .. -2 with 5 points) can be swapped
# in for a quick trial run.
alphas = 0.5 * np.logspace(11, -2, num=200)
alphas

array([5.00000000e+10, 4.30173221e+10, 3.70098000e+10, 3.18412497e+10,
       2.73945059e+10, 2.35687657e+10, 2.02773037e+10, 1.74455061e+10,
       1.50091791e+10, 1.29130938e+10, 1.11097343e+10, 9.55822038e+09,
       8.22338089e+09, 7.07495649e+09, 6.08691364e+09, 5.23685449e+09,
       4.50550913e+09, 3.87629874e+09, 3.33495983e+09, 2.86922082e+09,
       2.46852393e+09, 2.12378578e+09, 1.82719154e+09, 1.57201774e+09,
       1.35247987e+09, 1.16360124e+09, 1.00110019e+09, 8.61292983e+08,
       7.41010353e+08, 6.37525620e+08, 5.48492899e+08, 4.71893914e+08,
       4.05992250e+08, 3.49293987e+08, 3.00513839e+08, 2.58546012e+08,
       2.22439142e+08, 1.91374724e+08, 1.64648563e+08, 1.41654805e+08,
       1.21872208e+08, 1.04852320e+08, 9.02093205e+07, 7.76112679e+07,
       6.67725781e+07, 5.74475500e+07, 4.94247952e+07, 4.25224467e+07,
       3.65840357e+07, 3.14749450e+07, 2.70793569e+07, 2.32976283e+07,
       2.00440316e+07, 1.72448113e+07, 1.48365120e+07, 1.27645403e+07,
      

## Computation of coefficients

In [None]:
# For every country, fit a Lasso model that predicts its population from the
# populations of all the other countries, trying every value in `alphas`.
# Among the fits that use exactly 5 non-zero coefficients, keep the one with
# the lowest mean squared error on the testing frame, and record both its
# coefficients and its predictions.

# define empty list for final coefficients (one row per target country)
coef_array = []

# define regularization = Lasso
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2; on newer versions this line needs an explicit scaling step
# instead. Left unchanged here so the results stay the same.
reg_fit = Lasso(fit_intercept=True, normalize=True)

# list for population prediction (one array per target country)
pred_list = []

# number of country columns (taken from the frame instead of a hard-coded 258)
n_countries = len(train_data.columns)

for curr_country in range(n_countries):

    # label for training would be current country population
    y_training = train_data[train_data.columns[curr_country]]

    # dropping current country from training set
    x_training = train_data.drop(train_data.columns[[curr_country]], axis=1, inplace=False)

    y_testing = test_data[test_data.columns[curr_country]]

    # dropping current country from testing set
    x_testing = test_data.drop(test_data.columns[[curr_country]], axis=1, inplace=False)

    # best candidate found so far for THIS country; reset on every iteration so
    # that nothing leaks over from the previous country
    minimum = float('inf')
    best_coef_arr = None
    best_prediction = None

    for a in alphas:
        # try for all values of alpha
        reg_fit.set_params(alpha=a)

        # fit for training data
        reg_fit.fit(x_training, y_training)

        # only fits with exactly 5 non-zero coefficients are candidates
        if np.count_nonzero(reg_fit.coef_) != 5:
            continue

        # get prediction data and its error against the actual values
        prediction = reg_fit.predict(x_testing)
        error = mean_squared_error(y_testing, prediction)

        # keep this fit only if it beats the best candidate so far
        if error < minimum:
            minimum = error

            # insert 0 at the current country's position so the row lines up
            # with the full list of country columns
            best_coef_arr = np.insert(reg_fit.coef_, curr_country, 0)

            # remember the prediction that belongs to these coefficients
            # (a later alpha must not overwrite it)
            best_prediction = prediction

    if best_coef_arr is None:
        # No alpha on the grid produced exactly 5 non-zero coefficients.
        # Record an explicit placeholder (all-zero coefficients, NaN
        # predictions) and report it, rather than silently reusing the
        # previous country's result.
        print('No 5-coefficient Lasso fit found for:', test_data.columns[curr_country])
        best_coef_arr = np.zeros(n_countries)
        best_prediction = np.full(len(x_testing), np.nan)

    # append best coeff and the matching predicted values
    coef_array.append(best_coef_arr)
    pred_list.append(best_prediction)

### Getting parameters

In [None]:
# Pack the per-country coefficient rows into one NumPy array and report its shape.
coef_final = np.asarray(coef_array)
print(coef_final.shape)

In [None]:
# get list of countries
# NOTE: train_df is the same object as train_data, so by this point the 'Id'
# column has already been dropped in place and only the country columns remain.
countries = train_df.columns

In [None]:
# Build a labelled table from the coefficient rows: each row is the model for
# one target country, each column is one predictor country.
df_coef = pd.DataFrame(coef_array, index=countries, columns=countries)

# Flip the table so target countries run along the columns.
df_final_coef = df_coef.T

# Write the coefficient table to disk.
df_final_coef.to_csv('population_parameters_1.csv', encoding='cp1252')

### Getting predictions

In [None]:
# Pack the per-country prediction arrays into one NumPy array and report its shape.
pred_array = np.asarray(pred_list)
print(pred_array.shape)

In [None]:
# One row per country, holding that country's predictions.
df = pd.DataFrame(pred_list)

# Flip it so that countries run along the columns.
df2 = df.T

# Start the output table with the Id values saved earlier.
df_final = pd.DataFrame(id_list, columns=['Id'])

# Attach each country's predictions under the country's name; the position of
# a name in test_df.columns selects the matching column of df2.
for position, colname in enumerate(test_df.columns):
    df_final[colname] = df2[position]

print(df_final.shape)

In [None]:
# Preview the first five rows of the prediction table.
df_final.head()

In [None]:
# Write the prediction table to disk, leaving out the row index.
df_final.to_csv('population_prediction_1.csv', index=False, encoding='cp1252')