## Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.metrics import r2_score 

## Importing the Data

In [2]:
# Forward slashes are accepted by Python/pandas on Windows as well as on POSIX
# systems, whereas a backslash-separated path only resolves on Windows.
DATA_DIR = 'ECEN689-Fall2018/Challenges/3Files'

# Both CSV files are decoded as Windows-1252 (cp1252).
train_data = pd.read_csv(DATA_DIR + '/population_training_kaggle.csv', encoding='cp1252')
test_data = pd.read_csv(DATA_DIR + '/population_testing_kaggle.csv', encoding='cp1252')

## Extracting out the ID column from the test data so that we can add it to our final prediction file

In [3]:
# Set the identifiers aside before the column is removed from the features;
# they are re-attached to the prediction file at export time.
test_data_Id = test_data.loc[:, 'Id']

## Dropping the ID column from both training and testing data

In [4]:
# Rebind instead of mutating in place, and tolerate an already-removed column,
# so that re-running this cell does not raise KeyError.
train_data = train_data.drop(columns=['Id'], errors='ignore')
test_data = test_data.drop(columns=['Id'], errors='ignore')

In [5]:
# Take an independent copy: a plain assignment would only alias the same
# DataFrame, so any later in-place change to train_data would also alter the
# unscaled values that are used to convert predictions back to original units.
train_without_scaling = train_data.copy()

## Extracting out the names of the countries to be finally added to population parameters file

In [6]:
# Column labels of the training frame (the Id column has already been dropped);
# reused below as the header of the prediction and parameter files.
names = train_data.columns

## Standardizing the data (centering each column on its mean and scaling by its standard deviation) using scikit-learn's `StandardScaler`

In [7]:
# Learn the per-column mean and standard deviation from the training data only,
# then apply that same transformation to both the training and the test set.
scaler = preprocessing.StandardScaler().fit(train_data)
train = scaler.transform(train_data)
test = scaler.transform(test_data)

In [8]:
# Wrap the scaled arrays back into DataFrames so that the columns can again be
# addressed by their original labels.
train_data = pd.DataFrame(train, columns=names)
test_data = pd.DataFrame(test, columns=names)

### At this point we have standardized training and test datasets. Next we find, for each country, a value of alpha (the Lasso regularization parameter) for which the model has exactly 5 non-zero coefficients. The search starts from an initial guess `num` for alpha. If fitting with this alpha yields more than 5 non-zero coefficients, `num` is increased (stronger regularization); if it yields fewer than 5, `num` is decreased (weaker regularization); and if it yields exactly 5, `num` is recorded as the alpha for that country. This procedure is repeated for every country.

In [9]:
# The model is fitted without an intercept. `normalize` is deliberately not
# passed: False was its default value, and the parameter was removed from
# scikit-learn in version 1.2, where passing it raises TypeError.
regressor = Lasso(fit_intercept=False)
alpha_list = []

TARGET_NONZERO = 5      # required number of non-zero coefficients per column
MAX_ALPHA_STEPS = 1000  # guard: the multiplicative search below is not guaranteed to terminate

# Iterate over every column instead of a hard-coded column count.
for i in range(train_data.shape[1]):

    # Column i is the regression target; all remaining columns are predictors.
    y_train = train_data[train_data.columns[i]].values
    train = train_data.drop(train_data.columns[i], axis=1)
    x_train = train.values

    num = 0.5  # initial guess for alpha

    for _ in range(MAX_ALPHA_STEPS):
        regressor.set_params(alpha=num)
        regressor.fit(x_train, y_train)
        cnt = np.count_nonzero(regressor.coef_)
        if cnt == TARGET_NONZERO:
            alpha_list.append(num)
            break
        elif cnt > TARGET_NONZERO:
            # Too many active coefficients: regularize harder.
            num = num * 1.5
        else:
            # Too few active coefficients: relax the penalty.
            num = num * 0.5
    else:
        # Reached only when the search never hit the target; fail loudly
        # instead of spinning forever.
        raise RuntimeError(
            f"No alpha giving exactly {TARGET_NONZERO} non-zero coefficients "
            f"found for column {train_data.columns[i]!r} "
            f"after {MAX_ALPHA_STEPS} steps"
        )





























































In [10]:
# Show the alpha selected for each column, in column order.
print(alpha_list)

[0.0625, 0.03125, 0.375, 0.1875, 0.375, 0.7314722413363803, 0.125, 0.75, 0.1875, 0.15016937255859375, 0.015625, 0.31640625, 0.125, 0.25, 0.75, 0.09375, 0.2373046875, 0.29691564860476627, 0.5, 0.0333709716796875, 0.375, 0.15221817306883167, 0.0087890625, 0.11865234375, 0.6757621765136719, 0.125, 0.6328125, 0.375, 0.140625, 0.5, 0.2109375, 0.140625, 0.75, 0.375, 0.22525405883789062, 0.0889892578125, 0.140625, 0.40045166015625, 0.33788108825683594, 0.5, 0.5, 0.5, 0.533935546875, 0.31640625, 0.375, 0.375, 0.31640625, 0.25, 0.11416362980162376, 0.03125, 0.019775390625, 0.25, 0.140625, 0.03125, 0.125, 0.0625, 0.125, 0.6757621765136719, 0.533935546875, 0.43618895195255325, 0.7119140625, 0.5, 0.25, 0.31640625, 0.75, 0.7119140625, 0.13348388671875, 0.13348388671875, 0.006591796875, 0.32509877392728015, 0.125, 0.5110726034865821, 0.5625, 0.28125, 0.28125, 0.31640625, 0.1875, 0.35595703125, 0.125, 0.0625, 0.421875, 0.125, 0.09375, 0.375, 0.10546875, 0.0703125, 0.1875, 0.03125, 0.25, 0.474609375, 

### At this point we have, for each country, an alpha that yields exactly 5 non-zero coefficients. Now we train a Lasso regression model for each country using its alpha, and compute the coefficients and the test-set predictions for that country.

In [11]:
coef_final = []   # one coefficient row per target column, spanning all columns
# `normalize` is deliberately not passed: False was its default value, and the
# parameter was removed from scikit-learn in version 1.2, where passing it
# raises TypeError.
regressor = Lasso(fit_intercept=False)
pred_list = []    # test predictions per target column, back on the original scale
error_list = []   # test mean squared error per target column, on the standardized scale

# Iterate over every column instead of a hard-coded column count.
for i in range(train_data.shape[1]):

    alpha = alpha_list[i]

    # Column i is the regression target; all remaining columns are predictors.
    y_train = train_data[train_data.columns[i]].values
    train = train_data.drop(train_data.columns[i], axis=1)
    x_train = train.values

    y_test = test_data[test_data.columns[i]].values
    test = test_data.drop(test_data.columns[i], axis=1)
    x_test = test.values

    # Mean and standard deviation of the unscaled training column, needed to
    # undo the standardization on the predictions.
    col = train_without_scaling[train_without_scaling.columns[i]].values
    st = np.std(col)
    mu = np.mean(col)

    regressor.set_params(alpha=alpha)
    regressor.fit(x_train, y_train)

    prediction = regressor.predict(x_test)
    coef = regressor.coef_
    # The error is measured before de-standardizing, i.e. on the scaled data.
    error = mean_squared_error(y_test, prediction)

    prediction = prediction * st + mu

    # The model has no coefficient for its own target column; insert a zero at
    # position i so that every row lines up with the full list of columns.
    final_coef = np.insert(coef, i, 0.0)

    coef_final.append(final_coef)
    error_list.append(error)
    pred_list.append(prediction)





### Exporting out population prediction

In [20]:
# Arrange the per-column prediction vectors as columns (one row per test
# sample), label them, put the saved Id column first and write the CSV.
pred_array = np.array(pred_list).T
pred_df = pd.DataFrame(pred_array, columns=names)
pred_df.insert(0, 'Id', test_data_Id)
pred_df.to_csv('population_prediction.csv', index=False)

In [None]:
###

In [21]:
# Square coefficient table: each row holds the weights of the model fitted for
# the column named in the index. The index is written to the CSV as well.
coef_array = np.array(coef_final)
coef_df = pd.DataFrame(coef_array, index=names, columns=names)
coef_df.to_csv('population_parameters.csv')

In [14]:
# Dump the de-standardized predictions: one array per target column.
print(pred_list)

[array([ 88483.40407392,  91394.58254717,  94694.61562264,  97898.02538076,
       101043.55907436, 103971.69878173, 106618.81451167, 109052.46877417,
       111401.43189304, 113845.60475902, 116520.3843221 , 119465.8110335 ,
       122618.927667  , 125899.92748266, 129176.84340346, 132361.46748925,
       135433.41221017]), array([19994249.74741469, 20601281.83871356, 21192378.16339027,
       21754597.32660469, 22268910.54072862, 22713009.58485982,
       23054421.26725594, 23303544.20240868, 23512762.58945619,
       23714116.70422304, 23932950.96384966, 24205210.33701362,
       24522838.03167988, 24863812.73245938, 25219800.68955785,
       25593828.67828033, 25952308.76254376]), array([13845323.78289417, 14132473.80542769, 14424012.26486768,
       14721023.14004406, 15025141.2172822 , 15337283.11914402,
       15658744.1653815 , 15988936.06062154, 16324476.9938547 ,
       16660589.83899874, 16994350.70480713, 17324327.09959171,
       17652788.7634653 , 17985006.4473772 , 18328

### Computing the total prediction error (the sum of the per-country mean squared errors)

In [16]:
# Sum of the per-column test mean squared errors. Each error was computed
# before the predictions were converted back, i.e. on the standardized scale.
print(np.sum(error_list))

560.1235705246349
