<a href="https://colab.research.google.com/github/ArsalanKhan17/World-Happiness/blob/main/Predicting_Happiness_Mini_Hackathon_for_tabular_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Objective: Predict World Happiness Rankings 

What makes the citizens of one country more happy than the citizens of other countries?  Do variables measuring perceptions of corruption, GDP, maintaining a healthy lifestyle, or social support associate with a country's happiness ranking?  

Let's use the United Nations' World Happiness Rankings country-level data to experiment with models that predict happiness rankings well.


---

**Data**: 2019 World Happiness Survey Rankings


**Features**
*   Country or region
*   GDP per capita
*   Social support
*   Healthy life expectancy
*   Freedom to make life choices
*   Generosity
*   Perceptions of corruption

**Target**
*   Happiness_level (Very High = Top 20% and Very Low = Bottom 20%)

Source: https://worldhappiness.report/




# Mini-Hackathon In Class Tasks



1.   Build, save, and submit at least one Keras model.
2.   Build, save, and submit at least one Scikit-learn model.
3.   Seek advice through collaboration via GitHub (live class only -- not an asynchronous activity):

*      Save notebook w/ best model to private repo
*      Invite a collaborator
*      Collaborator should submit at least two issues w/ suggestions for model improvement

4.   Brainstorm together (Add issues to class hackathon github repo)
5.   If time, improve model further!











# Import the data




In [None]:
# Colab setup.
# Note that tabular preprocessors require scikit-learn>=0.24.0.
# The newest TensorFlow 2 has some bugs for ONNX conversion, so TF 1.x is selected below.
# NOTE(review): the install is unpinned, so a re-run may pick up a different scikit-learn release.
!pip install scikit-learn --upgrade 
import os
# presumably read by the Keras/ONNX conversion tooling to select tf.keras -- TODO confirm which library consumes TF_KERAS
os.environ['TF_KERAS'] = '1'
# Colab line magic that switches the TensorFlow major version. It only takes effect
# before TensorFlow is first loaded; the recorded output shows it was run too late
# ("TensorFlow is already loaded. Please restart the runtime to change versions.").
% tensorflow_version 1

Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (0.24.1)
`%tensorflow_version` only switches the major version: 1.x or 2.x.
You set: `1`. This will be interpreted as: `1.x`.


TensorFlow is already loaded. Please restart the runtime to change versions.


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Load the 2019 World Happiness data; the CSV is expected in the working directory.
data=pd.read_csv("worldhappiness2019.csv")

# Preview the first rows to check the columns were parsed as expected.
data.head()

Unnamed: 0,Happiness_level,Country or region,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,name,region,sub-region
0,Very High,Finland,1.34,1.587,0.986,0.596,0.153,0.393,Finland,Europe,Northern Europe
1,Very High,Denmark,1.383,1.573,0.996,0.592,0.252,0.41,Denmark,Europe,Northern Europe
2,Very High,Norway,1.488,1.582,1.028,0.603,0.271,0.341,Norway,Europe,Northern Europe
3,Very High,Iceland,1.38,1.624,1.026,0.591,0.354,0.118,Iceland,Europe,Northern Europe
4,Very High,Netherlands,1.396,1.522,0.999,0.557,0.322,0.298,Netherlands,Europe,Western Europe


In [None]:
# Split the raw table into predictors and target. The label itself and the
# country-identifying / sub-region columns are left out of the feature matrix,
# which keeps the six survey measures plus 'region'.
dropped_columns = ['Happiness_level', 'name', 'Country or region', 'sub-region']
X = data.drop(columns=dropped_columns)
y = data['Happiness_level']

X.shape, y.shape

((156, 7), (156,))

# Build a model to predict happiness rankings

In [None]:
# Hold out a third of the countries for testing. The seed is fixed so the split
# (and therefore the submitted test predictions) is the same on every run.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Quick sanity check of the training split: sizes and feature names.
for summary in (X_train.shape, y_train.shape, X_train.columns.tolist()):
    print(summary)

(104, 7)
(104,)
['GDP per capita', 'Social support', 'Healthy life expectancy', 'Freedom to make life choices', 'Generosity', 'Perceptions of corruption', 'region']


## Preprocess data using ColumnTransformer (the fitted preprocessor is saved to "preprocessor.zip" further below)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Preprocessing is split by column type: 'region' is the only categorical
# feature, every other column of X is treated as numeric.
categorical_features = ['region']
numeric_features = list(X.columns)
numeric_features.remove('region')

# Numeric columns: fill missing values with the column median, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode. Categories unseen during fitting are ignored at transform time
# instead of raising.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own pipeline.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Fit on the training split only, so no statistics are learned from the test data.
# Keep the name `preprocess`: the `preprocessor(data)` function defined in the
# next cell reuses the name `preprocessor` and looks up `preprocess` at call time.
preprocess = preprocessor.fit(X_train)



In [None]:
# Write function to transform data with preprocessor

def preprocessor(data):
    """Transform ``data`` with the already-fitted ``preprocess`` object.

    ``preprocess`` is looked up as a module-level name when the function is
    called, so it must have been fitted before the first call.

    Args:
        data: Input accepted by ``preprocess.transform``.

    Returns:
        Whatever ``preprocess.transform(data)`` returns.
    """
    return preprocess.transform(data)

In [None]:
# Check shape for keras input: the second dimension is the number of
# preprocessed features the network's first layer has to accept.
preprocessor(X_train).shape # pretty small dataset

(104, 11)

In [None]:
# Check shape for keras output: one-hot encode the target so there is one
# column per happiness level (this is the format passed to model.fit below).
pd.get_dummies(y_train)

Unnamed: 0,Average,High,Low,Very High,Very Low
27,0,0,0,1,0
118,0,0,1,0,0
117,0,0,1,0,0
41,0,1,0,0,0
4,0,0,0,1,0
...,...,...,...,...,...
71,1,0,0,0,0
106,0,0,1,0,0
14,0,0,0,1,0
92,1,0,0,0,0


## Fit a neural network with Keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import keras
from keras.optimizers import SGD
# Preprocess the features and one-hot encode the target once, then read the
# layer sizes from the data instead of hard-coding 11 inputs / 5 classes. The
# network then stays valid if the preprocessing or the label set changes.
X_train_prepared = preprocessor(X_train)
# Cast to float32 so the targets are numeric regardless of the dtype
# pd.get_dummies produces in the installed pandas version.
y_train_onehot = pd.get_dummies(y_train).astype('float32')
n_features = X_train_prepared.shape[1]
n_classes = y_train_onehot.shape[1]

# Three hidden ReLU layers of 64 units each.
model = Sequential()
model.add(Dense(64, input_dim=n_features, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))

# Softmax output layer: one probability per happiness level, in the column
# order of the one-hot encoded target.
model.add(Dense(n_classes, activation='softmax'))

# Compile model
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Fit the network on the training set. validation_split holds out part of the
# training rows for per-epoch validation; per the Keras docs this is the last
# 35% of the rows, taken before shuffling.
model.fit(X_train_prepared, y_train_onehot,
          batch_size=60,
          epochs=300, validation_split=0.35)



Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.callbacks.History at 0x7f9935756780>

## An important aside for production ready Keras models: 
*Keras classification model objects return the predicted probabilities of each class for every prediction.  How do we return a target label instead?*

In [None]:
# Turn predicted class probabilities into class indices.
# `Sequential.predict_classes()` is deprecated and has been removed from newer
# Keras releases; for a softmax output the equivalent is the argmax over the
# predicted probabilities. The prediction is computed once and reused
# (previously the network was run twice).
predicted_probabilities = model.predict(preprocessor(X_test))
prediction_index = np.argmax(predicted_probabilities, axis=-1)
print(prediction_index)

# Now map each index back to its label.
# The columns of the one-hot encoded y_train are in the same order as the
# network's output units, because the network was trained on that encoding.
labels = pd.get_dummies(y_train).columns

predicted_labels = [labels[index] for index in prediction_index]
print(predicted_labels)

[1 0 1 1 4 1 4 0 1 2 1 0 3 2 1 1 4 4 4 0 4 1 1 3 1 2 0 1 1 4 0 2 1 4 2 4 4
 4 1 1 4 1 4 1 3 1 4 1 1 1 1 2]
['High', 'Average', 'High', 'High', 'Very Low', 'High', 'Very Low', 'Average', 'High', 'Low', 'High', 'Average', 'Very High', 'Low', 'High', 'High', 'Very Low', 'Very Low', 'Very Low', 'Average', 'Very Low', 'High', 'High', 'Very High', 'High', 'Low', 'Average', 'High', 'High', 'Very Low', 'Average', 'Low', 'High', 'Very Low', 'Low', 'Very Low', 'Very Low', 'Very Low', 'High', 'High', 'Very Low', 'High', 'Very Low', 'High', 'Very High', 'High', 'Very Low', 'High', 'High', 'High', 'High', 'Low']


In [None]:
# Score the predicted labels against the held-out test labels.
from sklearn.metrics import accuracy_score

test_accuracy = accuracy_score(y_test, predicted_labels)
print(f"Accuracy: {test_accuracy * 100:.2f}%")

Accuracy: 42.31%


## Save preprocessor to preprocessor.zip, save model to onnx, and submit model to leaderboard:
We will use the predictions above when we submit to the leaderboard in a bit.

## Save preprocessor function to "preprocessor.zip"

In [None]:
# Install the aimodelshare library.
# NOTE(review): --extra-index-url makes pip resolve packages from TestPyPI in
# addition to PyPI, and the install is unpinned -- confirm this is still
# intended before reusing the notebook.
! pip install aimodelshare --upgrade --extra-index-url https://test.pypi.org/simple/ 

In [None]:
# Save preprocessor function to local "preprocessor.zip" file for leaderboard submission
import aimodelshare as ai
# NOTE(review): the second argument is an empty string; presumably the target
# directory (i.e. the current working directory) -- confirm against the aimodelshare docs.
ai.export_preprocessor(preprocessor,"")

In [None]:
# Test your preprocessor: reload the exported function from the zip file and
# apply it to the held-out features as a round-trip check.
prep=ai.import_preprocessor("preprocessor.zip")
prep(X_test)

## Save keras model to onnx file.  

In [None]:
from aimodelshare.aimsonnx import model_to_onnx
# Convert the trained Keras model to ONNX. The keyword arguments describe the
# model to aimodelshare: a Keras deep-learning classifier without transfer learning.
onnx_model_keras= model_to_onnx(model, framework='keras', 
                                   transfer_learning=False,
                                   deep_learning=True,
                                   task_type='classification')

# Save model to local .onnx file (serialized bytes, hence binary mode)
with open("onnx_model_keras.onnx", "wb") as f:
    f.write(onnx_model_keras.SerializeToString())

The ONNX operator number change on the optimization: 32 -> 12


## Aside: Example of code similar to what is run behind the scenes within our REST api:

In [None]:
# With ONNX you can make predictions in the following manner. This is what happens behind the scenes in our live web application:
# the JSON input data is sent to a REST API, transformed to a pandas DataFrame, preprocessed, and then predictions are generated from our ONNX model.

import onnxruntime as rt
# Load the saved model and look up the name of its (single) input tensor.
sess= rt.InferenceSession("onnx_model_keras.onnx")
input_name = sess.get_inputs()[0].name
# Example request body: one row, using the same column names as the training features.
bodydict={ 'GDP per capita': [1], 'Social support': [1], 'Healthy life expectancy': [1], 'Freedom to make life choices': [1], 'Generosity': [1], 'Perceptions of corruption': [1],'region': ['Europe']}


bodynew = pd.DataFrame.from_dict(bodydict)

# Apply the fitted preprocessing and cast to float32 before handing the row to the ONNX session.
input_data=preprocessor(bodynew).astype("float32")
input_data

array([[ 0.21633796, -0.6604822 ,  1.0795562 ,  3.9406095 ,  8.006234  ,
         8.378734  ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ]], dtype=float32)

In [None]:
# Here is the resulting predicted probability for each of the five categories of our target variable.
# Passing None as the first argument requests all model outputs; the first one is displayed.
res = sess.run(None,  {input_name: input_data})
res[0]

array([[9.0735066e-06, 6.0902587e-03, 1.3971618e-05, 9.9385393e-01,
        3.2791515e-05]], dtype=float32)

# Submit model to World Happiness Leaderboard

### To submit a model you need to sign up for a username and password at:
[AI Model Share Initiative Site](http://mlsite5aimodelshare-dev.s3-website.us-east-2.amazonaws.com/login)

# Set up necessary arguments for model submission using aimodelshare python library.

## Required information for tabular models:
* api_url (the API URL for whichever aimodelshare project you are submitting a model to)
* AWS key and password (provided for you)
* model file path
* preprocessor file path



In [None]:
# aimodelshare username and password (placeholders -- replace with your own).
# NOTE(review): avoid committing real credentials in the notebook; consider
# reading them from the environment or prompting for them instead.
username = "---"
password = "---"

# load submit model creds (only gives access to s3 bucket)
# Load from pkl file
# NOTE(review): pickle.load can execute arbitrary code; only load this file if
# it comes from a trusted source.
import pickle
with open("aws_creds_worldhappiness.pkl", 'rb') as file:
    aws_key,aws_password,region = pickle.load(file)

# Exchange the aimodelshare login for a token and build the AWS client used for submissions.
token=ai.aws.get_aws_token(username, password)
awscreds=ai.aws.get_aws_client(aws_key=aws_key, aws_secret=aws_password, aws_region=region)

In [None]:
# Submit the Keras ONNX model to the leaderboard, together with the predicted
# test labels and the exported preprocessor.
ai.submit_model("onnx_model_keras.onnx",
                "https://z69mxrxdz5.execute-api.us-east-1.amazonaws.com/prod/m",
                token,awscreds,prediction_submission=predicted_labels,
                preprocessor="preprocessor.zip")

True

# Now you can check the leaderboard!

In [None]:
import pandas
# Fetch the current leaderboard for this project.
# NOTE(review): this rebinds `data`, which previously held the happiness
# DataFrame loaded from the CSV; re-running the earlier feature-selection cell
# after this one will fail unless the CSV is reloaded first.
data=ai.get_leaderboard("https://z69mxrxdz5.execute-api.us-east-1.amazonaws.com/prod/m",
                token,awscreds,verbose=2)

#get rid of any duplicate model submissions
#data=data.loc[data.iloc[:,0:8].duplicated()==False,:]
data

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,dense_layers,loss,optimizer,model_config,username,timestamp,version
0,0.519231,0.523419,0.567857,0.533333,keras,True,True,Sequential,4.0,35205.0,4.0,str,SGD,"{'name': 'sequential_5', 'layers': [{'class_na...",prajseth,2021-02-02 00:05:49.874938,15
1,0.5,0.50223,0.560272,0.514848,keras,True,True,Sequential,4.0,185705.0,4.0,str,SGD,"{'name': 'sequential_7', 'layers': [{'class_na...",prajseth,2021-02-02 00:14:50.818495,24
2,0.480769,0.483535,0.516468,0.486818,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_2', 'layers': [{'class_na...",xc2303,2021-02-02 00:14:45.315470,23
3,0.480769,0.483535,0.516468,0.486818,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_2', 'layers': [{'class_na...",xc2303,2021-02-02 00:07:31.321181,18
4,0.461538,0.461969,0.524762,0.480303,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_1', 'layers': [{'class_na...",chengzhong,2021-02-02 00:01:24.373356,9
5,0.461538,0.452692,0.507778,0.475152,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_288', 'layers': [{'class_...",dv2438,2021-02-02 00:16:03.790075,26
6,0.461538,0.452368,0.480112,0.471818,keras,False,True,Sequential,4.0,35205.0,4.0,str,SGD,"{'name': 'sequential_11', 'layers': [{'class_n...",mr3536,2021-02-02 08:16:38.120754,28
7,0.442308,0.442941,0.491746,0.461818,keras,True,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_4', 'layers': [{'class_na...",prajseth,2021-02-02 00:03:50.176736,12
8,0.423077,0.423022,0.482143,0.442121,keras,True,True,Sequential,4.0,534021.0,4.0,str,SGD,"{'name': 'sequential_6', 'layers': [{'class_na...",prajseth,2021-02-02 00:09:36.685219,20
9,0.423077,0.426294,0.473109,0.435303,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",prajseth,2021-02-02 00:15:21.257543,25


In [None]:
# Render the leaderboard fetched above with classification-specific formatting.
ai.stylize_leaderboard(data, category="classification")

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,dense_layers,loss,optimizer,model_config,username,version
0,46.15%,46.20%,52.48%,48.03%,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_1', 'laye...",chengzhong,9
1,44.23%,41.41%,47.09%,45.18%,keras,False,True,Sequential,6.0,787273.0,6.0,str,SGD,"{'name': 'sequential_1', 'laye...",kagenlim,5
2,40.38%,38.96%,46.28%,42.86%,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_3', 'laye...",prajseth,10
3,40.38%,40.08%,44.15%,41.38%,keras,False,True,Sequential,4.0,286273.0,4.0,str,SGD,"{'name': 'sequential_2', 'laye...",kagenlim,6
4,40.38%,39.78%,44.14%,42.03%,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_2', 'laye...",prajseth,7
5,40.38%,37.92%,42.37%,41.85%,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_9', 'laye...",newusertest,1
6,38.46%,38.55%,42.10%,39.53%,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha...",newusertest,3
7,38.46%,38.55%,42.10%,39.53%,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha...",helloworld,4
8,38.46%,38.55%,42.10%,39.53%,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha...",prajseth,8
9,38.46%,38.55%,42.10%,39.53%,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha...",prajseth,11


## Build, save, and submit a sklearn model


In [None]:
# Build, save, and submit a sklearn model

from numpy import loadtxt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Random forest model and sklearn's metrics module.
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Preprocess each split once and reuse the result below.
rf_train_features = preprocessor(X_train)
rf_test_features = preprocessor(X_test)

# Random forest with 1000 trees; the fixed random_state keeps results repeatable.
# Note that this rebinds `model`, which held the Keras network until now.
model = RandomForestClassifier(n_estimators=1000, random_state=0)
model.fit(rf_train_features, y_train)

# Mean 10-fold cross-validated accuracy on the training split, and accuracy on
# the held-out test split.
rf_cv_accuracy = np.mean(cross_val_score(model, rf_train_features, y_train, cv=10))
rf_test_accuracy = model.score(rf_test_features, y_test)

print("Random Forest Classifier's cross validation accuracy:", rf_cv_accuracy)
print("Random Forest Classifier's Test-Data prediction accuracy: {:.5f}".format(rf_test_accuracy))


Random Forest Classifier's cross validation accuracy: 0.6454545454545455
Random Forest Classifier's Test-Data prediction accuracy: 0.38462


In [None]:
# Replace the Keras predictions with the random forest's predicted labels;
# these are what the sklearn submission below sends to the leaderboard.
predicted_labels=model.predict(preprocessor(X_test))

In [None]:
# Number of preprocessed features; the ONNX input type declared below must match the second dimension.
preprocessor(X_train).shape

(104, 11)

In [None]:
# Save sklearn model to onnx file
import aimodelshare as ai
from aimodelshare.aimsonnx import model_to_onnx

# The ONNX converter needs the input signature up front. Read the number of
# preprocessed features from the data instead of hard-coding it, so the export
# stays correct if the preprocessing changes.
from skl2onnx.common.data_types import FloatTensorType
n_input_features = preprocessor(X_train).shape[1]
# None leaves the batch dimension unspecified.
initial_type = [('float_input', FloatTensorType([None, n_input_features]))]

# Convert the fitted sklearn model to ONNX. The keyword arguments describe the
# model to aimodelshare: a non-deep-learning classifier without transfer learning.
onnx_model_sklearn = model_to_onnx(model, framework='sklearn',
                                   initial_types=initial_type,
                                   transfer_learning=False,
                                   deep_learning=False,
                                   task_type='classification')

# Save model to local .onnx file (serialized bytes, hence binary mode)
with open("onnx_model_sklearn.onnx", "wb") as f:
    f.write(onnx_model_sklearn.SerializeToString())

In [None]:
# aimodelshare username and password (placeholders -- replace with your own).
# NOTE(review): avoid committing real credentials in the notebook.
username = "your aimodelshare username here"
password = "your aimodelshare password here"

# load submit model creds (only gives access to s3 bucket)
# Load from pkl file; relies on `pickle` having been imported in the earlier credentials cell.
# NOTE(review): pickle.load can execute arbitrary code; only load this file if
# it comes from a trusted source.
with open("aws_creds_worldhappiness.pkl", 'rb') as file:
    aws_key,aws_password,region = pickle.load(file)

# Exchange the aimodelshare login for a token and build the AWS client used for submissions.
token=ai.aws.get_aws_token(username, password)
awscreds=ai.aws.get_aws_client(aws_key=aws_key, aws_secret=aws_password, aws_region=region)

In [None]:
# Submit the sklearn ONNX model to the leaderboard, together with the random
# forest's predicted test labels and the exported preprocessor.
ai.submit_model("onnx_model_sklearn.onnx",
                "https://z69mxrxdz5.execute-api.us-east-1.amazonaws.com/prod/m",
                token,awscreds,prediction_submission=predicted_labels,
                preprocessor="preprocessor.zip")

True

In [None]:
# Check leaderboard
import pandas
# NOTE(review): this rebinds `data`, the name first used for the happiness
# DataFrame loaded from the CSV.
data=ai.get_leaderboard("https://z69mxrxdz5.execute-api.us-east-1.amazonaws.com/prod/m",
                token,awscreds,verbose=2)

#get rid of any duplicate model submissions
#data=data.loc[data.iloc[:,0:8].duplicated()==False,:]
data

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,dense_layers,loss,optimizer,model_config,username,timestamp,version
0,0.403846,0.379187,0.423724,0.418485,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_9', 'layers': [{'class_na...",newusertest,2021-02-01 19:48:41.733994,1
1,0.384615,0.385546,0.421046,0.395303,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",newusertest,2021-02-01 20:01:00.150617,3
2,0.384615,0.385546,0.421046,0.395303,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",helloworld,2021-02-01 20:05:51.390852,4


In [None]:
# Render the leaderboard fetched above with classification-specific formatting.
ai.stylize_leaderboard(data, category="classification")

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,dense_layers,loss,optimizer,model_config,username,version
0,40.38%,37.92%,42.37%,41.85%,keras,False,True,Sequential,4.0,9413.0,4.0,str,SGD,"{'name': 'sequential_9', 'laye...",newusertest,1
1,38.46%,38.55%,42.10%,39.53%,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha...",newusertest,3
2,38.46%,38.55%,42.10%,39.53%,sklearn,False,False,RandomForestClassifier,,,,,,"{'bootstrap': True, 'ccp_alpha...",helloworld,4
