# In [None]:
import pandas as pd
import evalml
import woodwork as ww
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from featuretools.selection import remove_low_information_features, remove_highly_null_features, remove_single_value_features, remove_highly_correlated_features

# In [None]:
#possible_names = ['busi', 'econ', 'educ', 'envi', 'gove', 'heal', 'pers', 'safe', 'soci']

'''
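@param name: name of the pillar (e.g. 'busi'), used to build the CSV and pipeline file names
@param countries: list of country names
@return dataframe of predicted pillar scores and per-year ranks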
'''

def pillar(name='busi', countries=None):
    """Predict one pillar's score on the test set and rank countries per year.

    Loads ``<name>_train.csv`` and ``<name>_test.csv`` from the project's
    GitHub data folder, loads a previously saved pipeline from the local file
    ``<name>_best_pipeline`` and uses it to predict the pillar score for every
    test row.

    Args:
        name: Pillar column name (e.g. ``'busi'``); also the prefix of the
            CSV files and of the saved pipeline file.
        countries: Country names to keep in the returned frame. Defaults to
            ``['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']``.

    Returns:
        DataFrame with columns ``country``, ``year``, ``<name>`` (predicted
        score) and ``rank_<name>`` (dense rank within each year, highest
        score ranked 1), restricted to ``countries``.
    """
    if countries is None:
        countries = ['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']

    url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'

    def _select_features(frame):
        # Apply the four featuretools column filters in a fixed order.
        for prune in (
            remove_low_information_features,
            remove_highly_null_features,
            remove_single_value_features,
            remove_highly_correlated_features,
        ):
            frame = prune(frame)
        return frame

    def _drop_year_columns(frame):
        # Drop every column whose name contains 'year'.
        return frame.drop(columns=[col for col in frame.columns if 'year' in col])

    # --- training data: only needed to construct the AutoMLSearch object ---
    train = pd.read_csv(url + name + '_train.csv')
    train = _drop_year_columns(train.drop(['Unnamed: 0'], axis=1))

    y = train[name]
    X = _select_features(train.drop(['rank_' + name, name], axis=1))

    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=X['country']
    )

    automl = evalml.automl.AutoMLSearch(
        X_train, y_train, problem_type='regression', objective='auto'
    )

    # The pipeline file is resolved relative to the current working directory.
    # NOTE(review): `load` looks like a static loader that does not use the
    # search object's state; if so, the training-data setup above could be
    # dropped entirely - confirm against the installed evalml version.
    best_pipeline = automl.load(name + '_best_pipeline')

    # --- test data ---
    test = pd.read_csv(url + name + '_test.csv')
    # Keep the identifying columns before 'year' is dropped from the features,
    # so the file does not have to be downloaded a second time.
    result = test[['country', 'year']].reset_index(drop=True)

    features = _drop_year_columns(test.drop(['Unnamed: 0'], axis=1))
    # NOTE(review): the filters run on the test frame independently of the
    # training frame, so the surviving columns may differ from the ones the
    # pipeline was fitted on - confirm this is intended.
    features = _select_features(features)

    predictions = best_pipeline.predict(features).to_series()

    # Predictions are in test-row order; attach them positionally.
    result[name] = predictions.reset_index(drop=True)

    rank_col = 'rank_' + name
    result[rank_col] = (
        result.groupby('year')[name]
        .rank(method='dense', ascending=False)
        .astype('int')
    )

    return result[result['country'].isin(countries)]

# In [None]:
'''
@param countries: list of country names
@return dataframe with predicted prosperity scores and per-year ranks
'''

def prosperity(countries=None):
    """Predict the overall prosperity score on the test set and rank per year.

    Loads ``merged.csv`` (training) and ``test.csv`` from the project's GitHub
    data folder, loads a previously saved pipeline from the local file
    ``prosperity_best_pipeline`` and uses it to predict a prosperity score for
    every test row.

    Args:
        countries: Country names to keep in the returned frame. Defaults to
            ``['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']``.

    Returns:
        DataFrame with columns ``country``, ``year``, ``prosperity``
        (predicted score) and ``rank_prosperity`` (dense rank within each
        year, highest score ranked 1), restricted to ``countries``.
    """
    if countries is None:
        countries = ['Chad', 'Togo', 'Zimbabwe', 'Ivory Coast', 'Georgia']

    url = 'https://raw.githubusercontent.com/Andrewl7127/UCSD-DataHacks-2021/main/Data/'

    def _select_features(frame):
        # Apply the four featuretools column filters in a fixed order.
        for prune in (
            remove_low_information_features,
            remove_highly_null_features,
            remove_single_value_features,
            remove_highly_correlated_features,
        ):
            frame = prune(frame)
        return frame

    # --- training data: only needed to construct the AutoMLSearch object ---
    train = pd.read_csv(url + 'merged.csv')
    train = train.drop(['Unnamed: 0'], axis=1)

    metrics = ['educ', 'soci', 'heal', 'pers', 'busi', 'econ', 'safe', 'gove', 'envi']
    ranks = ['rank_' + metric for metric in metrics]
    non_features = metrics + ranks + ['year', 'prosperity_score']

    y = train['prosperity_score']
    X = _select_features(train.drop(non_features, axis=1))

    X_train, _, y_train, _ = train_test_split(
        X, y, test_size=0.25, random_state=42, stratify=X['country']
    )

    automl = evalml.automl.AutoMLSearch(
        X_train, y_train, problem_type='regression', objective='auto'
    )

    # The pipeline file is resolved relative to the current working directory.
    # NOTE(review): `load` looks like a static loader that does not use the
    # search object's state; if so, the training-data setup above could be
    # dropped entirely - confirm against the installed evalml version.
    best_pipeline = automl.load('prosperity_best_pipeline')

    # --- test data ---
    test = pd.read_csv(url + 'test.csv', index_col=0)
    # Take the identifying columns from the frame already loaded from `url`
    # instead of re-reading a local 'test.csv' that may not exist.
    result = test[['country', 'year']].reset_index(drop=True)

    # NOTE(review): the filters run on the test frame independently of the
    # training frame, so the surviving columns may differ from the ones the
    # pipeline was fitted on - confirm this is intended.
    features = _select_features(test.drop(['year'], axis=1))

    predictions = best_pipeline.predict(features).to_series()

    # Predictions are in test-row order; attach them positionally.
    result['prosperity'] = predictions.reset_index(drop=True)

    result['rank_prosperity'] = (
        result.groupby('year')['prosperity']
        .rank(method='dense', ascending=False)
        .astype('int')
    )

    # Honour the caller's selection rather than a hard-coded country list.
    return result[result['country'].isin(countries)]