Option 1: 
    * Regress against 2015 data (or end of summary year, whenever possible)
    
Option 2:
    * Regress against change over same summary period
    * Theory: this observes structural shifts in the economy (Material Flow and extractive activities),
      or changes in political economy (World Bank)

Assumptions
* Country columns share the same georeferencing

In [None]:
def extract_unique(df, col):
    '''Return the distinct values of column `col` in `df`, via the column's own `.unique()`.'''
    column = df[col]
    distinct_values = column.unique()
    return distinct_values

def run_linear_regressions(data, year, 
                           prod_col, flow_col,
                           year_col, val_col, 
                           country_col,
                           data_x=None,
                           flow_x=None,
                           prod_x='GHG-GDP Divergence Index'):
    '''
    Regress every (product, flow) series in `data` on one regressor series.

    For each product/flow combination found in `data`, the rows for `year`
    are matched by country against `data_x`, a simple linear regression
    y ~ x is fitted on a train split, and the coefficient of determination
    on the held-out split is recorded.

    Inputs:
        data        - DataFrame holding the dependent series; rows with a
                      null `prod_col` are ignored.
        year        - value of `year_col` to select from `data`.
        prod_col, flow_col, year_col, val_col, country_col
                    - column names in `data`.
        data_x      - DataFrame holding the regressor (the GHG-GDP Divergence
                      data); must contain `country_col` and `val_col`.
                      NOTE(review): assumed to hold one row per country and
                      to be pre-filtered to the period of interest - confirm.
        flow_x, prod_x
                    - labels identifying the regressor in the result keys.

    Outputs:
        dict keyed by (flow_x, prod_x, flow_y, prod_y); each value is a dict
        with 'r_squared' (-1 when there were too few shared countries to
        hold out a test set) and 'skipped_countries' (countries present in
        `data` but missing from either side of this regression).

    Raises:
        ValueError if `data_x` is not supplied.
    '''
    # The regressor used to be an unfinished "load" statement; it must now be
    # passed in explicitly because nothing here knows where it lives.
    if data_x is None:
        raise ValueError('data_x (the GHG-GDP Divergence data) must be provided')

    # Only look at comparisons of traded products.
    # Boolean filtering returns a new frame, so the caller's data is untouched.
    data = data[pd.notnull(data[prod_col])]

    # Create lists of countries, products, and flows to loop over
    all_countries, all_products, all_flows = [
        extract_unique(data, col) for col in (country_col, prod_col, flow_col)
    ]
    logging.debug('all_countries: %s', all_countries)
    logging.debug('all_products: %s', all_products)
    logging.debug('all_flows: %s', all_flows)

    # TO DO: allow for year ranges
    data_year = data[data[year_col] == year]

    # The regressor is the same for every regression, so index it once and
    # never rebind it inside the loop.
    x_by_country = data_x.set_index(country_col)[val_col]
    x_countries = set(x_by_country.index)

    # Number of countries held out for scoring. train_test_split treats an
    # integer test_size as an absolute sample count, so more than this many
    # shared countries are needed before a split is possible.
    n_test = 30

    results = {}

    for prod_y in all_products:
        for flow_y in all_flows:
            logging.debug('flow y: %s', flow_y)
            logging.debug('prod y: %s', prod_y)
            logging.info('regressing GHG-GDP Divergence Index against %s of %s',
                         flow_y, prod_y)

            data_y = data_year[(data_year[prod_col] == prod_y)
                               & (data_year[flow_col] == flow_y)]
            y_by_country = data_y.set_index(country_col)[val_col]

            # Throw away all but intersection of countries. A list (in the
            # order of all_countries) is used for .loc because pandas does
            # not accept a set as an indexer.
            shared = x_countries & set(y_by_country.index)
            keep_countries = [c for c in all_countries if c in shared]
            skipped_countries = [c for c in all_countries if c not in shared]

            # Reshape for regression: one column, one row per country
            x_vals = x_by_country.loc[keep_countries].values.reshape(-1, 1)
            y_vals = y_by_country.loc[keep_countries].values.reshape(-1, 1)

            # -1 marks "not enough data to fit and score a regression"
            r_squared = -1
            if x_vals.shape[0] > n_test:
                # Split for training / test set
                X_train, X_test, y_train, y_test = train_test_split(
                    x_vals, y_vals, test_size=n_test, random_state=42)

                # Run regression
                lm = linear_model.LinearRegression()
                lm.fit(X_train, y_train)

                # Extract coefficient of determination (r^2)
                r_squared = lm.score(X_test, y_test)

            # Store results
            results[(flow_x, prod_x, flow_y, prod_y)] = {
                'r_squared': r_squared,
                'skipped_countries': skipped_countries
            }

    return results

def _flow_display_name(df_flow_names, flow):
    '''Return the 'Flow.name' entry for `flow`, or `flow` itself when the lookup fails.'''
    try:
        return df_flow_names.loc[flow, 'Flow.name']
    except (KeyError, TypeError):
        # Best-effort: fall back to the raw value when there is no usable entry.
        return flow


def pretty_print_results(data_tuple, df_prod_names, df_flow_names):
    '''
    Replace the product/flow codes in one result entry with readable names.

    Inputs:
        data_tuple    - pair whose first item is
                        (flow_x, prod_x, flow_y, prod_y) and whose second
                        item is passed through untouched.
        df_prod_names - DataFrame indexed by product code, with the name in
                        column 'V2'. A missing product raises KeyError.
        df_flow_names - DataFrame indexed by flow code, with the name in
                        column 'Flow.name'. A missing flow is left as-is.

    Outputs:
        ((flow_x_name, prod_x_name, flow_y_name, prod_y_name), data_tuple[1])
    '''
    flow_x, prod_x, flow_y, prod_y = data_tuple[0]

    ## ALERT TO MATERIAL FLOWS!!!! DATA DOESNT USE SHORTHAND FOR EXPORT AND IMPORT
    # ...which is why flow lookups fall back to the raw value while product
    # lookups are strict.
    prod_x_name = df_prod_names.loc[prod_x, 'V2']
    prod_y_name = df_prod_names.loc[prod_y, 'V2']
    flow_x_name = _flow_display_name(df_flow_names, flow_x)
    flow_y_name = _flow_display_name(df_flow_names, flow_y)

    return ((flow_x_name, prod_x_name, flow_y_name, prod_y_name), data_tuple[1])
    