In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split  # splits data for training and testing

In [7]:
%%capture
# `%%capture` has to stay on the first line of the cell; it captures the output of everything below,
# so nothing rendered by the bare `tqdm()` call is displayed.
# create loading bar
from tqdm.notebook import tqdm
# Instantiates a tqdm object and calls `.pandas()` on it.
# NOTE(review): presumably meant to register pandas' `progress_apply`; the form documented by tqdm is the
# class-level call `tqdm.pandas()` — confirm before changing.
tqdm().pandas()

In [2]:
# Paths of the input CSV files: document counts first, engagement figures second.
count_ig_csv = 'emerging_risks_doc_count_instagram.csv'
count_tw_csv = 'emerging_risks_doc_count_twitter.csv'

engagement_fb_csv = 'emerging_risks_local_engagement_facebook.csv'
engagement_ig_csv = 'emerging_risks_local_engagement_instagram.csv'
engagement_tw_csv = 'emerging_risks_local_engagement_twitter.csv'

files = [count_ig_csv, count_tw_csv, engagement_fb_csv, engagement_ig_csv, engagement_tw_csv]

# One DataFrame per file, keyed by the name of the variable that holds its path.
# The labels below are listed in the same order as `files`.
dfs = {
    label: pd.read_csv(csv_path)
    for label, csv_path in zip(
        ('count_ig_csv', 'count_tw_csv', 'engagement_fb_csv', 'engagement_ig_csv', 'engagement_tw_csv'),
        files,
    )
}

In [3]:
def is_up_real_target(ser: pd.Series) -> bool:
    """
    Tell whether a Series ends on a strictly higher value than it starts with.

    Only the first and the last element are compared; the values in between are ignored.
    Goal: label the test period as having a positive (True) or a non-positive (False) trend.

    :param ser: column of the dataset to predict (REAL values of the part of the dataset selected for test).
    :return: True if the last value is strictly greater than the first one, False otherwise.
    :raises IndexError: if ``ser`` is empty.
    TODO: IMPROVE. Maybe compare the mean of the previous week to ser.mean()?
    """
    first_value, last_value = ser.values[0], ser.values[-1]
    return bool(last_value > first_value)

def is_trending_up_real_target(ser: pd.Series) -> bool:
    """
    Label the real data as trending up when a straight line fitted to it has a positive slope.

    Refinement of is_up_real_target(): instead of comparing only the two end points, a LinearRegression
    is fitted with the index values of the Series as x and the Series values as y.

    :param ser: real values of the test period; its index values are used as the regressor.
    :return: True if the fitted coefficient is strictly positive, False otherwise.
    """
    regressor = ser.index.values.reshape(-1, 1)      # single feature, reshaped into a column
    fitted = LinearRegression().fit(regressor, ser)
    return bool(fitted.coef_ > 0)
    
def is_up_predicted_target(coefficients: np.ndarray) -> bool:
    """
    Evaluates if the slope of the linear regression is positive.
    Goal: Understand if the period used to fit the regression has a positive or negative trend
    (calculated as slope/coefficient of the regression).

    :param coefficients: coefficients of the regression fitted on the column of the dataset used for training.
                         Any array-like is accepted (scalar, list, 1-D or 2-D array); only the first
                         coefficient is inspected.
    :return: True if the first coefficient is strictly positive, False otherwise.
    :raises IndexError: if ``coefficients`` is empty.
    """
    # Flatten first so that a scalar or a 2-D ``coef_`` yields a single number instead of an array
    # whose truth value would be ambiguous.
    slope = np.ravel(coefficients)[0]
    return bool(slope > 0)


def update_eval_values(tp: int, tn: int, fp: int, fn: int, predicted_target: bool, real_target: bool):
    """
    Add the outcome of one train/test window to the running counts of the matrix
     _________________________________
    | True Positive  | False Positive |
     ---------------------------------
    | False Negative | True Negative  |
     _________________________________

    Exactly one counter is incremented, depending on how the predicted direction compares to the real one.
    If either target is neither equal to 1 nor to 0, the counts are returned unchanged.
    Goal: Considering one train/test, understand if the model is correctly predicting whether the test
    period had a positive or negative trend.

    :param tp, tn, fp, fn: counts accumulated so far.
    :param predicted_target: direction predicted by the model (True/1 = up).
    :param real_target: direction observed in the test period (True/1 = up).
    :return: the updated counts as a tuple (tp, tn, fp, fn).
    """
    if predicted_target == 1:
        # the model said "up"
        if real_target == 1:
            tp += 1
        elif real_target == 0:
            fp += 1
    elif predicted_target == 0:
        # the model said "not up"
        if real_target == 0:
            tn += 1
        elif real_target == 1:
            fn += 1
    return tp, tn, fp, fn

In [4]:
def confusion_matrix_baseline_model(column: pd.Series, step_days: int, month_length: int, evaluate_trend=True,
                                    accuracy_matrix=False, n_days: int = 364):
    """
    Apply the linear regression baseline to one variable of one file and return the confusion matrix
     _________________________________
    | True Positive  | False Positive |
     ---------------------------------
    | False Negative | True Negative  |
     _________________________________
    The column is split into moving windows of length "month_length", starting every "step_days" rows at
    positions range(0, n_days, step_days). In each window a linear regression is fitted on a "train" period
    (all rows but the last step_days; index values as x, column values as y) and the sign of its slope is
    compared with the direction observed in the "test" period (the last step_days rows).

    NOTE: a window that runs past the end of ``column`` is silently shorter than month_length; no check is
    made here, so pick n_days accordingly.

    :param column: the variable to evaluate, one row per day.
    :param step_days: length of the test period and stride between windows; 1 <= step_days < month_length.
    :param month_length: length of each window, in rows.
    :param evaluate_trend: if False, the real target comes from is_up_real_target(); otherwise
                           is_trending_up_real_target() is used.
    :param accuracy_matrix: if set to True, returns accuracy float = (tp + tn)/(tp+tn+fp+fn) rather than dict.
    :param n_days: exclusive upper bound of the window start positions (defaults to 364, the value that
                   used to be hard-coded).
    :return: dict with keys "tp", "tn", "fp", "fn", or the accuracy as a float if accuracy_matrix is True.
    :raises ValueError: if step_days, month_length or n_days cannot produce at least one window with a
                        non-empty train period.
    """
    # Fail early with a readable message instead of an empty range, an empty train set or a division by zero.
    if step_days < 1 or step_days >= month_length:
        raise ValueError(f"step_days must satisfy 1 <= step_days < month_length, "
                         f"got step_days={step_days}, month_length={month_length}")
    if n_days < 1:
        raise ValueError(f"n_days must be at least 1, got {n_days}")

    tp, tn, fp, fn = 0, 0, 0, 0
    for day in range(0, n_days, step_days):
        month = column.iloc[day: day + month_length]          # explicit positional window
        train, test = month.iloc[:-step_days], month.iloc[-step_days:]

        model = LinearRegression()
        X_train = train.index.values.reshape(-1, 1)           # single feature, reshaped into a column
        model.fit(X_train, train)

        if evaluate_trend is False:
            real_target = is_up_real_target(test)
        else:
            real_target = is_trending_up_real_target(test)

        predicted_target = is_up_predicted_target(model.coef_)
        tp, tn, fp, fn = update_eval_values(tp, tn, fp, fn, predicted_target, real_target)

    if accuracy_matrix is True:
        return (tp + tn) / (tp + tn + fp + fn)

    return {"tp": tp, "tn": tn, "fp": fp, "fn": fn}

In [5]:
def get_df_matrix(data_table: pd.DataFrame, confusion=False, accuracy=False, step_days=7, month_length=28,
                  threshold=10, evaluate_trend=True) -> dict:
    """
    Return the confusion matrix or the accuracy matrix for an entire df.
    Confusion matrix for entire df is a dict of dicts.
    Accuracy matrix for entire df is a dict of floats.
    Keys are the column names of ``data_table``; the 'date' column and columns with too few non-zero
    values are left out.

    :param data_table: DataFrame whose columns are evaluated one by one.
    :param confusion: set to True to get, per column, the dict of confusion counts.
    :param accuracy: set to True to get, per column, the accuracy float.
    :param step_days: forwarded to confusion_matrix_baseline_model().
    :param month_length: forwarded to confusion_matrix_baseline_model().
    :param threshold: min of # of values different from 0 a column needs in order to be evaluated.
    :param evaluate_trend: if set to True, calls is_trending_up_real_target() in place of is_up_real_target()
    :return: dict mapping column name to its confusion dict or accuracy float.
    :raises TypeError: unless exactly one of ``confusion`` / ``accuracy`` is True.
    """
    # Reject equal flags as well as flag pairs where neither one is literally True
    # (e.g. confusion=1, accuracy=0), which used to end in an UnboundLocalError.
    if confusion == accuracy or not (confusion is True or accuracy is True):
        raise TypeError('Set either confusion or accuracy to True.'
                        '\nUse either get_df_matrix(df, confusion=True) or get_df_matrix(df, accuracy=True)')

    # confusion takes precedence when it is True, otherwise accuracy was requested
    want_accuracy = confusion is not True

    matrix = {}
    for column_name in data_table:
        if column_name == 'date':
            continue
        series = data_table[column_name]
        if (series != 0).sum() >= threshold:        # vectorised count of the non-zero values
            matrix[column_name] = confusion_matrix_baseline_model(series, step_days, month_length,
                                                                  evaluate_trend=evaluate_trend,
                                                                  accuracy_matrix=want_accuracy)
    return matrix

In [13]:
# Confusion matrices are not suited to CSV export: every cell would hold a dict with the four counts.
# Takes about 23 seconds.
confusion_matrixes = {
    source: get_df_matrix(table, confusion=True, step_days=7, month_length=28)
    for source, table in tqdm(dfs.items())
}

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [8]:
# Needs about 24 seconds with these settings. The run time depends on the parameters: one regression is
# fitted per window start, and the number of starts grows as step_days gets smaller.
accuracy_matrixes = {
    source: get_df_matrix(table, accuracy=True, step_days=7, month_length=28)
    for source, table in tqdm(dfs.items())
}

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [16]:
# Same evaluation with 3-day windows (2 days to fit, 1 day to test) and the end-point comparison
# is_up_real_target() as the real target. This rebinds `accuracy_matrixes`.
accuracy_matrixes = {
    source: get_df_matrix(table, accuracy=True, step_days=1, month_length=3, evaluate_trend=False)
    for source, table in tqdm(dfs.items())
}

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))




In [17]:
# Columns of acc_df are the keys of accuracy_matrixes (one per source file); rows are the evaluated variables.
acc_df = pd.DataFrame(accuracy_matrixes)
# acc_df.to_csv("accuracy_matrix.csv")

source_accuracies = acc_df.mean(axis="index")    # one average per column, i.e. per source file
topic_accuracies = acc_df.mean(axis="columns")   # one average per row, i.e. per topic

print(source_accuracies)
# source_accuracies.to_csv("BaselineModelAccuracyByDoc.csv", header=["Baseline Model Avg Accuracy"])

count_ig_csv         0.884074
count_tw_csv         0.596501
engagement_fb_csv    0.847301
engagement_ig_csv    0.925420
engagement_tw_csv    0.593487
dtype: float64


In [10]:
# Show the row-wise averages of acc_df (one value per topic).
print(topic_accuracies)
# topic_accuracies.to_csv("BaselineModelAccuracyByTopic.csv", header=["Baseline Model Avg Accuracy"])

Additive                     0.572115
Allergy                      0.500000
Almond_milk                  0.461538
Aluminium                    0.509615
Animal_Welfare               0.538462
Antibiotics                  0.466346
Atrazine                     0.538462
Baby_Milk                    0.423077
Bacteria                     0.475962
Bank                         0.570513
Biodegradable                0.581731
Biodiversity                 0.538462
Breastfeeding                0.576923
Brexit                       0.620192
CEO                          0.649038
Cancer                       0.530769
Carbohydrates                0.538462
Chemical                     0.515385
Chlorine                     0.375000
Chlorpyrifos                 0.596154
Cocoa                        0.471154
Cow                          0.526923
Dairy_Health_Impact          0.581731
Dairy_Industry_Lie           0.509615
Deforestation_Agriculture    0.507692
Diabetes                     0.591346
Employment  

In [11]:
# Single DataFrame (the one stored under the "count_tw_csv" key) used for the one-file experiments below.
test_df = dfs["count_tw_csv"]

In [12]:
from itertools import combinations

# All pairs (step, per) with 2 <= step < per <= 28; combinations() always yields the lower number first.
comb = list(combinations(range(2, 29), 2))

# Keep only the relevant pairs: to predict a period of `step` days, the remaining (per - step) days
# used for training must be at least twice and at most five times as long as `step`.
rel_combinations = [
    (step, per)
    for step, per in comb
    if 2 * step <= per - step <= 5 * step
]

In [27]:
# NOTE(review): `sorted_avg_accuracy` is not assigned in any of the cells shown here — presumably it was
# computed in a cell that has since been removed. On a fresh kernel this cell raises NameError until that
# computation is restored. TODO confirm.
sorted_avg_accuracy

{(6, 19): 0.4913257997771762,
 (2, 10): 0.48991784914115005,
 (6, 28): 0.4898933630431324,
 (6, 27): 0.48957504376890004,
 (7, 27): 0.4893577296489918,
 (3, 11): 0.48901798503899396,
 (5, 18): 0.4888947998404044,
 (3, 12): 0.4883813464905296,
 (5, 23): 0.4883628142040166,
 (4, 12): 0.48778406059959445,
 (4, 13): 0.4875706817454388,
 (8, 25): 0.48754748839172657,
 (2, 11): 0.4875173370318998,
 (2, 12): 0.4873573028912834,
 (7, 26): 0.4873039581777446,
 (4, 14): 0.4870372346100499,
 (3, 13): 0.48671017030081176,
 (5, 28): 0.4866338608857561,
 (3, 10): 0.4866305904822537,
 (7, 25): 0.48599701269604173,
 (7, 24): 0.4854368932038835,
 (5, 16): 0.4851709003856896,
 (5, 26): 0.4849049075674957,
 (5, 17): 0.4849049075674956,
 (2, 9): 0.4847434119278777,
 (4, 21): 0.48458337778726096,
 (3, 14): 0.4843227757440712,
 (7, 28): 0.48412994772218076,
 (4, 18): 0.48383655179771656,
 (4, 17): 0.48372986237063864,
 (7, 21): 0.4835698282300223,
 (6, 18): 0.4825720197357948,
 (5, 21): 0.4823779757946537,


In [10]:
# Accuracy per column of test_df with 4-row windows: the first 2 rows fit the line, the last 2 rows test it.
z = get_df_matrix(test_df, accuracy=True, step_days=2, month_length=4, threshold=10, evaluate_trend=True)

In [11]:
# One row per evaluated column: its name and its accuracy.
z_df = pd.DataFrame(list(z.items()), columns=["Column", "Avg Accuracy"])
# "Column" holds strings, so restrict the mean to the numeric columns explicitly: since pandas 2.0
# `numeric_only` defaults to False and averaging the string column raises a TypeError.
print(z_df.mean(axis=0, numeric_only=True))

Avg Accuracy    0.500907
dtype: float64


In [None]:
# NOTE(review): looks like a leftover scratch cell (it carries no execution count) — candidate for removal.
print("a")