# F1-scores

Import modules and load data from CSV

In [1]:
import os
import pandas as pd
import geopandas as gpd
from scipy import stats
from sklearn.metrics import classification_report, f1_score, precision_recall_fscore_support, accuracy_score

# Folder with the validation results; the combined per-user table is read as a semicolon-separated CSV
folder = r"./../valid_results"
results_csv = os.path.join(folder, "results_combined_by_user.csv")
df = pd.read_csv(results_csv, sep=";")

Filter the data to include only those rows with expert annotations.

In [2]:
# Drop every row where either expert annotation column is missing
df = df.dropna(subset=['expert_1', 'expert_2'])

Get only rows where the expert annotators agree on the classification.

In [3]:
# Boolean mask: True where both experts assigned the same label
experts_agree = df['expert_1'].eq(df['expert_2'])
df = df[experts_agree]

Check number of rows

In [4]:
# Number of rows left after the two filtering steps above
len(df)

375

In [5]:
# Drop columns not needed for calculations (expert 2 == expert 1 == the "true" values)
# After this step get_f1 scores every remaining column except expert_1 as a prediction column,
# so no non-prediction column may be left in the frame.
df = df.drop(columns=["userid","expert_2"])

In [6]:
# Preview the first rows: one column of predicted labels per method, plus the expert label
df.head()

Unnamed: 0,B_CircleCentroids,B_dbscan_500_km,B_EllipseCentroids,B_maxdays,B_maxmonths,B_maxposts,B_maxtimedelta,B_maxweeks,B_MeanCenters,B_MedianCenters,...,H_dbscan_500km,H_EllipseCentroids,H_maxdays,H_maxmonths,H_maxposts,H_maxtimedelta,H_maxweeks,H_MeanCenters,H_MedianCenters,expert_1
5,SF,SF,SF,SF,SF,SF,SF,SF,SF,SF,...,SF,SF,SF,SF,SF,SF,SF,SF,SF,SF
10,SF,SF,SF,SF,SF,SF,SF,SF,SF,SF,...,SF,SF,SF,SF,SF,SF,SF,SF,SF,SF
12,FR,SP,FR,SP,SP,SP,SP,SP,FR,SP,...,SP,SP,SP,SP,SP,SP,SP,SP,SP,SP
20,SF,SF,SF,SF,SF,SF,SF,SF,SF,SF,...,SF,SF,SF,SF,SF,SF,SF,SF,SF,SF
22,BD,US,BD,US,US,US,US,US,BD,US,...,US,US,US,US,US,US,US,US,US,US


## Compute the F1-scores

Example classification report for one method

In [7]:
# Per-label precision / recall / f1 / support for the B_maxdays predictions against the expert labels.
# zero_division=0 is the documented numeric value (and the one used by the other metric calls in this
# notebook); it reports 0.0 for labels with no predicted or no true samples.
# classification_report returns a multi-line string, so print() is needed to render it as a table.
print(classification_report(df['expert_1'].tolist(), df['B_maxdays'].tolist(), zero_division=0))

              precision    recall  f1-score   support

          AE       0.00      0.00      0.00         0
          AR       1.00      1.00      1.00         1
          AS       0.86      0.92      0.89        13
          BR       0.92      0.86      0.89        14
          CA       0.80      1.00      0.89         4
          CE       0.00      0.00      0.00         0
          CH       0.00      0.00      0.00         3
          CO       0.00      0.00      0.00         1
          DA       1.00      1.00      1.00         2
          EI       1.00      1.00      1.00         2
          EZ       1.00      1.00      1.00         1
          FR       0.83      1.00      0.91         5
          GJ       1.00      1.00      1.00         1
      GLOBAL       0.00      0.00      0.00         3
          GM       1.00      0.50      0.67         6
          GR       0.00      0.00      0.00         0
          HK       1.00      1.00      1.00         1
          IC       0.00    

**Calculate f1-score for all methods**
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html

- `micro`: *Calculate metrics globally by counting the total true positives, false negatives and false positives.*

- `macro`: *Calculate metrics for each label, and find their unweighted mean. This does not take label imbalance into account.*

- `weighted`: *Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label). This alters ‘macro’ to account for label imbalance; it can result in an F-score that is not between precision and recall.*


**Calculate Precision & recall**

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_fscore_support.html#sklearn.metrics.precision_recall_fscore_support

returns (precision, recall, fbeta_score, support)


 

In [8]:
# Calculate f-score only:

# Apply the f1_score calculation with the expert assessment as true values, and each column as predicted values
#micro_avg = df.drop(columns=["expert_1"]).apply(lambda x: f1_score(df['expert_1'].tolist(), x.tolist(), average='micro', zero_division=0), axis=0)
#macro_avg = df.drop(columns=["expert_1"]).apply(lambda x: f1_score(df['expert_1'].tolist(), x.tolist(), average='macro', zero_division=0), axis=0)
#weighted_avg = df.drop(columns=["expert_1"]).apply(lambda x: f1_score(df['expert_1'].tolist(), x.tolist(), average='weighted', zero_division=0), axis=0)

In [9]:
# Method for calculating f1-score averages for all methods, and related precision and recall

def get_f1(dataframe, average_type="macro", true_column="expert_1"):
    """Get the averaged F1, precision and recall of every prediction column.

    Every column of ``dataframe`` except ``true_column`` is treated as a set of
    predicted labels and scored against ``true_column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        One column of true labels (``true_column``) and one column of
        predicted labels per method.
    average_type : str
        Forwarded to scikit-learn as ``average`` (this notebook uses
        "micro", "macro" and "weighted").
    true_column : str
        Name of the column holding the true labels.

    Returns
    -------
    pandas.DataFrame
        One row per prediction column (indexed by column name) with the columns
        ``n<rows>_TYPE_F1_``, ``n<rows>_TYPE_Pr_`` and ``n<rows>_TYPE_Re_``,
        sorted by the F1 column in descending order. ``<rows>`` is
        ``len(dataframe)``.
    """
    # The true labels are the same for every method, so build the list once
    y_true = dataframe[true_column].tolist()
    predictions = dataframe.drop(columns=[true_column])

    f1, precision, recall = {}, {}, {}
    for method in predictions.columns:
        # A single call returns (precision, recall, fbeta_score, support); beta defaults
        # to 1.0, so the third value is the F1 score that f1_score would return.
        method_precision, method_recall, method_f1, _ = precision_recall_fscore_support(
            y_true,
            predictions[method].tolist(),
            average=average_type,
            zero_division=0,
        )
        f1[method] = method_f1
        precision[method] = method_precision
        recall[method] = method_recall

    # Combine scores into one dataframe
    # TYPE will be replaced in a later step with basic/hierarchical for alphabetical sorting purposes
    prefix = 'n{}_TYPE_'.format(str(len(dataframe)))
    scores = pd.DataFrame({prefix + 'F1_': pd.Series(f1, dtype=float),
                           prefix + 'Pr_': pd.Series(precision, dtype=float),
                           prefix + 'Re_': pd.Series(recall, dtype=float)})

    # Best F1 first; returns a new frame instead of sorting in place
    return scores.sort_values(by=scores.columns[0], ascending=False)


In [10]:
# Support-weighted average scores for all users in df
weighted_avg_all = get_f1(df, average_type="weighted", true_column="expert_1")
weighted_avg_all

Unnamed: 0,n375_TYPE_F1_,n375_TYPE_Pr_,n375_TYPE_Re_
B_maxmonths,0.916217,0.915307,0.92
H_maxmonths,0.915853,0.91744,0.917333
H_maxweeks,0.907556,0.909014,0.909333
B_maxdays,0.907287,0.912243,0.906667
H_maxdays,0.905964,0.916847,0.898667
B_maxweeks,0.90392,0.906211,0.906667
H_maxposts,0.89138,0.901614,0.885333
B_maxposts,0.89044,0.896367,0.890667
B_maxtimedelta,0.878596,0.88391,0.885333
H_maxtimedelta,0.872827,0.884548,0.874667


In [11]:
# Unweighted (macro) average scores for all users in df
macro_avg_all = get_f1(df, average_type="macro", true_column="expert_1")
macro_avg_all

Unnamed: 0,n375_TYPE_F1_,n375_TYPE_Pr_,n375_TYPE_Re_
B_maxmonths,0.696804,0.690731,0.717977
B_maxweeks,0.653813,0.647967,0.674035
B_maxtimedelta,0.652324,0.653433,0.683981
H_maxmonths,0.646735,0.641839,0.665741
H_maxweeks,0.626859,0.617331,0.648049
B_maxdays,0.614785,0.615139,0.624083
B_maxposts,0.593689,0.603459,0.598832
H_maxtimedelta,0.580292,0.585785,0.610674
H_maxposts,0.573858,0.582399,0.579194
H_maxdays,0.570203,0.564412,0.589338


Remove South Africa from the data, and compute f1-scores:

In [12]:
# Keep every user whose expert label is not South Africa ("SF")
nosf = df[df["expert_1"].ne("SF")]

# Support-weighted scores on the non-SF subset
weighted_avg_nosf = get_f1(nosf, average_type="weighted", true_column="expert_1")
weighted_avg_nosf

Unnamed: 0,n203_TYPE_F1_,n203_TYPE_Pr_,n203_TYPE_Re_
B_maxmonths,0.881978,0.891662,0.876847
H_maxweeks,0.879367,0.892819,0.871921
H_maxmonths,0.879175,0.891844,0.871921
B_maxdays,0.876459,0.893263,0.866995
H_maxdays,0.873452,0.89621,0.857143
B_maxweeks,0.86974,0.886106,0.862069
H_maxposts,0.855985,0.882459,0.837438
B_maxposts,0.854693,0.877043,0.842365
H_dbscan_500km,0.830468,0.860958,0.807882
B_dbscan_500_km,0.819726,0.855418,0.793103


In [13]:
# Keep every user whose expert label is not South Africa ("SF")
nosf = df[df["expert_1"].ne("SF")]

# Macro-averaged scores on the non-SF subset
macro_avg_nosf = get_f1(nosf, average_type="macro", true_column="expert_1")
macro_avg_nosf

Unnamed: 0,n203_TYPE_F1_,n203_TYPE_Pr_,n203_TYPE_Re_
B_maxmonths,0.677945,0.673603,0.693082
H_maxmonths,0.644457,0.641143,0.658297
B_maxweeks,0.639936,0.639952,0.650052
B_maxtimedelta,0.629253,0.632221,0.659085
H_maxweeks,0.628812,0.625044,0.640599
B_maxdays,0.618665,0.626798,0.61624
B_maxposts,0.599907,0.619833,0.590526
H_maxdays,0.582075,0.57957,0.594588
H_maxposts,0.573488,0.58643,0.570952
H_maxtimedelta,0.558718,0.565925,0.587557


In [14]:
# NOTE(review): this cell runs the same call on the same non-SF subset as the cell that builds
# weighted_avg_nosf, but stores the result as `weighted_avg_sf`. Either the SF-only subset was
# intended here, or the cell is a leftover duplicate - confirm. `weighted_avg_sf` is not
# referenced by any later cell visible in this notebook.
# All users except those from South Africa
nosf = df.loc[df["expert_1"]!="SF"]

weighted_avg_sf = get_f1(nosf, average_type="weighted", true_column="expert_1")
weighted_avg_sf 

Unnamed: 0,n203_TYPE_F1_,n203_TYPE_Pr_,n203_TYPE_Re_
B_maxmonths,0.881978,0.891662,0.876847
H_maxweeks,0.879367,0.892819,0.871921
H_maxmonths,0.879175,0.891844,0.871921
B_maxdays,0.876459,0.893263,0.866995
H_maxdays,0.873452,0.89621,0.857143
B_maxweeks,0.86974,0.886106,0.862069
H_maxposts,0.855985,0.882459,0.837438
B_maxposts,0.854693,0.877043,0.842365
H_dbscan_500km,0.830468,0.860958,0.807882
B_dbscan_500_km,0.819726,0.855418,0.793103


Remove all others but South Africa from the data, and compute f1-scores:

In [15]:
# Only users whose expert label is South Africa ("SF")
sf = df[df["expert_1"].eq("SF")]

# NOTE! Micro average (not macro/weighted as for the other subsets)!
micro_avg_sf = get_f1(sf, average_type="micro", true_column="expert_1")
micro_avg_sf

Unnamed: 0,n172_TYPE_F1_,n172_TYPE_Pr_,n172_TYPE_Re_
H_maxmonths,0.97093,0.97093,0.97093
B_maxmonths,0.97093,0.97093,0.97093
B_maxtimedelta,0.97093,0.97093,0.97093
H_maxtimedelta,0.97093,0.97093,0.97093
B_maxweeks,0.959302,0.959302,0.959302
B_maxdays,0.953488,0.953488,0.953488
H_maxweeks,0.953488,0.953488,0.953488
B_maxposts,0.947674,0.947674,0.947674
H_maxdays,0.947674,0.947674,0.947674
H_MedianCenters,0.94186,0.94186,0.94186


In [16]:
# Only users whose expert label is South Africa ("SF")
sf = df[df["expert_1"].eq("SF")]

# Macro-averaged scores on the SF-only subset
macro_avg_sf = get_f1(sf, average_type="macro", true_column="expert_1")
macro_avg_sf

Unnamed: 0,n172_TYPE_F1_,n172_TYPE_Pr_,n172_TYPE_Re_
B_maxmonths,0.19705,0.2,0.194186
B_maxtimedelta,0.19705,0.2,0.194186
H_maxtimedelta,0.19705,0.2,0.194186
H_maxmonths,0.164208,0.166667,0.161822
B_maxweeks,0.163205,0.166667,0.159884
H_maxweeks,0.139456,0.142857,0.136213
B_maxdays,0.122024,0.125,0.119186
H_maxdays,0.108126,0.111111,0.105297
B_maxposts,0.108126,0.111111,0.105297
H_MedianCenters,0.107784,0.111111,0.104651


### Combine results

In [17]:
# Join the weighted scores of all users and of the non-SF subset on the method-name index.
# lsuffix/rsuffix are only applied to column names present in both frames; the score columns
# already carry different n<rows>_ prefixes, so the names come through unchanged (see output below).
weighted_avgs = weighted_avg_all.join(weighted_avg_nosf, 
                                lsuffix="All_users_n375", 
                                rsuffix="Excluding_SF_n203")

In [18]:
# Preview the joined weighted scores
weighted_avgs.head()

Unnamed: 0,n375_TYPE_F1_,n375_TYPE_Pr_,n375_TYPE_Re_,n203_TYPE_F1_,n203_TYPE_Pr_,n203_TYPE_Re_
B_maxmonths,0.916217,0.915307,0.92,0.881978,0.891662,0.876847
H_maxmonths,0.915853,0.91744,0.917333,0.879175,0.891844,0.871921
H_maxweeks,0.907556,0.909014,0.909333,0.879367,0.892819,0.871921
B_maxdays,0.907287,0.912243,0.906667,0.876459,0.893263,0.866995
H_maxdays,0.905964,0.916847,0.898667,0.873452,0.89621,0.857143


In [19]:
# Macro scores side by side: all users, then users excluding SF (joined on the method-name index)
macro_avgs = macro_avg_all.join(macro_avg_nosf)

In [20]:
macro_and_micro = macro_avgs.join(micro_avg_sf) 

### Place basic and hierarchical methods into separate columns

In [21]:
def detect_method_type(df, method_type="B_"):
    """Select the rows of one method family and relabel them without the family prefix.

    Parameters
    ----------
    df : pandas.DataFrame
        Score table indexed by method name (e.g. "B_maxdays", "H_maxdays"),
        with "TYPE_" placeholders in the column names.
    method_type : str
        Index prefix of the family to extract ("B_" or "H_" in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame with only the rows whose index starts with ``method_type``.
        The prefix is removed from the index, "500km" is normalised to
        "500_km" (so "B_dbscan_500_km" and "H_dbscan_500km" get the same
        label), and "TYPE_" in each column name is replaced by ``method_type``.
        The input frame is not modified.
    """
    # Find all rows with basic/hierarchical methods
    methods = [name for name in df.index.values if name.startswith(method_type)]

    # Work on an explicit copy so relabelling never touches the caller's frame
    selected = df.loc[methods].copy()

    # Remove the method identifier by slicing off the prefix. str.strip() would treat the
    # prefix as a set of characters and also eat matching characters at the end of the
    # name (and any further ones at the start).
    prefix_length = len(method_type)
    selected.index = [name[prefix_length:].replace("500km", "500_km") for name in methods]

    selected.columns = [column.replace("TYPE_", method_type) for column in selected.columns.values]

    return selected

In [22]:
# Split the macro/micro table into basic (B_) and hierarchical (H_) rows with the prefix removed
basic = detect_method_type(macro_and_micro, "B_")
hierarchical = detect_method_type(macro_and_micro, "H_")

# Same method names now index both frames, so the join puts B_ and H_ scores side by side
micro_macro_results = basic.join(hierarchical)

In [23]:
# Display the combined macro/micro table
micro_macro_results

Unnamed: 0,n375_B_F1_,n375_B_Pr_,n375_B_Re_,n203_B_F1_,n203_B_Pr_,n203_B_Re_,n172_B_F1_,n172_B_Pr_,n172_B_Re_,n375_H_F1_,n375_H_Pr_,n375_H_Re_,n203_H_F1_,n203_H_Pr_,n203_H_Re_,n172_H_F1_,n172_H_Pr_,n172_H_Re_
maxmonths,0.696804,0.690731,0.717977,0.677945,0.673603,0.693082,0.97093,0.97093,0.97093,0.646735,0.641839,0.665741,0.644457,0.641143,0.658297,0.97093,0.97093,0.97093
maxweeks,0.653813,0.647967,0.674035,0.639936,0.639952,0.650052,0.959302,0.959302,0.959302,0.626859,0.617331,0.648049,0.628812,0.625044,0.640599,0.953488,0.953488,0.953488
maxtimedelta,0.652324,0.653433,0.683981,0.629253,0.632221,0.659085,0.97093,0.97093,0.97093,0.580292,0.585785,0.610674,0.558718,0.565925,0.587557,0.97093,0.97093,0.97093
maxdays,0.614785,0.615139,0.624083,0.618665,0.626798,0.61624,0.953488,0.953488,0.953488,0.570203,0.564412,0.589338,0.582075,0.57957,0.594588,0.947674,0.947674,0.947674
maxposts,0.593689,0.603459,0.598832,0.599907,0.619833,0.590526,0.947674,0.947674,0.947674,0.573858,0.582399,0.579194,0.573488,0.58643,0.570952,0.94186,0.94186,0.94186
dbscan_500_km,0.54619,0.548911,0.561167,0.542068,0.555187,0.540236,0.94186,0.94186,0.94186,0.498982,0.497675,0.51321,0.5241,0.539003,0.515214,0.930233,0.930233,0.930233
MedianCenters,0.334664,0.333936,0.349594,0.378866,0.383309,0.386991,0.889535,0.889535,0.889535,0.539251,0.536098,0.556705,0.537828,0.539752,0.547951,0.94186,0.94186,0.94186
MeanCenters,0.066267,0.082155,0.060855,0.072201,0.091666,0.065854,0.633721,0.633721,0.633721,0.249381,0.284545,0.25735,0.253358,0.293611,0.254362,0.924419,0.924419,0.924419
EllipseCentroids,0.066267,0.082155,0.060855,0.072201,0.091666,0.065854,0.633721,0.633721,0.633721,0.272244,0.321228,0.266836,0.273709,0.330143,0.259137,0.918605,0.918605,0.918605
CircleCentroids,0.066267,0.082155,0.060855,0.072201,0.091666,0.065854,0.633721,0.633721,0.633721,0.260877,0.305747,0.261911,0.261806,0.313866,0.254056,0.924419,0.924419,0.924419


In [24]:
# Check that columns sort in the correct order:
# (alphabetical order groups by the n<rows> prefix first, then B before H; display only, nothing is assigned)
micro_macro_results.columns.sort_values()

Index(['n172_B_F1_', 'n172_B_Pr_', 'n172_B_Re_', 'n172_H_F1_', 'n172_H_Pr_',
       'n172_H_Re_', 'n203_B_F1_', 'n203_B_Pr_', 'n203_B_Re_', 'n203_H_F1_',
       'n203_H_Pr_', 'n203_H_Re_', 'n375_B_F1_', 'n375_B_Pr_', 'n375_B_Re_',
       'n375_H_F1_', 'n375_H_Pr_', 'n375_H_Re_'],
      dtype='object')

In [25]:
# Re-order columns (basic - hierarchical / basic - hierarchical )
# Selecting with the sorted column index returns the frame with its columns in alphabetical order
micro_macro_results = micro_macro_results[micro_macro_results.columns.sort_values()]

In [26]:
# Re-organize columns ALL SAMPLE - NO SA - ONLY SA
# The column names are hard-coded, including the row counts (n375 / n203 / n172); if the
# filtered sample sizes change, these names must be updated or the selection raises a KeyError.
micro_macro_results = micro_macro_results[['n375_B_F1_', 'n375_B_Pr_', 'n375_B_Re_', 'n375_H_F1_','n375_H_Pr_', 'n375_H_Re_', 
                                           'n203_B_F1_', 'n203_B_Pr_', 'n203_B_Re_', 'n203_H_F1_', 'n203_H_Pr_', 'n203_H_Re_',
                                           'n172_B_F1_', 'n172_B_Pr_', 'n172_B_Re_', 'n172_H_F1_','n172_H_Pr_', 'n172_H_Re_'
                                           ]]

In [27]:
# Preview the re-ordered macro/micro table
micro_macro_results.head()

Unnamed: 0,n375_B_F1_,n375_B_Pr_,n375_B_Re_,n375_H_F1_,n375_H_Pr_,n375_H_Re_,n203_B_F1_,n203_B_Pr_,n203_B_Re_,n203_H_F1_,n203_H_Pr_,n203_H_Re_,n172_B_F1_,n172_B_Pr_,n172_B_Re_,n172_H_F1_,n172_H_Pr_,n172_H_Re_
maxmonths,0.696804,0.690731,0.717977,0.646735,0.641839,0.665741,0.677945,0.673603,0.693082,0.644457,0.641143,0.658297,0.97093,0.97093,0.97093,0.97093,0.97093,0.97093
maxweeks,0.653813,0.647967,0.674035,0.626859,0.617331,0.648049,0.639936,0.639952,0.650052,0.628812,0.625044,0.640599,0.959302,0.959302,0.959302,0.953488,0.953488,0.953488
maxtimedelta,0.652324,0.653433,0.683981,0.580292,0.585785,0.610674,0.629253,0.632221,0.659085,0.558718,0.565925,0.587557,0.97093,0.97093,0.97093,0.97093,0.97093,0.97093
maxdays,0.614785,0.615139,0.624083,0.570203,0.564412,0.589338,0.618665,0.626798,0.61624,0.582075,0.57957,0.594588,0.953488,0.953488,0.953488,0.947674,0.947674,0.947674
maxposts,0.593689,0.603459,0.598832,0.573858,0.582399,0.579194,0.599907,0.619833,0.590526,0.573488,0.58643,0.570952,0.947674,0.947674,0.947674,0.94186,0.94186,0.94186


In [28]:
# Same split as for the macro/micro table; note that `basic` and `hierarchical` are re-bound here
basic = detect_method_type(weighted_avgs, "B_")
hierarchical = detect_method_type(weighted_avgs, "H_")

# lsuffix/rsuffix only apply to overlapping column names; the columns already differ by
# their B_ / H_ marker, so the names come through unchanged (see the output further below)
weighted_results = basic.join(hierarchical, lsuffix= "basic", rsuffix="hierarchical")

In [29]:
# Put the columns in alphabetical order
weighted_results = weighted_results[weighted_results.columns.sort_values()]

In [30]:
# Explicit final order: all users (n375) first, then users excluding SF (n203).
# The hard-coded row counts in the names must match the current sample sizes.
weighted_results = weighted_results[['n375_B_F1_', 'n375_B_Pr_','n375_B_Re_', 'n375_H_F1_', 'n375_H_Pr_', 'n375_H_Re_',
                                    'n203_B_F1_', 'n203_B_Pr_', 'n203_B_Re_', 'n203_H_F1_','n203_H_Pr_', 'n203_H_Re_', ]]

In [31]:
# Preview the re-ordered weighted table
weighted_results.head()

Unnamed: 0,n375_B_F1_,n375_B_Pr_,n375_B_Re_,n375_H_F1_,n375_H_Pr_,n375_H_Re_,n203_B_F1_,n203_B_Pr_,n203_B_Re_,n203_H_F1_,n203_H_Pr_,n203_H_Re_
maxmonths,0.916217,0.915307,0.92,0.915853,0.91744,0.917333,0.881978,0.891662,0.876847,0.879175,0.891844,0.871921
maxdays,0.907287,0.912243,0.906667,0.905964,0.916847,0.898667,0.876459,0.893263,0.866995,0.873452,0.89621,0.857143
maxweeks,0.90392,0.906211,0.906667,0.907556,0.909014,0.909333,0.86974,0.886106,0.862069,0.879367,0.892819,0.871921
maxposts,0.89044,0.896367,0.890667,0.89138,0.901614,0.885333,0.854693,0.877043,0.842365,0.855985,0.882459,0.837438
maxtimedelta,0.878596,0.88391,0.885333,0.872827,0.884548,0.874667,0.818181,0.846131,0.812808,0.802774,0.838329,0.793103


## Results to file

Write the results to CSV, rounding the values to two decimals:

In [32]:
#macro_results.to_csv(os.path.join(folder, "f1-scores-macro_avg.csv"), 
#                        sep=";", 
#                        float_format='%.2f')

In [33]:
# Write the weighted scores next to the input data as a semicolon-separated file;
# float_format='%.2f' writes every float rounded to two decimals
weighted_results.to_csv(os.path.join(folder, "f1-scores-weighted_avg.csv"), 
                        sep=";", 
                        float_format='%.2f')

In [34]:
# Write the macro/micro scores next to the input data as a semicolon-separated file;
# float_format='%.2f' writes every float rounded to two decimals
micro_macro_results.to_csv(os.path.join(folder, "f1-scores-micro_macro.csv"), 
                        sep=";", 
                        float_format='%.2f')