# Stability of the Temporal Labels

Last, we evaluated the stability of the labeling in time. We assign the labels to host from both training and testing datasets using the cutoff setting . We compare the labels of hosts in the training and testing dataset.

### Imports

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import ipaddress

from IPython.display import clear_output

import calendar

%matplotlib inline

### Load the dataset

In [2]:
file = '../dataset/Flows-anon.csv'
df = pd.read_csv(file, header=[0], index_col=[0])
df.index = pd.to_datetime(df.index)

df.head()

Unnamed: 0,133.250.0.0,133.250.0.1,133.250.0.2,133.250.0.3,133.250.0.4,133.250.0.5,133.250.0.6,133.250.0.7,133.250.0.8,133.250.0.9,...,133.250.255.246,133.250.255.247,133.250.255.248,133.250.255.249,133.250.255.250,133.250.255.251,133.250.255.252,133.250.255.253,133.250.255.254,133.250.255.255
2019-01-01 00:00:00+00:00,,,,,,,48.0,,,,...,,,,,,,,,,
2019-01-01 01:00:00+00:00,,,,,,,55.0,,,,...,,,,,,,,,,
2019-01-01 02:00:00+00:00,,,,,,,43.0,,,,...,,,,,,,,,,
2019-01-01 03:00:00+00:00,,,,,,1.0,43.0,,,,...,,,,,,,,,,
2019-01-01 04:00:00+00:00,,,,,,2.0,71.0,,,1.0,...,,,,,,,,,,


### Definitions

In [3]:
month_start = ['2019-01-01 00:00:00','2019-02-01 00:00:00','2019-03-01 00:00:00','2019-04-01 00:00:00','2019-05-01 00:00:00','2019-06-01 00:00:00','2019-07-01 00:00:00','2019-08-01 00:00:00','2019-09-01 00:00:00','2019-10-01 00:00:00','2019-11-01 00:00:00','2019-12-01 00:00:00']
month_end = ['2019-01-30 23:00:00','2019-02-28 23:00:00', '2019-03-31 23:00:00','2019-04-30 23:00:00','2019-05-31 23:00:00','2019-06-30 23:00:00','2019-07-31 23:00:00','2019-08-31 23:00:00', '2019-09-30 23:00:00','2019-10-31 23:00:00', '2019-11-30 23:00:00', '2019-12-31 23:00:00']

In [4]:
# List of missing observations and days with missing observations
def list_missing_days(df):
    missing_observations = df[df.isnull().all(axis=1)].index
    list_of_missing_days = []
    for obs in missing_observations:
        if obs.round(freq='D') not in list_of_missing_days:
            list_of_missing_days.append(obs.round(freq='D'))
    return list_of_missing_days

In [5]:
def by_day_mean(series):
    "Ignores days, where no communication has been done and do not add them into mean"
    sum = 0
    cnt = 0
    for i in series:
        if i != 0:
            sum = sum+i
            cnt += 1
    if cnt > 0:
        avg = sum/cnt
    else:
        avg = float('nan')
    return avg

def most_frequent_value(series):
    "Returns the most frequent value in the serie"
    return np.bincount(series).argmax()

def label_night_talker_only(row):
    if row['day_talker'] == 0 and row['night_talker']==1:
        return 1
    else: 
        return 0

def label_day_talker_only(row):
    if row['day_talker'] == 1 and row['night_talker']==0:
        return 1
    else: 
        return 0

def label_day_night_talker_unknown(row):
    if row['day_talker'] == 0 and row['night_talker']==0:
        return 1
    else: 
        return 0

    
def weighted_label_day_night_talker(series, hours, statistics):
    decision = 0
    for statistic in statistics:
        if series[statistic] >= hours:
            decision += 1
    if decision >= 3:
        return 1
    else:
        return 0

def weighted_label_business_weekend_talker(row, hours, days, statistics):
    decision = 0
    for statistic in statistics:
        days_talked_statistics = "days_talked_" + statistic
        if row[statistic] >= hours and row[days_talked_statistics]>= days:
            decision += 1
    
    if decision >= 3:
        return 1
    else:
        return 0
    
def labels_stability(row, characteristics):
    if row['train_'+ characteristics] == row['test_'+ characteristics]:
        return 1
    else:
        return 0 


In [6]:
def assign_labels_diurnal(df):
    df['hour'] = df.index.hour

    day_hours = df[(df['hour'] >= 6) & (df['hour'] < 18)]
    night_hours = df[(df['hour'] < 6) | (df['hour'] >= 18)]
    day_hours = day_hours.drop('hour',axis=1)
    night_hours = night_hours.drop('hour',axis=1)
    
    list_of_missing_days = list_missing_days(df)
    
    day_hours_agg = day_hours.resample('D').count()
    day_hours_agg = day_hours_agg.drop(list_of_missing_days)
    night_hours_agg = night_hours.resample('D').count()
    night_hours_agg = night_hours_agg.drop(list_of_missing_days)

    ip_characteristics = pd.DataFrame(index=day_hours_agg.columns)
    ip_characteristics['day_hours_by_day_mean'] = day_hours_agg.apply(by_day_mean)
    ip_characteristics['day_hours_median'] = day_hours_agg.median()
    ip_characteristics['day_hours_mean'] = day_hours_agg.mean()
    ip_characteristics['day_hours_most_frequent_value'] = day_hours_agg.apply(most_frequent_value)

    ip_characteristics['night_hours_by_day_mean'] = night_hours_agg.apply(by_day_mean)
    ip_characteristics['night_hours_median'] = night_hours_agg.median()
    ip_characteristics['night_hours_mean'] = night_hours_agg.mean()
    ip_characteristics['night_hours_most_frequent_value'] = night_hours_agg.apply(most_frequent_value)

    statistics_day = ['day_hours_by_day_mean', 'day_hours_mean', 'day_hours_median', 'day_hours_most_frequent_value']
    statistics_night = ['night_hours_by_day_mean', 'night_hours_mean', 'night_hours_median', 'night_hours_most_frequent_value']


    ip_characteristics['day_talker'] = ip_characteristics.apply(lambda row: weighted_label_day_night_talker(row, 6, statistics_day), axis =1)
    ip_characteristics['night_talker'] = ip_characteristics.apply(lambda row: weighted_label_day_night_talker(row, 5, statistics_night), axis =1)


    ip_characteristics['day_talker_only'] = ip_characteristics.apply(lambda row: label_day_talker_only(row), axis =1)
    ip_characteristics['night_talker_only'] = ip_characteristics.apply(lambda row: label_night_talker_only(row), axis =1)
    ip_characteristics['day_night_talker_unknown'] = ip_characteristics.apply(lambda row: label_day_night_talker_unknown(row), axis =1)

    return ip_characteristics[['day_talker', 'night_talker']]

In [7]:
def assing_labels_weekday(df):
    
    by_day = df.resample('D').count()
    by_day_business = by_day[by_day.index.dayofweek<5]
    by_day_weekend = by_day[by_day.index.dayofweek>=5]
    
    replaced_business = by_day_business.replace(0,float('NaN'))
    replaced_business = replaced_business.reset_index()
    replaced_business['week_number'] = replaced_business['index'].dt.week
    #replaced_business = replaced_business.drop([217,128])
    replaced_business = replaced_business.groupby(by='week_number').count()

    replaced_weekend = by_day_weekend.replace(0,float('NaN'))
    replaced_weekend = replaced_weekend.reset_index()
    replaced_weekend['week_number'] = replaced_weekend['index'].dt.week
    replaced_weekend = replaced_weekend.groupby(by='week_number').count()
    
    ip_characteristics = pd.DataFrame(index=by_day_business.columns)
    ip_characteristics['business_day_by_day_mean'] = by_day_business.apply(by_day_mean)
    ip_characteristics['business_day_median'] = by_day_business.median()
    ip_characteristics['business_day_mean'] = by_day_business.mean()
    ip_characteristics['business_day_most_frequent_value'] = by_day_business.apply(most_frequent_value)


    ip_characteristics['weekend_by_day_mean'] = by_day_weekend.apply(by_day_mean)
    ip_characteristics['weekend_median'] = by_day_weekend.median()
    ip_characteristics['weekend_mean'] = by_day_weekend.mean()
    ip_characteristics['weekend_most_frequent_value'] = by_day_weekend.apply(most_frequent_value)

    statistics_business = ['business_day_by_day_mean','business_day_median', 'business_day_mean', 'business_day_most_frequent_value']
    statistics_weekend = ['weekend_by_day_mean','weekend_median', 'weekend_mean', 'weekend_most_frequent_value']
    
    
    ip_characteristics['days_talked_business_day_by_day_mean'] = replaced_business.apply(by_day_mean)
    ip_characteristics['days_talked_business_day_median'] = replaced_business.median()
    ip_characteristics['days_talked_business_day_mean'] = replaced_business.mean()
    ip_characteristics['days_talked_business_day_most_frequent_value'] = replaced_business.apply(most_frequent_value)


    ip_characteristics['days_talked_weekend_by_day_mean'] = replaced_weekend.apply(by_day_mean)
    ip_characteristics['days_talked_weekend_median'] = replaced_weekend.median()
    ip_characteristics['days_talked_weekend_mean'] = replaced_weekend.mean()
    ip_characteristics['days_talked_weekend_most_frequent_value'] = replaced_weekend.apply(most_frequent_value)

    statistics_business_days_talked = ['days_talked_business_day_by_day_mean','days_talked_business_day_median', 'days_talked_business_day_mean', 'days_talked_business_day_most_frequent_value']
    statistics_weekend_days_talked = ['days_talked_weekend_by_day_mean','days_talked_weekend_median', 'days_talked_weekend_mean', 'days_talked_weekend_most_frequent_value']

    ip_characteristics['business_day_talker'] = ip_characteristics.apply(lambda row: weighted_label_business_weekend_talker(row,8,3,statistics_business), axis =1)
    ip_characteristics['weekend_talker'] = ip_characteristics.apply(lambda row: weighted_label_business_weekend_talker(row,8,1, statistics_weekend), axis =1)
    
    return ip_characteristics[['business_day_talker','weekend_talker']]

### Evaluate the label stability

#### Day night pattern

Assign diurnal labels to the training and testing dataset

In [8]:
labels_training = assign_labels_diurnal(df[month_start[0]:month_end[5]])
labels_testing = assign_labels_diurnal(df[month_start[9]:month_end[10]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
# Join the results of labeling
labels_training = labels_training.add_prefix('train_')
labels_testing= labels_testing.add_prefix('test_')
result = pd.concat([labels_training, labels_testing], axis=1)

In [10]:
# Print counts of labels
for column in result.columns:
    print("\nLabels: ", column)
    print(result[column].value_counts())


Labels:  train_day_talker
0    54199
1    11337
Name: train_day_talker, dtype: int64

Labels:  train_night_talker
0    58522
1     7014
Name: train_night_talker, dtype: int64

Labels:  test_day_talker
0    56230
1     9306
Name: test_day_talker, dtype: int64

Labels:  test_night_talker
0    59113
1     6423
Name: test_night_talker, dtype: int64


In [11]:
# Evaluate the label stability
result['stability_night'] = result.apply(lambda row: labels_stability(row,'night_talker') ,axis=1)
result['stability_day'] = result.apply(lambda row: labels_stability(row,'day_talker') ,axis=1)

print("\nNight Talker = 1:\n", result[result['train_night_talker']==1]['stability_night'].value_counts())
print("\nNight Talker = 0:\n",result[result['train_night_talker']==0]['stability_night'].value_counts())
print("\nDay Talker = 1:\n",result[result['train_day_talker']==1]['stability_day'].value_counts())
print("\nDay Talker = 0:\n",result[result['train_day_talker']==0]['stability_day'].value_counts())


Night Talker = 1:
 1    5556
0    1458
Name: stability_night, dtype: int64

Night Talker = 0:
 1    57655
0      867
Name: stability_night, dtype: int64

Day Talker = 1:
 1    8488
0    2849
Name: stability_day, dtype: int64

Day Talker = 0:
 1    53381
0      818
Name: stability_day, dtype: int64


In [12]:
from sklearn.metrics import  classification_report
print(classification_report(result['train_day_talker'], result['test_day_talker']))
print(classification_report(result['train_night_talker'], result['test_night_talker']))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97     54199
           1       0.91      0.75      0.82     11337

    accuracy                           0.94     65536
   macro avg       0.93      0.87      0.89     65536
weighted avg       0.94      0.94      0.94     65536

              precision    recall  f1-score   support

           0       0.98      0.99      0.98     58522
           1       0.87      0.79      0.83      7014

    accuracy                           0.96     65536
   macro avg       0.92      0.89      0.90     65536
weighted avg       0.96      0.96      0.96     65536



#### Business day/ weekend pattern

Assign labels to the training and testing dataset

In [13]:
labels_training = assing_labels_weekday(df[month_start[0]:month_end[5]])
labels_testing = assing_labels_weekday(df[month_start[9]:month_end[10]])

In [14]:
# Join the results of labeling
labels_training = labels_training.add_prefix('train_')
labels_testing= labels_testing.add_prefix('test_')
result = pd.concat([labels_training, labels_testing], axis=1)

In [15]:
# Print counts of labels
for column in result.columns:
    print("\nLabels: ", column)
    print(result[column].value_counts())


Labels:  train_business_day_talker
0    49850
1    15686
Name: train_business_day_talker, dtype: int64

Labels:  train_weekend_talker
0    57527
1     8009
Name: train_weekend_talker, dtype: int64

Labels:  test_business_day_talker
0    51340
1    14196
Name: test_business_day_talker, dtype: int64

Labels:  test_weekend_talker
0    59227
1     6309
Name: test_weekend_talker, dtype: int64


In [16]:
# Evaluate the label stability
result['stability_business'] = result.apply(lambda row: labels_stability(row,'business_day_talker') ,axis=1)
result['stability_weekend'] = result.apply(lambda row: labels_stability(row,'weekend_talker') ,axis=1)

print(result[result['train_business_day_talker']==1]['stability_business'].value_counts())
print(result[result['train_business_day_talker']==0]['stability_business'].value_counts())
print(result[result['train_weekend_talker']==1]['stability_weekend'].value_counts())
print(result[result['train_weekend_talker']==0]['stability_weekend'].value_counts())

1    11606
0     4080
Name: stability_business, dtype: int64
1    47260
0     2590
Name: stability_business, dtype: int64
1    5391
0    2618
Name: stability_weekend, dtype: int64
1    56609
0      918
Name: stability_weekend, dtype: int64


In [17]:
from sklearn.metrics import  classification_report
print(classification_report(result['train_business_day_talker'], result['test_business_day_talker']))
print(classification_report(result['train_weekend_talker'], result['test_weekend_talker']))

              precision    recall  f1-score   support

           0       0.92      0.95      0.93     49850
           1       0.82      0.74      0.78     15686

    accuracy                           0.90     65536
   macro avg       0.87      0.84      0.86     65536
weighted avg       0.90      0.90      0.90     65536

              precision    recall  f1-score   support

           0       0.96      0.98      0.97     57527
           1       0.85      0.67      0.75      8009

    accuracy                           0.95     65536
   macro avg       0.91      0.83      0.86     65536
weighted avg       0.94      0.95      0.94     65536

