In [1]:
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [2]:
import pandas as pd
import numpy as np
from imblearn.under_sampling import RandomUnderSampler
# from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn.model_selection import cross_val_score
import pickle
import pprint
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [3]:
# Directory prefix (with trailing slash) shared by every input file below.
root = "prototype/"

# ss_features.csv is read into `features`; its `feature` column supplies the
# model input columns used by the cells further down.
feature_file = f"{root}ss_features.csv"
features = pd.read_csv(feature_file)
feature_list = features["feature"].tolist()
feature_list

['receiver_avg_waittime_md',
 'receiver_ost_read',
 'receiver_ost_write',
 'receiver_req_active',
 'receiver_seg_out',
 'receiver_segs_in',
 'receiver_system_cpu_percent',
 'receiver_system_memory_percent',
 'sender_avg_waittime_md',
 'sender_cwnd_rate',
 'sender_mds_close_md',
 'sender_ost_read',
 'sender_remote_ost_read_bytes',
 'sender_req_active',
 'sender_req_waittime',
 'sender_req_waittime_md',
 'sender_retrans',
 'sender_segs_in',
 'sender_send_buffer_value',
 'sender_ssthresh_value',
 'sender_system_cpu_percent',
 'sender_system_memory_percent',
 'sender_tcp_snd_buffer_max',
 'sender_write_bytes_io']

In [4]:
# Dataset key -> merged CSV file name (resolved relative to `root` when read).
filenames = dict(
    utah_ssd="UTAH_25Gbps_ssd_merged.csv",
    # hpcn_ssd="hpcn_1Gbps_ssd_merged.csv",  # currently disabled
    wiscon_ssd="WISCONSIN_10Gbps_ssd_merged.csv",
    wiscon_hdd="WISCONSIN_10Gbps_hdd_merged.csv",
)

# Filled by read_file(): dataset key -> loaded DataFrame.
data = dict()

In [5]:
def read_file(transform=False):
    """Load every CSV listed in ``filenames`` into the global ``data`` dict.

    For each dataset the rows are sorted by ``label_value``, missing numeric
    values are imputed with the mean of the same column within the same
    ``label_value`` group, and any NaN/+-inf that is still left is set to 0.

    Parameters
    ----------
    transform : bool, default False
        When True, each column named in ``features['feature']`` is divided by
        the column named in the matching ``features['normalizer']`` entry.

    Returns
    -------
    None
        The cleaned frames are stored in ``data[key]`` as a side effect.
    """
    global data, filenames

    for key, filename in filenames.items():
        df = pd.read_csv(root + filename)
        df = df.sort_values(by=['label_value']).reset_index(drop=True)

        # Impute per label group. transform("mean") returns a frame with the
        # same index as df, so fillna lines every row up with the mean of its
        # own group. Passing the aggregated groupby(...).mean() frame instead
        # would align on row position 0..k-1 and fill the wrong rows.
        numeric_cols = df.select_dtypes(include="number").columns.drop(
            "label_value", errors="ignore"
        )
        cols_with_nan = [col for col in numeric_cols if df[col].isna().any()]
        if cols_with_nan:
            group_means = df.groupby("label_value")[cols_with_nan].transform("mean")
            df[cols_with_nan] = df[cols_with_nan].fillna(group_means)

        if transform:
            for feature, normalizer in zip(features['feature'], features['normalizer']):
                df[feature] = df[feature] / df[normalizer]

        # Division by zero above can produce +-inf; groups with no observed
        # value keep NaN. Both are mapped to 0 here.
        df = df.replace([np.inf, -np.inf, np.nan], 0)
        data[key] = df

# Populate `data` with one frame per entry in `filenames`; `transform` keeps its
# default of False, so no feature/normalizer division is applied.
read_file()

## Cross-Validation

In [21]:
# 5-fold cross-validation of a 100-tree random forest on each dataset, using
# only the columns in `feature_list`. Scores come from the estimator's default
# scorer and are rounded to 4 decimals before the mean/std are taken.
for key in data:
    df = data[key].copy()
    X, y = df[feature_list], df.label_value
    clf = make_pipeline(RandomForestClassifier(n_estimators=100))
    cv_res = np.round(cross_val_score(clf, X, y, cv=5), 4)
    mean_pct = np.round(np.mean(cv_res) * 100, 2)
    std_pct = np.round(np.std(cv_res) * 100, 2)
    print(f'{key} ==>{cv_res}')
    print(f'{key} ==> mean: {mean_pct}, std: {std_pct}')

utah_ssd ==>[0.9424 0.992  0.9909 0.9789 0.9914]
utah_ssd ==> mean: 97.91, std: 1.9
wiscon_ssd ==>[0.9479 0.9789 0.9865 0.9772 0.9924]
wiscon_ssd ==> mean: 97.66, std: 1.53
wiscon_hdd ==>[0.95   0.9916 0.9777 0.971  0.9753]
wiscon_hdd ==> mean: 97.31, std: 1.35


## Evaluation

In [18]:
# Hold-out evaluation per dataset: take a random half of the rows, split it
# into train/test with train_test_split's defaults, under-sample the training
# split, fit a 100-tree random forest and print the test accuracy in percent.
# NOTE(review): no random_state is passed to sample / train_test_split /
# RandomUnderSampler / RandomForestClassifier, so the printed numbers differ
# from run to run.
for key in data:
    df = data[key].copy()
    df = df.sample(frac=0.5)
    y = df.label_value
    X = df[feature_list]
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    # Only the training split is resampled; the test split is left as drawn.
    X_train, y_train = RandomUnderSampler(sampling_strategy="all").fit_resample(X_train, y_train)
    print(X_train.shape)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)
    # Optional persistence (disabled). Only unpickle model files you trust:
    # pickle.dump(clf, open(f"{root}{key}.model", "wb"))
    # clf = pickle.load(open(f"{root}{key}.model", "rb"))
    y_pred = clf.predict(X_test)
    # Compute the accuracy once; a bare expression inside a loop is never
    # displayed, so the former duplicate computation was dead code.
    accuracy_pct = np.round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
    print(f"{key} ==> {accuracy_pct}")

(342, 24)
utah_ssd ==> 96.99
(630, 24)
wiscon_ssd ==> 96.82
(558, 24)
wiscon_hdd ==> 96.05


## Feature Importance List

In [82]:
# Union, over all datasets, of the columns whose random-forest importance
# (rounded to 2 decimals) is at least 0.02. Every column except `label_value`
# is offered to the model here, not just `feature_list`.
features_set = set()
for key in data:
    df = data[key].copy()
    X, y = df.drop(columns="label_value"), df.label_value
    X_train, y_train = RandomUnderSampler(sampling_strategy="all").fit_resample(X, y)

    clf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
    score = np.round(clf.feature_importances_, 2)

    # Boolean mask over the columns, in the same order as the importances.
    features_set.update(X.columns[score >= 0.02])

# Printed with a trailing comma, one name per line, in alphabetical order.
for val in sorted(features_set):
    print(f"{val},")

receiver_avg_waittime_md,
receiver_ost_read,
receiver_ost_write,
receiver_req_active,
receiver_seg_out,
receiver_segs_in,
receiver_system_cpu_percent,
receiver_system_memory_percent,
sender_avg_waittime_md,
sender_cwnd_rate,
sender_ost_read,
sender_read_bytes,
sender_remote_ost_read_bytes,
sender_req_active,
sender_req_waittime,
sender_req_waittime_md,
sender_retrans,
sender_segs_in,
sender_send_buffer_value,
sender_ssthresh_value,
sender_system_cpu_percent,
sender_system_memory_percent,
sender_tcp_snd_buffer_max,
sender_write_bytes_io,


## Transfer Learning

In [85]:
# Cross-dataset ("transfer") evaluation: fit a 100-tree random forest on the
# under-sampled training split of dataset `key1`, then score it on the full
# contents of every other dataset `key2`.
# NOTE(review): no random_state is set anywhere, so results vary per run.
for key1 in data:
    df = data[key1].copy()
    y = df.label_value
    X = df[feature_list]
    # Only the training portion of the split is used. The held-out portion is
    # never evaluated in this cell, so it gets explicit throw-away names
    # instead of X_test / y_test, which the inner loop rebinds.
    X_train, _X_holdout, y_train, _y_holdout = train_test_split(X, y)
    X_train, y_train = RandomUnderSampler(sampling_strategy="all").fit_resample(X_train, y_train)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)

    for key2 in data:
        if key1 == key2:
            continue

        df_test = data[key2].copy()
        y_test = df_test.label_value
        X_test = df_test[feature_list]

        y_pred = clf.predict(X_test)
        # Compute the accuracy once; the former bare expression that repeated
        # this computation was discarded and never displayed.
        accuracy_pct = np.round(metrics.accuracy_score(y_test, y_pred) * 100, 2)
        print(f"{key1} ==> {key2} - {accuracy_pct}")

utah_ssd ==> wiscon_ssd - 43.87
utah_ssd ==> wiscon_hdd - 43.74
wiscon_ssd ==> utah_ssd - 55.51
wiscon_ssd ==> wiscon_hdd - 80.03
wiscon_hdd ==> utah_ssd - 63.43
wiscon_hdd ==> wiscon_ssd - 80.88


In [51]:
# Mean and standard deviation of sender_remote_ost_read_bytes per label,
# rounded to 2 decimals.
# NOTE(review): `df` is whatever frame an earlier-executed cell last bound;
# this cell does not select a dataset itself. Confirm which one is intended.
df.groupby("label_value")["sender_remote_ost_read_bytes"].agg(["mean", "std"]).round(2)

Unnamed: 0_level_0,mean,std
label_value,Unnamed: 1_level_1,Unnamed: 2_level_1
0,115795700.0,10140063.12
1,546173000.0,12309885.69
17,111754500.0,11425497.81
33,115762800.0,14963686.55
34,111107900.0,12641538.56
35,115122000.0,11359041.02
36,98106770.0,20922373.95
40,95858370.0,29320049.29
44,110662800.0,10064329.54
48,106917100.0,18872234.76


In [8]:
# Show which dataset keys are currently loaded in `data`.
for key in data.keys():
    print(key)

utah_ssd
wiscon_ssd
wiscon_hdd


In [9]:
# Work on an independent (deep) copy of the utah_ssd frame so the inspection
# cells below cannot modify the entry stored in `data`.
df = data["utah_ssd"].copy(deep=True)

In [13]:
# Number of non-null sender_avg_rtt_value entries for each label_value.
df.groupby("label_value")["sender_avg_rtt_value"].count()

label_value
0      109
1      661
17    1768
33     410
34     109
35      51
36     438
40     440
44     438
48     879
59     328
62     439
66     888
74     881
82     414
86     111
87      69
88     331
Name: sender_avg_rtt_value, dtype: int64

In [11]:
# Display the column index of the currently selected frame.
df.columns

Index(['sender_avg_rtt_value', 'sender_pacing_rate', 'sender_cwnd_rate',
       'sender_avg_retransmission_timeout_value', 'sender_byte_ack',
       'sender_seg_out', 'sender_retrans', 'sender_ssthresh_value',
       'sender_segs_in', 'sender_avg_send_value',
       ...
       'receiver_setattr_md', 'receiver_read_page_md', 'receiver_unlink_md',
       'receiver_avg_dsack_dups_value', 'receiver_avg_reord_seen',
       'receiver_system_cpu_percent', 'receiver_system_memory_percent',
       'receiver_remote_ost_read_bytes', 'receiver_remote_ost_write_bytes',
       'label_value'],
      dtype='object', length=117)