In [1]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

from sklearn.model_selection import train_test_split

from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import StandardScaler

from pyts.image import GramianAngularField


In [19]:
file_path = 'gasf_ipp_data.csv'
df=pd.read_csv(file_path)

In [3]:
df.columns

Index(['id', 'dur', 'proto', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes',
       'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt',
       'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt',
       'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'label', 'service_-', 'service_dhcp',
       'service_dns', 'service_ftp', 'service_ftp-data', 'service_http',
       'service_irc', 'service_pop3', 'service_radius', 'service_smtp',
       'service_snmp', 'service_ssh', 'service_ssl', 'proto_tcp', 'proto_udp'],
      dtype='object')

In [4]:
# Calculate individual thresholds
udp_thresholds = {
    'spkts': 1.094848e+03,
    'dpkts': 1.561638e+03,
    'sbytes': 1.351127e+06,
    'dbytes': 2.059159e+06
}

# Combine thresholds using the maximum value
composite_threshold = max(udp_thresholds.values())

print("Composite Threshold:", composite_threshold)


Composite Threshold: 2059159.0


In [5]:
tcp_columns = ['spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', 'dttl', 'tcprtt']
udp_columns = ['spkts', 'dpkts', 'sbytes', 'dbytes']
aggregate_columns = ['rate', 'sload', 'dload', 'sinpkt', 'dinpkt']

In [6]:
percentiles_95 = df[udp_columns].quantile(0.95)

# Set the threshold as the maximum of the 95th percentiles
threshold = percentiles_95.max()

print("Threshold for detecting abnormal peaks in UDP-related columns:", threshold)


Threshold for detecting abnormal peaks in UDP-related columns: 61084.0


In [7]:
udp_means = df[udp_columns].mean()
udp_stds = df[udp_columns].std()

# Calculate the thresholds for each UDP column
udp_thresholds = udp_means + 3 * udp_stds

# Print the thresholds
print(udp_thresholds)

spkts     1.094848e+03
dpkts     1.561638e+03
sbytes    1.351127e+06
dbytes    2.059159e+06
dtype: float64


In [8]:
udp_means

spkts        52.684907
dpkts        59.355460
sbytes    35768.809236
dbytes    57239.552093
dtype: float64

In [9]:
threshold=0.5

In [10]:

# Assuming `df` is your DataFrame containing the dataset

# Define columns for feature extraction
tcp_columns = ['spkts', 'dpkts', 'sbytes', 'dbytes', 'sttl', 'dttl', 'tcprtt']
udp_columns = ['spkts', 'dpkts', 'sbytes', 'dbytes']
aggregate_columns = ['rate', 'sload', 'dload', 'sinpkt', 'dinpkt']

def calculate_cov(data):
    mean = np.mean(data)
    std_dev = np.std(data)
    cov = std_dev / mean if mean != 0 else 0
    return cov

def calc_tcp_cov(row):
    tcp_values = row[tcp_columns]
    return calculate_cov(tcp_values)

# def count_abnormal_peaks(data, threshold):
#     mean = np.mean(data)
#     std_dev = np.std(data)
#     if std_dev == 0:  # Handle case where standard deviation is 0
#         return 0
#     peak_threshold = mean + std_dev
#     abnormal_peaks = np.sum(data > peak_threshold)
#     return abnormal_peaks

# def calc_udp_peaks(row, threshold):
#     udp_values = row[udp_columns]
#     return count_abnormal_peaks(udp_values, threshold)

def calc_udp_abnormal_peaks(row, threshold):
    udp_values = row[udp_columns]
    abnormal_peaks = np.sum(udp_values > threshold)
    return abnormal_peaks

def calc_gasf_matrix(row):
    aggregate_values = row[aggregate_columns].values.reshape(1, -1)
    gasf = GramianAngularField(image_size=len(aggregate_columns), method='summation')
    return gasf.fit_transform(aggregate_values)[0]

def calc_gasf_moments(gasf_matrix):
    if np.isnan(gasf_matrix).any() or np.std(gasf_matrix) == 0:  # Check for NaN values or standard deviation being 0
        return np.nan, np.nan, np.nan
    mean = np.mean(gasf_matrix)
    std_dev = np.std(gasf_matrix)
    # skewness = np.mean((gasf_matrix - mean) ** 3) ** (1/3)
    skewness = np.nanmean(np.cbrt((gasf_matrix - mean) ** 3))

    return mean, std_dev, skewness

def weighted_sum_moments(mean, std_dev, skewness, w1, w2, w3):
    if np.isnan(mean) or np.isnan(std_dev) or np.isnan(skewness):
        return np.nan  # Return NaN if any moment is NaN
    return mean * w1 + std_dev * w2 + skewness * w3

# Add new columns to DataFrame with calculated features
df['cal_tcp'] = df.apply(calc_tcp_cov, axis=1)
df['cal_udp'] = df.apply(lambda row: calc_udp_abnormal_peaks(row, threshold ), axis=1)
df['gasf_matrix'] = df.apply(calc_gasf_matrix, axis=1)
df[['moment1', 'moment2', 'moment3']] = df['gasf_matrix'].apply(lambda x: pd.Series(calc_gasf_moments(x)))
df['score_traffic'] = df.apply(lambda row: weighted_sum_moments(row['moment1'], row['moment2'], row['moment3'], 0.5, 0.3, 0.2), axis=1)

# Features and target labels
X = df[['score_traffic', 'cal_tcp',"cal_udp"]]
y = df['label']  # Assuming 'label' is the column indicating if a row is an anomaly

# # Drop rows with NaN values in the target label (y)
# X = X.dropna(subset=['score_traffic', 'cal_tcp', 'moment1', 'moment2', 'moment3', 'label'])
# y = y.dropna()
# Drop rows with NaN values in the target label (y)
X = X.dropna(subset=['score_traffic', 'cal_tcp',"cal_udp"])
y = y.dropna()

# Merge X and y back together
df_cleaned = pd.concat([X, y], axis=1)

# Drop rows with NaN values in the target label (y)
df_cleaned = df_cleaned.dropna(subset=['score_traffic', 'cal_tcp', "cal_udp",'label'])

# Split the data again after dropping NaN values
X = df_cleaned[['score_traffic', 'cal_tcp',"cal_udp"]]
y = df_cleaned['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


X_test.head(3)

Unnamed: 0,score_traffic,cal_tcp,cal_udp
33693,0.459755,1.951241,2.0
20359,0.459755,1.951241,2.0
30127,0.45835,1.801726,2.0


In [11]:
X_train.head(3)

Unnamed: 0,score_traffic,cal_tcp,cal_udp
39370,0.300895,1.142877,4.0
65559,0.408106,2.112326,4.0
70107,0.459765,2.302339,2.0


In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# Fit the model
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)



Accuracy: 0.978499127399651
Confusion Matrix:
 [[6907  292]
 [  16 7110]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.96      0.98      7199
           1       0.96      1.00      0.98      7126

    accuracy                           0.98     14325
   macro avg       0.98      0.98      0.98     14325
weighted avg       0.98      0.98      0.98     14325



In [13]:
X_train.head(3)

Unnamed: 0,score_traffic,cal_tcp,cal_udp
39370,0.300895,1.142877,4.0
65559,0.408106,2.112326,4.0
70107,0.459765,2.302339,2.0


In [14]:
X_train_tcp = X_train[['cal_tcp']]
y_train_tcp = y_train  # Assuming 'label' is the column indicating if a row is an anomaly

X_test_tcp = X_test[['cal_tcp']]
y_test_tcp = y_test 

# # Drop rows with NaN values in the target label (y_tcp)
# X_tcp = X_tcp.dropna(subset=['cal_tcp'])
# y_tcp = y_tcp.dropna()

# Merge X and y back together for TCP
# df_cleaned_tcp = pd.concat([X_tcp, y_tcp], axis=1)

# Drop rows with NaN values in the target label (y_tcp)
# df_cleaned_tcp = df_cleaned_tcp.dropna(subset=['cal_tcp', 'label'])

# Split the data for TCP
# X_train_tcp, X_test_tcp, y_train_tcp, y_test_tcp = train_test_split(df_cleaned_tcp[['cal_tcp']], df_cleaned_tcp['label'], test_size=0.2, random_state=42)

# Initialize the KNN classifier for TCP
knn_tcp = KNeighborsClassifier(n_neighbors=5)

# Fit the model for TCP
knn_tcp.fit(X_train_tcp, y_train_tcp)

# Make predictions on the test set for TCP
y_pred_tcp = knn_tcp.predict(X_test_tcp)

X_test["y_pred_tcp"]=y_pred_tcp



# Evaluate the model for TCP
accuracy_tcp = accuracy_score(y_test_tcp, y_pred_tcp)
conf_matrix_tcp = confusion_matrix(y_test_tcp, y_pred_tcp)
class_report_tcp = classification_report(y_test_tcp, y_pred_tcp)

print("TCP - Accuracy:", accuracy_tcp)
print("TCP - Confusion Matrix:\n", conf_matrix_tcp)
print("TCP - Classification Report:\n", class_report_tcp)


TCP - Accuracy: 0.9713089005235602
TCP - Confusion Matrix:
 [[6804  395]
 [  16 7110]]
TCP - Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.95      0.97      7199
           1       0.95      1.00      0.97      7126

    accuracy                           0.97     14325
   macro avg       0.97      0.97      0.97     14325
weighted avg       0.97      0.97      0.97     14325



In [15]:
 # Separate features and labels for Score Traffic
X_train_score_traffic = X_train[['score_traffic']]
y_train_score_traffic = y_train  

X_test_score_traffic = X_test[['score_traffic']]
y_test_score_traffic = y_test 

# Initialize the KNN classifier for Score Traffic
knn_score_traffic = KNeighborsClassifier(n_neighbors=5)

# Fit the model for Score Traffic
knn_score_traffic.fit(X_train_score_traffic, y_train_score_traffic)

# Make predictions on the test set for Score Traffic
y_pred_score_traffic = knn_score_traffic.predict(X_test_score_traffic)

# Add Score Traffic predictions to X_test
X_test["y_pred_score_traffic"] = y_pred_score_traffic

# Separate features and labels for UDP
X_train_udp = X_train[['cal_udp']]
y_train_udp = y_train  

X_test_udp = X_test[['cal_udp']]
y_test_udp = y_test 

# Set the threshold for UDP
thresholdPeak = 1

# Calculate UDP predictions
y_pred_udp = X_test_udp['cal_udp'] > thresholdPeak

# Add UDP predictions to X_test
X_test["y_pred_udp"] = y_pred_udp

# Apply the final algorithm
X_test['Result'] = (X_test['y_pred_tcp'] & X_test['y_pred_score_traffic'] & X_test['y_pred_udp'])

# Evaluate the final algorithm
accuracy = accuracy_score(y_test, X_test['Result'])
conf_matrix = confusion_matrix(y_test, X_test['Result'])
class_report = classification_report(y_test, X_test['Result'])

print("Accuracy for final algorithm:", accuracy)
print("Confusion Matrix for final algorithm:\n", conf_matrix)
print("Classification Report for final algorithm:\n", class_report)

Accuracy for final algorithm: 0.992739965095986
Confusion Matrix for final algorithm:
 [[7111   88]
 [  16 7110]]
Classification Report for final algorithm:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      7199
           1       0.99      1.00      0.99      7126

    accuracy                           0.99     14325
   macro avg       0.99      0.99      0.99     14325
weighted avg       0.99      0.99      0.99     14325



In [16]:
X_test.head()

Unnamed: 0,score_traffic,cal_tcp,cal_udp,y_pred_tcp,y_pred_score_traffic,y_pred_udp,Result
33693,0.459755,1.951241,2.0,0,0,True,False
20359,0.459755,1.951241,2.0,0,0,True,False
30127,0.45835,1.801726,2.0,0,0,True,False
47981,0.360944,1.566381,4.0,1,1,True,True
14970,0.305914,1.693494,4.0,0,0,True,False


In [17]:
# X_test["Result"].unique()
# X_test["Result"].value_counts()[True]
# y_test.value_counts()[1]

In [18]:
import pandas as pd
from sklearn.metrics import classification_report

# Assuming y_test contains the actual labels and X_test['Result'] contains the detected labels
comparison_df = pd.DataFrame({'Actual': y_test, 'Detected': X_test['Result']})

# Calculate accuracy
accuracy = (comparison_df['Actual'] == comparison_df['Detected']).mean() * 100
print("Accuracy:", accuracy)

# Print classification report
report = classification_report(comparison_df['Actual'], comparison_df['Detected'])
print("Classification Report:\n", report)


Accuracy: 99.2739965095986
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      7199
           1       0.99      1.00      0.99      7126

    accuracy                           0.99     14325
   macro avg       0.99      0.99      0.99     14325
weighted avg       0.99      0.99      0.99     14325

