<a href="https://colab.research.google.com/github/Ananya-Ahuja/Anomalyze/blob/ananya_ahuja/Data_transfer_patterns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Pre-processing of Datasets

In [2]:
import pandas as pd
import glob

In [3]:
# Folder holding the raw CSV datasets.
data_folder = '/content/datasets/'  # Update this path as needed

# Every CSV in the folder is combined. glob returns paths in arbitrary order,
# so sort them: the row order of the combined frame feeds seeded models below,
# and must be the same on every run.
files = sorted(glob.glob(data_folder + '*.csv'))
if not files:
    raise FileNotFoundError(f"No CSV files found in {data_folder!r}")

# Feature columns used by the data transfer pattern models
transfer_cols = [
    'uplink_volume', 'downlink_volume', 'total_volume',
    'session_duration', 'bytes_per_second', 'uplink_ratio'
]
# Features plus the ground-truth label, in the final column order
combined_cols = transfer_cols + ['is_anomaly']

# Rows missing any of these are dropped. The label is required as well because
# the evaluation cells cast 'is_anomaly' to int, which fails on missing values.
required_cols = [
    'uplink_volume', 'downlink_volume', 'total_volume',
    'session_duration', 'is_anomaly'
]

# Load each file restricted to combined_cols. reindex keeps the columns that
# exist, adds any absent one (feature or label) as NaN, and fixes the column
# order in one step, so a file lacking a column no longer raises KeyError and
# no object-dtype pd.NA columns are created.
df_list = []
for file in files:
    df = pd.read_csv(file, low_memory=False).reindex(columns=combined_cols)
    df_list.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(df_list, ignore_index=True)

# Drop rows with missing required values; the index is intentionally not reset
# so combined_df and model_df stay aligned row-for-row.
combined_df = combined_df.dropna(subset=required_cols)
print("Combined dataset shape:", combined_df.shape)

# Model input: same rows without the ground-truth label
model_df = combined_df.drop(columns=['is_anomaly'])


Combined dataset shape: (70000, 7)


  combined_df = pd.concat(df_list, ignore_index=True)


### Isolation Forest Model

In [4]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

In [5]:
# Numeric feature matrix for the models; remaining gaps are filled with 0
features = model_df[transfer_cols].astype(float).fillna(0)

# Standardise the features (fit and transform on the same data)
scaler = StandardScaler().fit(features)
X_scaled = scaler.transform(features)

# Train the Isolation Forest with a fixed seed and a 5% contamination setting
iso_forest = IsolationForest(contamination=0.05, random_state=42).fit(X_scaled)

# Store the continuous score and the hard label (-1 = anomaly, 1 = normal)
model_df['iso_anomaly_score'] = iso_forest.decision_function(X_scaled)
model_df['iso_anomaly'] = iso_forest.predict(X_scaled)

# Summarise how many rows fall in each class
iso_counts = model_df['iso_anomaly'].value_counts()
print("Isolation Forest anomaly counts:")
print(iso_counts)

Isolation Forest anomaly counts:
iso_anomaly
 1    66500
-1     3500
Name: count, dtype: int64


### K-Means Model

In [6]:
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

In [7]:
# Fit a two-cluster K-Means model and keep each row's cluster assignment.
# Note: the anomaly flag below comes from distance to the centroid, not from
# which of the two clusters a row belongs to.
kmeans = KMeans(n_clusters=2, random_state=42).fit(X_scaled)
model_df['kmeans_label'] = kmeans.labels_

# Distance from every row to its closest cluster centre
_, distances = pairwise_distances_argmin_min(X_scaled, kmeans.cluster_centers_)

# Cut-off: mean distance plus two standard deviations (tunable)
threshold = distances.mean() + 2 * distances.std()
is_far = distances > threshold
model_df['kmeans_anomaly'] = is_far.astype(int)  # 1 = anomaly, 0 = normal

print("K-Means anomaly counts:")
print(model_df['kmeans_anomaly'].value_counts())

K-Means anomaly counts:
kmeans_anomaly
0    68419
1     1581
Name: count, dtype: int64


In [8]:
# Persist the per-row results. The model outputs (scores and flags) were added
# to model_df, while the ground-truth label lives only in combined_df; saving
# combined_df alone would write a "results" file with no results in it.
# model_df was derived from combined_df without resetting the index, so the
# two frames join row-for-row on that shared index.
results_path = '/content/combined_data_transfer_anomaly_results.csv'
model_output_cols = [col for col in model_df.columns if col not in combined_df.columns]
results_df = combined_df.join(model_df[model_output_cols])
results_df.to_csv(results_path, index=False)
print(f"Results saved to {results_path}")

Results saved to /content/combined_data_transfer_anomaly_results.csv


### Testing Both Models

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [10]:
# Recode the Isolation Forest labels to the 0/1 convention used for evaluation:
# -1 (anomaly) becomes 1, and 1 (normal) becomes 0.
iso_to_binary = {1: 0, -1: 1}
model_df['iso_anomaly_binary'] = model_df['iso_anomaly'].map(iso_to_binary)

In [11]:
# Predictions come from the model frame, ground truth from the labelled frame
kmeans_pred = model_df['kmeans_anomaly'].astype(int)
iso_pred = model_df['iso_anomaly_binary']
true_labels = combined_df['is_anomaly'].astype(int)

In [12]:
# Score the Isolation Forest flags against the ground truth.
# The four scalar metrics share the same (y_true, y_pred) call signature.
iso_accuracy, iso_precision, iso_recall, iso_f1 = (
    metric(true_labels, iso_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class text report (class 0 = Normal, class 1 = Anomaly)
iso_report = classification_report(
    true_labels,
    iso_pred,
    target_names=['Normal', 'Anomaly'],
)

In [13]:
# Score the K-Means distance-based flags against the ground truth.
# The four scalar metrics share the same (y_true, y_pred) call signature.
kmeans_accuracy, kmeans_precision, kmeans_recall, kmeans_f1 = (
    metric(true_labels, kmeans_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Per-class text report (class 0 = Normal, class 1 = Anomaly)
kmeans_report = classification_report(
    true_labels,
    kmeans_pred,
    target_names=['Normal', 'Anomaly'],
)

In [14]:
# Show the Isolation Forest metrics, one per line, followed by the class report
print(
    "=== Isolation Forest ===",
    f"Accuracy: {iso_accuracy:.3f}",
    f"Precision: {iso_precision:.3f}",
    f"Recall: {iso_recall:.3f}",
    f"F1 Score: {iso_f1:.3f}",
    iso_report,
    sep="\n",
)

=== Isolation Forest ===
Accuracy: 0.923
Precision: 0.233
Recall: 0.233
F1 Score: 0.233
              precision    recall  f1-score   support

      Normal       0.96      0.96      0.96     66500
     Anomaly       0.23      0.23      0.23      3500

    accuracy                           0.92     70000
   macro avg       0.60      0.60      0.60     70000
weighted avg       0.92      0.92      0.92     70000



In [15]:
# Show the K-Means metrics, one per line, followed by the class report
print(
    "=== K-Means ===",
    f"Accuracy: {kmeans_accuracy:.3f}",
    f"Precision: {kmeans_precision:.3f}",
    f"Recall: {kmeans_recall:.3f}",
    f"F1 Score: {kmeans_f1:.3f}",
    kmeans_report,
    sep="\n",
)

=== K-Means ===
Accuracy: 0.943
Precision: 0.339
Recall: 0.153
F1 Score: 0.211
              precision    recall  f1-score   support

      Normal       0.96      0.98      0.97     66500
     Anomaly       0.34      0.15      0.21      3500

    accuracy                           0.94     70000
   macro avg       0.65      0.57      0.59     70000
weighted avg       0.93      0.94      0.93     70000

