In [2]:
pip install torch transformers faiss-cpu pandas

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp310-cp310-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
import faiss
import numpy as np
import gensim
from gensim.models import Word2Vec
import time
import psutil
import os

In [9]:
csv_path = "/kaggle/input/pcap-2019-dira-125910/dirA.125910-packets.csv"
# header=None: the first line of the file is read as data and the column names are supplied here.
# low_memory=False makes pandas infer each column's dtype from the whole file rather than
# chunk by chunk, which is the documented remedy for mixed-type (DtypeWarning) warnings.
# NOTE(review): confirm which warning this call emitted and that the file really has no header row.
df = pd.read_csv(
    csv_path,
    header=None,
    names=["timestamp", "src_ip", "dst_ip", "protocol", "size"],
    low_memory=False,
)

# Extract sequences of IP addresses for training
df["ip_sequence"] = df["src_ip"] + " " + df["dst_ip"]

# Tokenize sequences (IP flows): every row becomes one [src_ip, dst_ip] "sentence"
ip_sequences = [flow.split() for flow in df["ip_sequence"]]

# Train IP2Vec using Word2Vec
ip2vec_model = Word2Vec(sentences=ip_sequences, vector_size=128, window=5, min_count=1, workers=4)
ip2vec_model.save("ip2vec.model")  # Save model for later use

# Generate embeddings for each IP in the dataset.
# dict.fromkeys de-duplicates while keeping first-appearance order (all sources, then all
# destinations). Iterating a set instead would give a different row order in every kernel
# session, making index row numbers irreproducible.
unique_ips = list(dict.fromkeys(df["src_ip"].tolist() + df["dst_ip"].tolist()))
ip_embeddings = {ip: ip2vec_model.wv[ip] for ip in unique_ips if ip in ip2vec_model.wv}

# Convert to a float32 matrix (the dtype FAISS stores) and L2-normalise each row
embeddings = np.array(list(ip_embeddings.values()), dtype=np.float32)
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0  # leave an all-zero vector as is instead of turning it into NaNs
embeddings = embeddings / norms

# Create FAISS index
dimension = embeddings.shape[1]  # 128-dimensional embeddings
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)

# Row i of the index must correspond to row i of the metadata written below
assert index.ntotal == len(ip_embeddings), "FAISS index and IP list differ in length"

print(f"FAISS index contains {index.ntotal} unique IP embeddings.")

# Save FAISS index
faiss.write_index(index, "ip_embeddings.index")

# Save IP metadata for lookup (same order as the rows added to the index)
ip_metadata = pd.DataFrame({"ip": list(ip_embeddings.keys())})
ip_metadata.to_csv("ip_metadata.csv", index=False)

print("IP2Vec model, FAISS index, and metadata saved.")

  df = pd.read_csv(csv_path, header=None, names=["timestamp", "src_ip", "dst_ip", "protocol", "size"])


FAISS index contains 142477 unique IP embeddings.
IP2Vec model, FAISS index, and metadata saved.


In [10]:
# Reload the model, the FAISS index and the IP lookup table from disk.
ip2vec_model = Word2Vec.load("ip2vec.model")

index = faiss.read_index("ip_embeddings.index")
ip_metadata = pd.read_csv("ip_metadata.csv")

# Search results are translated to IPs by row position, so the two files must have
# the same number of rows. Fail loudly if they were saved at different times.
if index.ntotal != len(ip_metadata):
    raise ValueError(
        f"ip_embeddings.index holds {index.ntotal} vectors but ip_metadata.csv has "
        f"{len(ip_metadata)} rows; rebuild or re-save both together."
    )

In [12]:
# Lookup table: IP string -> row position in ip_metadata
ip_to_index = dict(zip(ip_metadata["ip"], range(len(ip_metadata))))

## Performance logging

In [14]:
import time
import faiss
import numpy as np
import pandas as pd
import psutil
from gensim.models import Word2Vec
import os

In [7]:
def measure(pid, func, *args, **kwargs):
    """Run ``func(*args, **kwargs)`` and print its wall time, CPU use and RSS change.

    Args:
        pid: ID of the process to sample (callers pass ``os.getpid()``).
        func: Callable to execute and measure.
        *args: Positional arguments forwarded to ``func``.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns.
    """
    process = psutil.Process(pid)

    # With interval=None, cpu_percent() reports utilisation since the previous call on
    # this Process object. This first call only sets that baseline, so its return value
    # is discarded rather than subtracted from the later reading.
    process.cpu_percent(interval=None)
    start_mem = process.memory_info().rss / (1024 ** 2)  # Convert to MB
    # perf_counter is monotonic, so the duration cannot be skewed by clock adjustments
    start_time = time.perf_counter()

    result = func(*args, **kwargs)

    # Stop the clock first so the sampling calls below are not counted as func's time
    execution_time = time.perf_counter() - start_time
    cpu_usage = process.cpu_percent(interval=None)
    end_mem = process.memory_info().rss / (1024 ** 2)  # Convert to MB
    mem_usage = end_mem - start_mem

    # functools.partial objects and other callables may not have __name__
    func_name = getattr(func, "__name__", repr(func))

    print(f"Function: {func_name} | Time: {execution_time:.4f}s | CPU: {cpu_usage:.2f}% | Mem: {mem_usage:.2f}MB")
    return result

### 1. Querying a new packet
Steps: 
1. Convert the query packet into text format.
2. Look up its IP2Vec (Word2Vec) embedding.
3. Normalize the embedding (since FAISS works best with normalized vectors).
4. Search the FAISS index for the k-nearest neighbors.
5. Return the top-k results with their distances (lower = more similar).

In [31]:
def query_faiss(index, query_embedding, k=5):
    """Search ``index`` for the ``k`` nearest neighbours of ``query_embedding``.

    Only the array of neighbour positions is returned; the distances that
    ``index.search`` also produces are discarded.
    """
    search_result = index.search(query_embedding, k)
    _neighbor_distances, neighbor_ids = search_result
    return neighbor_ids

In [36]:
query_ip = "26.120.99.176" #this is the first entry in the dataset
# query_ip = "192.168.1.1" # this is not in the dataset
top_k = 5  # number of neighbours to retrieve

if query_ip in ip2vec_model.wv:
    # FAISS searches a 2-D batch, so reshape the single vector to (1, dim) and L2-normalise it
    query_embedding = ip2vec_model.wv[query_ip].reshape(1, -1)
    query_embedding = query_embedding / np.linalg.norm(query_embedding, axis=1, keepdims=True)

    pid = os.getpid()
    query_result = measure(pid, query_faiss, index, query_embedding, top_k)

    # FAISS fills missing neighbours with -1 when the index holds fewer than k vectors.
    # Skip those: iloc[-1] would otherwise silently return the last metadata row.
    ip_column = ip_metadata["ip"]
    similar_ips = [ip_column.iloc[i] for i in query_result[0] if i >= 0]
    print(f"Top {top_k} similar IPs to {query_ip}: {similar_ips}")
else:
    print(f"IP {query_ip} not found in model.")


New IPs found: []
New Embeddings Shape: No embeddings
Function: query_faiss | Time: 0.0071s | CPU: 140.80% | Mem: 0.00MB
Top 5 similar IPs to 26.120.99.176: ['26.120.99.176', '26.117.166.16', '162.102.61.143', '68.154.105.37', '45.238.180.2']


### 2. Large Insertions
Steps: 
1. Convert 50+ new packets into embeddings.
2. Normalize them.
3. Add to the FAISS index.

In [38]:
def insert_data(index, new_ip_embeddings, new_ips):
    """Add ``new_ip_embeddings`` to ``index`` and return ``new_ips`` unchanged.

    Prints how many IPs are being inserted before calling ``index.add``.
    """
    message = f"Inserting {len(new_ips)} IPs into FAISS index"
    print(message)

    index.add(new_ip_embeddings)

    return new_ips


In [40]:
# all_ips = list(ip2vec_model.wv.key_to_index.keys())
# print(f"Total IPs in model: {len(all_ips)}")
# print("Sample IPs:", all_ips[:10])

Total IPs in model: 142477
Sample IPs: ['205.190.20.171', '52.223.227.38', '175.240.28.216', '213.5.4.231', '162.190.82.136', '148.227.29.239', '25.51.107.120', '66.35.161.128', '146.55.105.103', '199.124.196.1']


In [52]:
new_ips = [f"205.190.20.{i}" for i in np.random.choice(range(171), 50, replace=False)]

embedding_dim = ip2vec_model.vector_size  # Get the embedding size
# Random stand-in vectors. np.random.rand yields float64, so cast to the float32 FAISS stores.
new_embeddings = np.random.rand(len(new_ips), embedding_dim).astype(np.float32)

# Normalize the embeddings
new_embeddings = new_embeddings / np.linalg.norm(new_embeddings, axis=1, keepdims=True)

# print(f"New Random IP: {new_ips}")
# print(f"New embddings shape: {new_embeddings.shape}")

pid = os.getpid()
inserted_ips = measure(pid, insert_data, index, new_embeddings, new_ips)

# Keep the in-memory matrix in step with the index: the deletion step rebuilds the index
# from `embeddings`, so vectors missing here would silently vanish from the index.
embeddings = np.vstack([embeddings, new_embeddings])

new_ip_metadata = pd.DataFrame({"ip": new_ips})
ip_metadata = pd.concat([ip_metadata, new_ip_metadata], ignore_index=True)

# Persist the metadata and the index together so the files on disk stay row-aligned
ip_metadata.to_csv("ip_metadata.csv", index=False)
faiss.write_index(index, "ip_embeddings.index")

Inserting 50 IPs into FAISS index
Function: insert_data | Time: 0.0707s | CPU: 84.90% | Mem: 69.62MB


### 3. Large Deletions
**Problem!!!**
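**A flat FAISS index has no stable per-vector IDs: `remove_ids` does work on `IndexFlatL2`, but it shifts the positions of all later vectors, so the metadata has to be re-aligned either way.**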

Workaround:
1. Remove entries from the metadata CSV.
2. Rebuild the FAISS index without the deleted embeddings.

In [71]:
def delete_from_faiss(index, delete_indices, embeddings):
    """Drop rows from ``embeddings`` and build a fresh ``IndexFlatL2`` from the remainder.

    Args:
        index: The existing index. It is not read; the parameter is kept so the
            call signature stays unchanged.
        delete_indices: Row positions to remove (list, tuple or array of ints).
            Positions outside ``[0, embeddings.shape[0])`` are ignored, including
            negative values.
        embeddings: 2-D array whose rows are the vectors to keep or drop.

    Returns:
        ``(new_index, new_embeddings)``: the rebuilt index and the surviving rows.
    """
    # Force an integer dtype: np.array([]) defaults to float64, which numpy refuses
    # to use as an index, so an empty deletion list used to raise IndexError.
    delete_indices = np.asarray(delete_indices, dtype=np.int64).ravel()

    # Ensure delete_indices are valid. Negative values are dropped too, because numpy
    # would otherwise interpret them as positions counted from the end.
    row_count = embeddings.shape[0]
    in_range = (delete_indices >= 0) & (delete_indices < row_count)
    delete_indices = delete_indices[in_range]

    # Create a mask for filtering
    mask = np.ones(row_count, dtype=bool)
    mask[delete_indices] = False

    # Select embeddings that are not deleted
    new_embeddings = embeddings[mask]

    # Rebuild the FAISS index with the remaining embeddings
    new_index = faiss.IndexFlatL2(new_embeddings.shape[1])
    new_index.add(new_embeddings)

    return new_index, new_embeddings

In [72]:
rows_before = embeddings.shape[0]
delete_indices = np.random.choice(rows_before, 50, replace=False)

pid = os.getpid()
index, embeddings = measure(pid, delete_from_faiss, index, delete_indices, embeddings)

# The rebuilt index holds exactly the surviving rows of the old `embeddings` matrix.
# Keep the same rows of the metadata, otherwise search results map to the wrong IPs.
keep_mask = np.ones(rows_before, dtype=bool)
keep_mask[delete_indices] = False
ip_metadata = ip_metadata.iloc[np.flatnonzero(keep_mask)].reset_index(drop=True)
assert index.ntotal == len(ip_metadata), "FAISS index and IP metadata are out of sync"

# Persist both artifacts together so the files on disk stay row-aligned
ip_metadata.to_csv("ip_metadata.csv", index=False)
faiss.write_index(index, "ip_embeddings.index")

print(f"Deleted {len(delete_indices)} embeddings from FAISS index.")

Function: delete_from_faiss | Time: 0.0920s | CPU: 97.50% | Mem: 138.75MB
Deleted 50 embeddings from FAISS index.


### 4. Large Updates
Similar to delete + insert.

Just remove old embeddings from FAISS, recompute new ones, and reinsert.

In [68]:
# Take the first 50 keys of the IP2Vec vocabulary as the IPs to modify.
all_ips = [ip for ip in ip2vec_model.wv.key_to_index]
valid_ips = all_ips[0:50]
# print(f"Valid IPs: {valid_ips}")

def increment_last_digit(ip):
    """Return ``ip`` with its last dot-separated field increased by one, modulo 256."""
    *leading, last_field = ip.rsplit(".", 1)
    bumped = (int(last_field) + 1) % 256  # 255 wraps around to 0
    return ".".join([*leading, str(bumped)])

# Apply the last-field increment to every selected IP
modified_ips = list(map(increment_last_digit, valid_ips))
print(f"Modified IPs: {modified_ips}")

Modified IPs: ['205.190.20.172', '52.223.227.39', '175.240.28.217', '213.5.4.232', '162.190.82.137', '148.227.29.240', '25.51.107.121', '66.35.161.129', '146.55.105.104', '199.124.196.2', '199.124.196.63', '146.55.105.105', '146.55.105.98', '146.55.105.96', '162.160.152.69', '146.55.105.106', '146.55.105.97', '146.55.105.108', '146.55.105.102', '146.55.105.100', '175.241.103.122', '146.55.105.93', '146.55.105.101', '189.55.242.121', '175.240.28.218', '146.55.105.107', '146.55.105.94', '66.132.107.224', '203.127.163.104', '186.57.117.114', '207.213.91.111', '146.55.105.99', '146.55.105.103', '169.67.224.77', '146.55.105.95', '203.127.163.127', '199.124.196.114', '72.106.214.241', '148.227.42.104', '63.148.211.142', '203.127.163.125', '195.230.2.93', '77.108.152.240', '199.124.196.4', '35.121.79.37', '162.160.152.72', '213.90.115.18', '63.141.94.158', '26.120.99.177', '66.132.244.0']


In [70]:
# Look up the model's embedding for each modified IP, skipping IPs the model does not contain.
new_embeddings = np.array([ip2vec_model.wv[ip] for ip in modified_ips if ip in ip2vec_model.wv])

# Only run the update when at least one modified IP has an embedding.
if new_embeddings.shape[0] > 0:
    # L2-normalise each row.
    new_embeddings = new_embeddings / np.linalg.norm(new_embeddings, axis=1, keepdims=True)

    pid = os.getpid()
    # NOTE(review): these are positions 0..len(valid_ips)-1, not positions looked up for
    # valid_ips in the metadata — confirm they really are the index rows to replace.
    update_indices = np.arange(len(valid_ips))  
    # NOTE(review): update_faiss is not defined in any visible cell, and new_embeddings is
    # computed above but not passed to it — confirm its definition and expected arguments.
    index, embeddings = measure(pid, update_faiss, index, update_indices, modified_ips, embeddings)

    print(f"Updated embeddings in FAISS index.")

Function: update_faiss | Time: 0.1885s | CPU: 95.50% | Mem: 139.01MB
Updated embeddings in FAISS index.
