In [None]:
import ipaddress
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import datetime

In [None]:
# Load output.csv (the path is relative to the current working directory)
# and preview the first rows. Later cells read columns 0-3 by position and
# the "Timestamp" column by name.
dataset = pd.read_csv('../dataset/access_log.txt/output.csv')
dataset.head()

In [None]:
# Small working sample (first 580 rows, all columns) used to try out the
# helper functions defined in the following cells.
# NOTE(review): 580 is a magic number - confirm why this cut-off was chosen.
df = dataset.head(580)
df.head()

In [None]:
# Hand-written single-row example of the feature table layout. Both `data`
# and `work_dataset` are reassigned by the aggregation cell further down, so
# this frame is only a preview of the expected columns.
data = {
    column: [value]
    for column, value in (
        ("IP_Address", "10.128.2.1"),
        ("Time_Difference_Mean", 99),
        ("Time_Difference_Variance", 54),
        ("Time_Difference_Sum", 105),
        ("Time_Difference_Maximum", 15),
        ("Character-bigrams", 45),
        ("Character-trigrams", 34),
        ("Character-ngrams", 74),
        ("Count_of_most_visited_page", 14),
        ("Status", 200),
        ("Number_of_records", 40),
    )
}
work_dataset = pd.DataFrame(data)
work_dataset

In [None]:
import datetime

# import pandas as pd


def time_stats(dataframe):
    """
    Summarise the gaps between consecutive timestamps of a log slice.

    The timestamps are read from the second column (position 1) and parsed
    with the format "%d-%m-%Y %H:%M". Gaps are taken in row order, in
    seconds, so they can be negative if the rows are not chronological.

    Args:
        dataframe: pandas DataFrame whose column at position 1 holds the
            timestamp strings.
    Returns:
        tuple: (max_time_diff, mean_time_diff, sum_time_diff,
        variance_time_diff). The variance is the population variance
        (divided by the number of gaps). (0, 0, 0, 0) is returned when the
        frame has at most one row or when a timestamp cannot be parsed; in
        the latter case the error is printed.
    """
    no_stats = (0, 0, 0, 0)
    if len(dataframe) <= 1:
        return no_stats

    try:
        stamps = []
        for raw_value in dataframe.iloc[:, 1].values:
            stamps.append(
                datetime.datetime.strptime(str(raw_value), "%d-%m-%Y %H:%M")
            )

        # Pair every timestamp with its successor to get the gap in seconds.
        gaps = [
            (later - earlier).total_seconds()
            for earlier, later in zip(stamps, stamps[1:])
        ]
        if len(gaps) == 0:
            return no_stats

        gap_count = len(gaps)
        total_gap = sum(gaps)
        average_gap = total_gap / gap_count
        gap_variance = sum((gap - average_gap) ** 2 for gap in gaps) / gap_count

        return max(gaps), average_gap, total_gap, gap_variance

    except (ValueError, IndexError) as e:
        print(f"Error processing times: {e}")
        return no_stats

In [None]:
# Try time_stats on the `df` sample; the cell displays the
# (max, mean, sum, variance) tuple.
time_stats(df)

In [None]:
import re
import string
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import bigrams

In [None]:
def most_frequent(List):
    """Return the element that occurs most often in ``List``.

    Counting is done in a single pass with ``collections.Counter``. When
    several elements share the highest count, the one that appears first in
    ``List`` is returned, so the result is deterministic.

    Args:
        List: iterable of hashable items.
    Returns:
        The most common item.
    Raises:
        ValueError: if ``List`` is empty.
    """
    counts = Counter(List)
    if not counts:
        raise ValueError("most_frequent() arg is an empty sequence")
    return counts.most_common(1)[0][0]

In [None]:
def bigram_stats(dataframe):
    """Count the most repeated token and token n-grams in a log slice.

    The strings in the third column (position 2) are concatenated without a
    separator, lower-cased, stripped of digits and punctuation, and tokenised
    with ``word_tokenize``. Because no separator is inserted, the end of one
    entry fuses with the start of the next, which is why "httpget" and
    "httppost" are filtered out together with "get" and "post".

    Args:
        dataframe: pandas DataFrame whose column at position 2 holds strings.
            # presumably the request line / URL - verify against the CSV
    Returns:
        tuple: (count_most_visited_page, count_most_appearing_bigram,
        count_most_appearing_trigram, count_most_appearing_ngram), i.e. the
        occurrence count of the most frequent token, 2-gram, 3-gram and
        6-gram. A count is 0 when there are no items of that size.
    """
    url_array = dataframe.iloc[:, 2].values
    full_string = "".join(url_array).lower()
    full_string = re.sub(r"\d+", "", full_string)
    full_string = full_string.translate(str.maketrans("", "", string.punctuation))
    full_string = full_string.strip()

    # Filter in one pass. Calling list.remove() while iterating over the same
    # list skips the element after each removal, so repeated noise tokens
    # used to survive.
    noise_tokens = {"get", "httpget", "httppost", "post"}
    full_string_tokens = [
        token for token in word_tokenize(full_string) if token not in noise_tokens
    ]

    def top_count(items):
        # Highest occurrence count among `items`, or 0 when there are none.
        counts = Counter(items)
        return counts.most_common(1)[0][1] if counts else 0

    count_most_visited_page = top_count(full_string_tokens)
    count_most_appearing_bigram = top_count(nltk.bigrams(full_string_tokens))
    count_most_appearing_trigram = top_count(nltk.trigrams(full_string_tokens))
    count_most_appearing_ngram = top_count(nltk.ngrams(full_string_tokens, 6))

    return (
        count_most_visited_page,
        count_most_appearing_bigram,
        count_most_appearing_trigram,
        count_most_appearing_ngram,
    )

In [None]:
# Try bigram_stats on the `df` sample; the cell displays the four counts
# (token, bigram, trigram, 6-gram).
bigram_stats(df)

In [None]:
from collections import Counter

In [None]:
def most_visited_ip(dataframe):
    """Return the most frequent value in the first column of ``dataframe``.

    Args:
        dataframe: pandas DataFrame; the column at position 0 is counted.
            # presumably the client IP address - verify against the CSV
    Returns:
        The most common value, or ``None`` when the frame has no rows so
        that callers can substitute their own default instead of hitting an
        IndexError.
    """
    ip = dataframe.iloc[:, 0].values
    ranked = Counter(ip).most_common(1)
    return ranked[0][0] if ranked else None

In [None]:
# Try most_visited_ip on the `df` sample.
most_visited_ip(df)

In [None]:
def most_freq_status(dataframe):
    """Return the most frequent value in the fourth column of ``dataframe``.

    Args:
        dataframe: pandas DataFrame; the column at position 3 is counted.
            # presumably the HTTP status code - verify against the CSV
    Returns:
        The most common value, or ``None`` when the frame has no rows so
        that callers can substitute their own default instead of hitting an
        IndexError.
    """
    status = dataframe.iloc[:, 3].values
    ranked = Counter(status).most_common(1)
    return ranked[0][0] if ranked else None

In [None]:
# Try most_freq_status on the `df` sample.
most_freq_status(df)

In [None]:
# Show the column labels of the loaded frame; the aggregation cell below
# reads the "Timestamp" column by name.
print(dataset.columns)

In [None]:
# Process dataset: build one feature row per run of consecutive log records
# that share the same date (the first 10 characters of "Timestamp").
FEATURE_COLUMNS = [
    "Character-bigrams",
    "Character-ngrams",
    "Character-trigrams",
    "Count_of_most_visited_page",
    "IP_Address",
    "Number_of_records",
    "Status",
    "Time_Difference_Maximum",
    "Time_Difference_Mean",
    "Time_Difference_Sum",
    "Time_Difference_Variance",
]


def _summarize_records(records):
    """Return the feature row (column name -> value) for one slice of the log."""
    max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff = time_stats(
        records
    )
    (
        count_most_visited_page,
        count_most_appearing_bigram,
        count_most_appearing_trigram,
        count_most_appearing_ngram,
    ) = bigram_stats(records)
    most_vis_ip = most_visited_ip(records)
    most_frequent_status = most_freq_status(records)

    # Keying the values by column name removes the risk of a positional
    # mix-up between the value list and the column list.
    return {
        "Character-bigrams": count_most_appearing_bigram,
        "Character-ngrams": count_most_appearing_ngram,
        "Character-trigrams": count_most_appearing_trigram,
        "Count_of_most_visited_page": count_most_visited_page,
        # Same fallbacks as before for a falsy IP / status.
        "IP_Address": most_vis_ip if most_vis_ip else "10.130.2.1",
        "Number_of_records": len(records),
        "Status": most_frequent_status if most_frequent_status else 200,
        "Time_Difference_Maximum": max_time_diff,
        "Time_Difference_Mean": mean_time_diff,
        "Time_Difference_Sum": sum_time_diff,
        "Time_Difference_Variance": variance_time_diff,
    }


# A new run starts whenever the date differs from the previous row's date.
# This reproduces the row-by-row scan without depending on the index labels:
# rows are grouped only with their neighbours, never with a later block that
# happens to carry the same date.
date_keys = dataset["Timestamp"].str[:10]
run_ids = date_keys.ne(date_keys.shift()).cumsum()

# Collect plain records and build the frame once instead of enlarging it row
# by row with .loc, which copies the frame on every insertion.
feature_rows = [
    _summarize_records(records)
    for _, records in dataset.groupby(run_ids, sort=False)
]

work_dataset = pd.DataFrame(feature_rows, columns=FEATURE_COLUMNS)
# Keep the 1-based row labels the table has always used.
work_dataset.index = pd.RangeIndex(1, len(work_dataset) + 1)

In [None]:
# Inspect the per-date feature table built by the previous cell.
work_dataset

In [None]:
def _ip_to_int(value):
    """Return the integer value of an IP address; missing values map to 0.

    Stripping the dots and parsing the remaining digits is ambiguous
    ("10.1.12.1" and "10.11.2.1" both become 101121). ``ipaddress`` yields
    the address's own integer value, which is unique per address.
    """
    text = str(value).strip()
    if text == "nan":  # missing value -> default address 0.0.0.0
        text = "0.0.0.0"
    return int(ipaddress.ip_address(text))


# Read the column by name, not by position: once "IP_Address" has been
# dropped, position 4 holds a different feature and a re-run would silently
# encode the wrong values.
Ip_rep = [_ip_to_int(value) for value in work_dataset["IP_Address"]]

# Convert list to NumPy array and add to DataFrame
work_dataset["IP_rep"] = np.array(Ip_rep)

In [None]:
# Inspect the table again, now including the numeric IP_rep column.
work_dataset

In [None]:
# Drop the textual address; IP_rep carries it in numeric form. Rebinding the
# name (instead of inplace=True) together with errors="ignore" keeps the cell
# safe to run more than once.
work_dataset = work_dataset.drop(columns="IP_Address", errors="ignore")

In [None]:
# Inspect the table after the IP_Address column has been removed.
work_dataset

In [None]:
# Feature matrix: every row and column of work_dataset as a NumPy array.
X = work_dataset.to_numpy()

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the feature matrix and rebind X to the scaled
# array; every clustering and plotting cell below works on this scaled X.
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
X

In [None]:
from sklearn.cluster import KMeans

# n_init is pinned explicitly: its default differs between scikit-learn
# releases, so leaving it out makes the clustering depend on the installed
# version (and emits a FutureWarning on some releases).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=0)

In [None]:
# Fit the 5-cluster KMeans model on the scaled feature matrix.
kmeans.fit(X)

In [None]:
# Cluster label of every row of X (the same data the model was fitted on).
y_kmeans = kmeans.predict(X)
y_kmeans

In [None]:
# Scatter the scaled samples using X column 4 on the x-axis and column 3 on
# the y-axis, one colour per KMeans label 0-4.
kmeans_colors = ("red", "blue", "green", "cyan", "magenta")
for cluster_id, color in enumerate(kmeans_colors):
    in_cluster = y_kmeans == cluster_id
    plt.scatter(
        X[in_cluster, 4],
        X[in_cluster, 3],
        c=color,
        label=f"Cluster {cluster_id + 1}",
    )
# plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title("K_means_Clustering")
plt.xlabel("Number_of_records")
plt.ylabel("Count_of_most_visited_pages")
plt.legend()
plt.show()

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlation of the feature columns (IP_Address was dropped in an
# earlier cell; IP_rep is included), drawn as a heatmap with square cells.
cor = work_dataset.corr()
sns.heatmap(cor, square=True)
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation

In [None]:
# Agglomerative clustering of the scaled matrix X into 5 groups with Ward
# linkage and Euclidean distance; fit_predict returns one label per row.
# NOTE(review): the `metric` keyword is only accepted by scikit-learn
# releases that have replaced `affinity` - confirm the installed version.
model = AgglomerativeClustering(n_clusters=5, metric="euclidean", linkage="ward")
clust_labels1 = model.fit_predict(X)
clust_labels1

In [None]:
# Wrap the agglomerative labels in a one-column DataFrame. The name is not
# referenced again in the cells visible here.
agglomerative = pd.DataFrame(clust_labels1)

In [None]:
# Scatter the scaled samples using X column 4 on the x-axis and column 3 on
# the y-axis, one colour per agglomerative label 0-4.
agglomerative_colors = ("red", "blue", "green", "cyan", "magenta")
for cluster_id, color in enumerate(agglomerative_colors):
    in_cluster = clust_labels1 == cluster_id
    plt.scatter(
        X[in_cluster, 4],
        X[in_cluster, 3],
        c=color,
        label=f"Cluster {cluster_id + 1}",
    )
plt.title("Agglomerative Clustering")
plt.xlabel("Number_of_records")
plt.ylabel("Count_of_most_visited_pages")
plt.legend()
plt.show()

In [None]:
# Affinity propagation chooses the number of clusters itself. random_state
# is fixed, as in the KMeans and SpectralClustering cells, so that a
# restart-and-run-all produces the same labels.
model_affinity = AffinityPropagation(
    damping=0.5, max_iter=250, affinity="euclidean", random_state=0
)
model_affinity.fit(X)
clust_labels2 = model_affinity.predict(X)  # one label per row of X
cent2 = model_affinity.cluster_centers_
affinity = pd.DataFrame(clust_labels2)

In [None]:
# Affinity propagation decides the number of clusters on its own, so plot
# every label that actually occurs instead of assuming exactly 0-4 (points
# in any further cluster used to be left off the figure).
affinity_colors = ["red", "blue", "green", "cyan", "magenta"]
overflow_cmap = plt.get_cmap("tab20")
for cluster_id in np.unique(clust_labels2):
    if cluster_id < 0:
        # A negative label means the sample was not assigned to any cluster.
        color, label = "gray", "Unassigned"
    elif cluster_id < len(affinity_colors):
        # Clusters 0-4 keep the colours used by the other clustering plots.
        color, label = affinity_colors[cluster_id], f"Cluster {cluster_id + 1}"
    else:
        palette_slot = (cluster_id - len(affinity_colors)) % overflow_cmap.N
        color, label = overflow_cmap(palette_slot), f"Cluster {cluster_id + 1}"
    in_cluster = clust_labels2 == cluster_id
    # X column 4 on the x-axis, column 3 on the y-axis, as in the other plots.
    plt.scatter(X[in_cluster, 4], X[in_cluster, 3], color=color, label=label)
plt.title("Affinity Propagation Clustering")
plt.xlabel("Number_of_records")
plt.ylabel("Count_of_most_visited_pages")
plt.legend()
plt.show()

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
# Spectral clustering of the scaled matrix X into 5 groups; labels are
# assigned with the "discretize" strategy and random_state is pinned to 0.
clustering = SpectralClustering(
    n_clusters=5, assign_labels="discretize", random_state=0
).fit(X)
clust_labels3 = clustering.labels_

In [None]:
# Display the spectral-clustering label of every row of X.
clust_labels3

In [None]:
# Scatter the scaled samples using X column 4 on the x-axis and column 3 on
# the y-axis, one colour per spectral-clustering label 0-4.
spectral_colors = ("red", "blue", "green", "cyan", "magenta")
for cluster_id, color in enumerate(spectral_colors):
    in_cluster = clust_labels3 == cluster_id
    plt.scatter(
        X[in_cluster, 4],
        X[in_cluster, 3],
        c=color,
        label=f"Cluster {cluster_id + 1}",
    )
plt.title("Spectral Clustering")
plt.xlabel("Number_of_records")
plt.ylabel("Count_of_most_visited_pages")
plt.legend()
plt.show()