In [19]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [20]:
import datetime

In [21]:
dataset = pd.read_csv('weblog.csv')
dataset.head()

Unnamed: 0,IP,Time,URL,Status
0,10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1 ...,200
1,10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1 ...,302
2,10.128.2.1,[29/Nov/2017:06:59:03,GET /home.php HTTP/1.1 ...,200
3,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js HTTP/1.1 ...,200
4,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1 ...,200


In [22]:
dataset.iloc[0:1,0]

0    10.128.2.1 
Name: IP         , dtype: object

In [23]:
df = dataset.iloc[:580,:]
df.head()

Unnamed: 0,IP,Time,URL,Status
0,10.128.2.1,[29/Nov/2017:06:58:55,GET /login.php HTTP/1.1 ...,200
1,10.128.2.1,[29/Nov/2017:06:59:02,POST /process.php HTTP/1.1 ...,302
2,10.128.2.1,[29/Nov/2017:06:59:03,GET /home.php HTTP/1.1 ...,200
3,10.131.2.1,[29/Nov/2017:06:59:04,GET /js/vendor/moment.min.js HTTP/1.1 ...,200
4,10.130.2.1,[29/Nov/2017:06:59:06,GET /bootstrap-3.3.7/js/bootstrap.js HTTP/1.1 ...,200


In [24]:
data = {'IP_Address':['10.128.2.1'], 'Time_Difference_Mean':[99], 'Time_Difference_Variance':[54], 'Time_Difference_Sum':[105], 'Time_Difference_Maximum':[15], 'Character-bigrams':[45], 'Character-trigrams':[34], 'Character-ngrams':[74], 'Count_of_most_visited_page':[14], 'Status':[200], 'Number_of_records':[40]}
work_dataset = pd.DataFrame(data)
work_dataset

Unnamed: 0,IP_Address,Time_Difference_Mean,Time_Difference_Variance,Time_Difference_Sum,Time_Difference_Maximum,Character-bigrams,Character-trigrams,Character-ngrams,Count_of_most_visited_page,Status,Number_of_records
0,10.128.2.1,99,54,105,15,45,34,74,14,200,40


In [25]:
def time_stats(dataframe):
    """
    Calculate time-based statistics from weblog time format [DD/MMM/YYYY:HH:MM:SS

    Args:
        dataframe: pandas DataFrame with time values in format [DD/MMM/YYYY:HH:MM:SS
    Returns:
        tuple: (max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff)
    """
    if len(dataframe) <= 1:
        return 0, 0, 0, 0

    # Extract time from the format [DD/MMM/YYYY:HH:MM:SS
    def extract_time(time_str):
        # Remove leading [ and get the time part after :
        time_part = time_str.split(":", 1)[1].split(" ")[0]
        return datetime.datetime.strptime(time_part, "%H:%M:%S")

    # Process all times at once
    try:
        processed_times = [
            extract_time(str(time)) for time in dataframe.iloc[:, 1].values
        ]

        # Calculate time differences in seconds
        time_diffs = [
            (processed_times[i] - processed_times[i - 1]).seconds
            for i in range(1, len(processed_times))
        ]

        if not time_diffs:
            return 0, 0, 0, 0

        # Calculate statistics
        max_time_diff = max(time_diffs)
        sum_time_diff = sum(time_diffs)
        mean_time_diff = sum_time_diff / len(time_diffs)

        # Calculate variance
        variance_time_diff = sum(
            (diff - mean_time_diff) ** 2 for diff in time_diffs
        ) / len(time_diffs)

        return max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff

    except (ValueError, IndexError) as e:
        print(f"Error processing times: {e}")
        return 0, 0, 0, 0

In [26]:
time_stats(df)

(23466, 95.58376511226253, 55343, 1112702.1497191428)

In [27]:
import re
import string
import nltk
import nltk.corpus
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import bigrams

In [28]:
def most_frequent(List): 
    return max(set(List), key = List.count)

In [29]:
def bigram_stats(dataframe):
    url_array = dataframe.iloc[:,2].values
    full_string = ""
    for url in url_array:
        full_string = full_string + url
    full_string = full_string.lower()
    full_string = re.sub(r'\d+', '', full_string)
    table = str.maketrans({key: None for key in string.punctuation})
    full_string = full_string.translate(table) 
    full_string = full_string.strip()
    full_string_tokens = word_tokenize(full_string)
    for char in full_string_tokens:
        if char == 'get':
            full_string_tokens.remove('get')
    for char in full_string_tokens:
        if char == 'httpget':
            full_string_tokens.remove('httpget')
    for char in full_string_tokens:
        if char == 'httppost':
            full_string_tokens.remove('httppost')
    for char in full_string_tokens:
        if char == 'post':
            full_string_tokens.remove('post')
    
    count_most_visited_page = 0
    if len(full_string_tokens)!=0:
        most_visited_page = most_frequent(full_string_tokens)
        count_most_visited_page = full_string_tokens.count(most_visited_page)
        
    
    count_most_appearing_bigram = 0
    bigrams = list(nltk.bigrams(full_string_tokens))
    if len(bigrams)!=0:
        most_appearing_bigram = most_frequent(bigrams)
        count_most_appearing_bigram = bigrams.count(most_appearing_bigram)
    
    count_most_appearing_trigram = 0
    trigrams = list(nltk.trigrams(full_string_tokens))
    if len(trigrams)!=0:
        most_appearing_trigram = most_frequent(trigrams)
        count_most_appearing_trigram = trigrams.count(most_appearing_trigram)
    
    count_most_appearing_ngram = 0
    ngrams = list(nltk.ngrams(full_string_tokens, 6))
    if len(ngrams)!=0:
        most_appearing_ngram = most_frequent(ngrams)
        count_most_appearing_ngram = ngrams.count(most_appearing_ngram)
        
    return count_most_visited_page, count_most_appearing_bigram, count_most_appearing_trigram, count_most_appearing_ngram

In [30]:
bigram_stats(df)

(580, 79, 79, 17)

In [31]:
from collections import Counter

In [32]:
def most_visited_ip(dataframe):
    ip = dataframe.iloc[:,0].values
    x = Counter(ip)
    return x.most_common(1)[0][0]

In [33]:
most_visited_ip(df)

'10.131.2.1 '

In [34]:
def most_freq_status(dataframe):
    status = dataframe.iloc[:,3].values
    x = Counter(status)
    return x.most_common(1)[0][0]

In [35]:
most_freq_status(df)

200

In [36]:
startindex = 0
endindex = 0
prevdate = ""
p = 1
for i,row in dataset.iterrows():
    
    date = row['Time'][1:12]
    if i==0:
        prevdate = date
    else:
        if date == prevdate:
            endindex = endindex + 1
        else:
            data = dataset.iloc[startindex:endindex+1, :]
            max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff = time_stats(data)
            #print(max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff)
            count_most_visited_page, count_most_appearing_bigram, count_most_appearing_trigram, count_most_appearing_ngram = bigram_stats(data)
            #print(count_most_visited_page, count_most_appearing_bigram, count_most_appearing_trigram, count_most_appearing_ngram)
            most_vis_ip = most_visited_ip(data)
            most_frequent_status = most_freq_status(data)
            #print(most_vis_ip, most_frequent_status, end = ' ')
            #print()
            if most_vis_ip == None:
                most_vis_ip = '10.130.2.1'
            if most_frequent_status == None:
                most_frequent_status = 200
            work_dataset.loc[p]=[count_most_appearing_bigram, count_most_appearing_ngram, count_most_appearing_trigram, count_most_visited_page, most_vis_ip, endindex - startindex + 1, most_frequent_status, max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff]
            startindex = endindex
            #endindex = endindex + 1
            prevdate = date
            p = p + 1
            #print(max_time_diff, mean_time_diff, sum_time_diff, variance_time_diff)

KeyError: 'Time'

In [None]:
work_dataset

In [None]:
Ip_rep = np.array([])
for i,row in work_dataset.iterrows():
    ip = row[4]
    ip = ip.replace('.', '')
    a = int(ip)
    Ip_rep = np.append(Ip_rep, a)
Ip_rep
work_dataset['IP_rep'] = Ip_rep

In [None]:
work_dataset

In [None]:
work_dataset.drop('IP_Address', axis=1, inplace=True)

In [None]:
work_dataset

In [None]:
X = work_dataset.iloc[:,:].values

In [None]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X = sc_X.fit_transform(X)
X

In [None]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 5,random_state = 0)

In [None]:
kmeans.fit(X)

In [None]:
y_kmeans = kmeans.predict(X)
y_kmeans

In [None]:
plt.scatter(X[y_kmeans == 0, 4], X[y_kmeans == 0, 3], c = 'red', label = 'Cluster 1')
plt.scatter(X[y_kmeans == 1, 4], X[y_kmeans == 1, 3], c = 'blue', label = 'Cluster 2')
plt.scatter(X[y_kmeans == 2, 4], X[y_kmeans == 2, 3], c = 'green', label = 'Cluster 3')
plt.scatter(X[y_kmeans == 3, 4], X[y_kmeans == 3, 3], c = 'cyan', label = 'Cluster 4')
plt.scatter(X[y_kmeans == 4, 4], X[y_kmeans == 4, 3], c = 'magenta', label = 'Cluster 5')
#plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('K_means_Clustering')
plt.xlabel('Number_of_records')
plt.ylabel('Count_of_most_visited_pages')
plt.legend()
plt.show()

NameError: name 'X' is not defined

In [None]:
import seaborn as sns

In [None]:
cor = work_dataset.corr()
sns.heatmap(cor, square = True)
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering, AffinityPropagation 

In [None]:
model = AgglomerativeClustering(n_clusters=5, affinity = 'euclidean', linkage = 'ward')
clust_labels1 = model.fit_predict(X)
clust_labels1

In [None]:
agglomerative = pd.DataFrame(clust_labels1)

In [None]:
plt.scatter(X[clust_labels1 == 0, 4], X[clust_labels1 == 0, 3], c = 'red', label = 'Cluster 1')
plt.scatter(X[clust_labels1 == 1, 4], X[clust_labels1 == 1, 3], c = 'blue', label = 'Cluster 2')
plt.scatter(X[clust_labels1 == 2, 4], X[clust_labels1 == 2, 3], c = 'green', label = 'Cluster 3')
plt.scatter(X[clust_labels1 == 3, 4], X[clust_labels1 == 3, 3], c = 'cyan', label = 'Cluster 4')
plt.scatter(X[clust_labels1 == 4, 4], X[clust_labels1 == 4, 3], c = 'magenta', label = 'Cluster 5')
plt.title('Agglomerative Clustering')
plt.xlabel('Number_of_records')
plt.ylabel('Count_of_most_visited_pages')
plt.legend()
plt.show()

In [None]:
model_affinity = AffinityPropagation(damping = 0.5, max_iter = 250, affinity = 'euclidean')
model_affinity.fit(X)
clust_labels2 = model_affinity.predict(X)
cent2 = model_affinity.cluster_centers_
affinity = pd.DataFrame(clust_labels2)

In [None]:
plt.scatter(X[clust_labels2 == 0, 4], X[clust_labels2 == 0, 3], c = 'red', label = 'Cluster 1')
plt.scatter(X[clust_labels2 == 1, 4], X[clust_labels2 == 1, 3], c = 'blue', label = 'Cluster 2')
plt.scatter(X[clust_labels2 == 2, 4], X[clust_labels2 == 2, 3], c = 'green', label = 'Cluster 3')
plt.scatter(X[clust_labels2 == 3, 4], X[clust_labels2 == 3, 3], c = 'cyan', label = 'Cluster 4')
plt.scatter(X[clust_labels2 == 4, 4], X[clust_labels2 == 4, 3], c = 'magenta', label = 'Cluster 5')
plt.title('Affinity Propagation Clustering')
plt.xlabel('Number_of_records')
plt.ylabel('Count_of_most_visited_pages')
plt.legend()
plt.show()

In [None]:
from sklearn.cluster import SpectralClustering

In [None]:
clustering = SpectralClustering(n_clusters=5, assign_labels="discretize", random_state=0).fit(X)
clust_labels3 = clustering.labels_

In [None]:
clust_labels3

In [None]:
plt.scatter(X[clust_labels3 == 0, 4], X[clust_labels3 == 0, 3], c = 'red', label = 'Cluster 1')
plt.scatter(X[clust_labels3 == 1, 4], X[clust_labels3 == 1, 3], c = 'blue', label = 'Cluster 2')
plt.scatter(X[clust_labels3 == 2, 4], X[clust_labels3 == 2, 3], c = 'green', label = 'Cluster 3')
plt.scatter(X[clust_labels3 == 3, 4], X[clust_labels3 == 3, 3], c = 'cyan', label = 'Cluster 4')
plt.scatter(X[clust_labels3 == 4, 4], X[clust_labels3 == 4, 3], c = 'magenta', label = 'Cluster 5')
plt.title('Spectral Clustering')
plt.xlabel('Number_of_records')
plt.ylabel('Count_of_most_visited_pages')
plt.legend()
plt.show()