In [24]:
import pandas as pd
import numpy as np
import re 


In [25]:
def extract_log_info_from_file(file_path):
    """Parse a syslog-style text file into a DataFrame of its fields.

    The whole file is read into memory and scanned with a single regular
    expression; text that does not match the expression is ignored.

    Parameters
    ----------
    file_path : path of the log file to read (opened in text mode).

    Returns
    -------
    pandas.DataFrame
        One row per regex match, all values as strings, with the columns
        'Month', 'Day', 'Time', 'Hostname', 'Service', 'PAM_UNIX',
        'Process_Id' and 'Message'. 'PAM_UNIX' and 'Process_Id' are empty
        strings when the optional "(...)" / "[...]" parts are absent.
        'Message' is everything after the first colon that follows the
        service part, up to the end of the line (leading whitespace is kept).
    """
    entry_regex = re.compile(
        r'(\w{3})\s+(\d{1,2})\s(\d{2}:\d{2}:\d{2})\s(\w+)\s(\w+)(\(\w+\))?(\[\d+\])?:(.*)'
    )
    column_names = ['Month', 'Day', 'Time', 'Hostname', 'Service', 'PAM_UNIX', 'Process_Id', 'Message']

    with open(file_path, 'r') as handle:
        raw_text = handle.read()

    records = []
    for month, day, clock, host, service, pam_group, pid_group, message in entry_regex.findall(raw_text):
        # findall yields '' for an optional group that did not take part in the
        # match; otherwise the group is "(name)" / "[digits]", so dropping the
        # first and last character removes exactly the delimiters.
        pam_name = pam_group[1:-1]
        pid = pid_group[1:-1]
        records.append([month, day, clock, host, service, pam_name, pid, message])

    return pd.DataFrame(records, columns=column_names)

# Extract log information from the file
# NOTE(review): file_path is not defined in the visible cells — confirm it is
# set earlier in the notebook so that Restart & Run All works.
log_df = extract_log_info_from_file(file_path)
# Keep only the message text. The explicit .copy() makes log_df an independent
# frame, so the feature columns assigned to it in later cells do not raise
# pandas' SettingWithCopyWarning.
log_df = log_df[['Message']].copy()
log_df.head()

Unnamed: 0,Message
0,syslogd startup succeeded
1,klogd startup succeeded
2,"klogd 1.4.1, log source = /proc/kmsg started."
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...
4,BIOS-provided physical RAM map:


In [26]:
# Number of whitespace-separated words in each message
log_df['Message_length'] = [len(message.split()) for message in log_df['Message']]
log_df.head()

Unnamed: 0,Message,Message_length
0,syslogd startup succeeded,3
1,klogd startup succeeded,3
2,"klogd 1.4.1, log source = /proc/kmsg started.",7
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19
4,BIOS-provided physical RAM map:,4


In [27]:
# Number of words per message that consist only of letters (str.isalpha)
log_df['Alphabetical_words'] = log_df['Message'].apply(
    lambda message: sum(token.isalpha() for token in message.split())
)
log_df.head()

Unnamed: 0,Message,Message_length,Alphabetical_words
0,syslogd startup succeeded,3,3
1,klogd startup succeeded,3,3
2,"klogd 1.4.1, log source = /proc/kmsg started.",7,3
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19,8
4,BIOS-provided physical RAM map:,4,2


In [28]:
# Number of words per message that consist only of numeric characters (str.isnumeric)
log_df['Numerical_words'] = log_df['Message'].apply(
    lambda message: sum(token.isnumeric() for token in message.split())
)
log_df.head()

Unnamed: 0,Message,Message_length,Alphabetical_words,Numerical_words
0,syslogd startup succeeded,3,3,0
1,klogd startup succeeded,3,3,0
2,"klogd 1.4.1, log source = /proc/kmsg started.",7,3,0
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19,8,3
4,BIOS-provided physical RAM map:,4,2,0


In [29]:
# Number of words per message made up solely of letters and/or digits (str.isalnum)
log_df['Alphanumeric_words'] = log_df['Message'].apply(
    lambda message: sum(token.isalnum() for token in message.split())
)
log_df.head()

Unnamed: 0,Message,Message_length,Alphabetical_words,Numerical_words,Alphanumeric_words
0,syslogd startup succeeded,3,3,0,3
1,klogd startup succeeded,3,3,0,3
2,"klogd 1.4.1, log source = /proc/kmsg started.",7,3,0,3
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19,8,3,11
4,BIOS-provided physical RAM map:,4,2,0,2


In [30]:
# Number of words per message that contain at least one non-alphanumeric
# character (this counts words, not individual special characters)
log_df['Special_words'] = log_df['Message'].apply(
    lambda message: sum(not token.isalnum() for token in message.split())
)
log_df.head()

Unnamed: 0,Message,Message_length,Alphabetical_words,Numerical_words,Alphanumeric_words,Special_words
0,syslogd startup succeeded,3,3,0,3,0
1,klogd startup succeeded,3,3,0,3,0
2,"klogd 1.4.1, log source = /proc/kmsg started.",7,3,0,3,4
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19,8,3,11,8
4,BIOS-provided physical RAM map:,4,2,0,2,2


In [31]:
def compare_words(chaine1: str, chaine2: str) -> bool:
    """Return True when both strings contain the same words in the same order.

    Each string is split on runs of whitespace, so differences in spacing
    (extra spaces, tabs, leading/trailing whitespace) are ignored. The word
    comparison itself is exact and case-sensitive.

    Parameters
    ----------
    chaine1 : first string to compare.
    chaine2 : second string to compare.

    Returns
    -------
    bool
        True if the two word sequences have the same length and are equal
        word for word, False otherwise.
    """
    # List equality already checks both the length and every element pairwise.
    return chaine1.split() == chaine2.split()

In [34]:
# Count how many messages share each combination of word-count features.
# Only the 'Message' column is counted, so the result always has a single
# 'Occurences' column even if log_df has gained extra columns by the time
# this cell is (re-)run.
feature_columns = ['Message_length', 'Alphabetical_words', 'Numerical_words', 'Alphanumeric_words', 'Special_words']
grouped_df = (
    log_df.groupby(feature_columns)['Message']
    .count()
    .reset_index(name='Occurences')  # column name kept as spelled in the original cell
)
grouped_df.head()

Unnamed: 0,Message_length,Alphabetical_words,Numerical_words,Alphanumeric_words,Special_words,Occurences
0,0,0,0,0,0,93
1,1,0,0,0,1,18
2,1,1,0,1,0,34
3,2,0,0,0,2,1104
4,2,1,0,1,1,113


In [35]:
## Group the messages by their five word-count feature columns.
## grouped_df is a GroupBy object here; head() shows the first rows of every group.
group_keys = ['Message_length', 'Alphabetical_words', 'Numerical_words', 'Alphanumeric_words', 'Special_words']
grouped_df = log_df.groupby(by=group_keys)
grouped_df.head()

Unnamed: 0,Message,Message_length,Alphabetical_words,Numerical_words,Alphanumeric_words,Special_words
0,syslogd startup succeeded,3,3,0,3,0
1,klogd startup succeeded,3,3,0,3,0
2,"klogd 1.4.1, log source = /proc/kmsg started.",7,3,0,3,4
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19,8,3,11,8
4,BIOS-provided physical RAM map:,4,2,0,2,2
...,...,...,...,...,...,...
23826,audit(1138296165.211:606989): avc: denied {...,12,3,0,3,9
23831,audit(1138296165.211:606989): item=1 inode=11...,4,0,0,0,4
23910,audit(1138296212.406:734909): avc: denied {...,15,2,0,2,13
23969,audit(1138296225.596:965626): avc: denied {...,13,3,0,3,10


In [37]:
# cluster the messages using kmeans
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Number of clusters used by this analysis (previously an inline literal).
N_CLUSTERS = 19
cluster_features = ['Message_length', 'Alphabetical_words', 'Numerical_words', 'Alphanumeric_words', 'Special_words']

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(log_df[cluster_features])

# Fit KMeans and predict the clusters.
# n_init is passed explicitly: the warning recorded in this cell's output
# reports default_n_init=10, so this keeps that behaviour and no longer
# relies on the library default.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0).fit(X)
log_df['Cluster'] = kmeans.labels_
log_df.head()




  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,Message,Message_length,Alphabetical_words,Numerical_words,Alphanumeric_words,Special_words,Cluster
0,syslogd startup succeeded,3,3,0,3,0,2
1,klogd startup succeeded,3,3,0,3,0,2
2,"klogd 1.4.1, log source = /proc/kmsg started.",7,3,0,3,4,15
3,Linux version 2.6.5-1.358 (bhcompile@bugs.bui...,19,8,3,11,8,13
4,BIOS-provided physical RAM map:,4,2,0,2,2,5


In [39]:
# Show the first messages assigned to cluster 11
in_cluster_11 = log_df['Cluster'] == 11
log_df.loc[in_cluster_11].head()

Unnamed: 0,Message,Message_length,Alphabetical_words,Numerical_words,Alphanumeric_words,Special_words,Cluster
19,You can enable it with acpi=force,6,5,0,5,1,11
52,POSIX conformance testing by UNIFIX,5,5,0,5,0,11
166,Disabled Privacy Extensions on device 022db72...,6,5,0,5,1,11
233,session closed for user htt,5,5,0,5,0,11
339,session closed for user cyrus,5,5,0,5,0,11
