In [17]:
import pandas as pd

In [18]:
# Read the file line by line
with open('data/test_log1.out', 'r') as f:
    lines = f.readlines()

# Create a DataFrame
df = pd.DataFrame(lines, columns=['Text'])

# Display the first 5 rows of the DataFrame
print(df.head())

                                                Text
0  Nov 09 13:11:35 localhost kernel: Linux versio...
1  Nov 09 13:11:35 localhost kernel: Command line...
2  Nov 09 13:11:35 localhost kernel: x86/fpu: Sup...
3  Nov 09 13:11:35 localhost kernel: x86/fpu: Sup...
4  Nov 09 13:11:35 localhost kernel: x86/fpu: Sup...


# Extract the inherent structure of the data into a dataframe

In [19]:
# Split the 'Text' column at the 5th space
df = df['Text'].str.split(' ', n=5, expand=True)

## Timestamp

In [20]:
df['timestamp'] = df[[0, 1, 2]].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df.drop(columns=[0, 1, 2], inplace=True)

## Device

In [21]:
df['device'] = df[3]
df.drop(columns=[3], inplace=True)

## Program

In [22]:
df['program'] = df.loc[:, 4].str.replace(r'(\[.*\])?:', '', regex=True)
df.drop(columns=[4], inplace=True)

## Renaming and datatype of timestamp

In [23]:
df['log'] = df[5]
df.drop(columns=[5], inplace=True)
df.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], format='%b %d %H:%M:%S', errors='coerce')

## Further preprocessing

In [24]:
df.loc[:, 'log'] = df.loc[:, 'log'].str.lower()

In [25]:
df.loc[:, 'timestamp'] = pd.to_datetime(df['timestamp'], format='%b %d %H:%M:%S', errors='coerce')

## Set up backend handler funcs

In [28]:
# Update the year of every timestamp to 2023
df.loc[:, 'timestamp'] = df['timestamp'].apply(lambda x: x.replace(year=2023))

In [31]:
# Select all rows within a time range
df.loc[(df['timestamp'] > pd.to_datetime('2023-11-09 13:11:00')) & (df['timestamp'] < pd.to_datetime('2023-11-09 13:11:50'))]

Unnamed: 0,timestamp,device,program,log
0,2023-11-09 13:11:35,localhost,kernel,linux version 5.15.73 (oe-user@oe-host) (x86_6...
1,2023-11-09 13:11:35,localhost,kernel,command line: boot_image=/boot/vmlinuz root=/d...
2,2023-11-09 13:11:35,localhost,kernel,x86/fpu: supporting xsave feature 0x001: 'x87 ...
3,2023-11-09 13:11:35,localhost,kernel,x86/fpu: supporting xsave feature 0x002: 'sse ...
4,2023-11-09 13:11:35,localhost,kernel,x86/fpu: supporting xsave feature 0x004: 'avx ...
...,...,...,...,...
2940,2023-11-09 13:11:49,CMX50070-101776,kernel,cmx_au_ioctl_set_clear_sig_fpga call\n
2941,2023-11-09 13:11:49,CMX50070-101776,kernel,cmx_ioctl_get_slot_id call\n
2942,2023-11-09 13:11:49,CMX50070-101776,kernel,cmx_au_ioctl_dma_dl_fpga call\n
2943,2023-11-09 13:11:49,CMX50070-101776,xu_launcher,front unit ld has been started.\n


In [32]:
df['timestamp']

0        2023-11-09 13:11:35
1        2023-11-09 13:11:35
2        2023-11-09 13:11:35
3        2023-11-09 13:11:35
4        2023-11-09 13:11:35
                ...         
29290    2023-11-10 05:49:15
29291    2023-11-10 05:49:15
29292    2023-11-10 05:49:15
29293    2023-11-10 05:49:15
29294    2023-11-10 05:49:15
Name: timestamp, Length: 29295, dtype: object