In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM; the mail CSV is read from
# /content/drive/MyDrive further down.
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing

In [None]:
import pandas as pd
import html
import re

In [None]:
# Load the mail dump. Later cells read its 'content', 'date' and 'content_url'
# columns and index rows by position, so the default integer index is kept.
mails = pd.read_csv('/content/drive/MyDrive/kluza/all_mails.csv', header=0)

Unescaping HTML entities, dropping quoted reply text (everything from the first line that starts with `>`), and replacing line breaks with spaces in the content.

In [None]:
def clear_text(text):
  """Return the author's own part of a mail body as a single line.

  Steps, in order:
    1. HTML entities (``&amp;``, ``&lt;`` ...) are unescaped.
    2. Everything from the first line that starts with ``>`` is dropped,
       i.e. the quoted text of the mail being replied to.
    3. Remaining line breaks are replaced by single spaces.

  Args:
    text: Raw mail body. Anything that is not a ``str`` (for example the
      float NaN pandas uses for an empty CSV cell, or ``None``) is treated
      as an empty body.

  Returns:
    The cleaned text, or ``''`` when there is no usable body.
  """
  # html.unescape only accepts strings; without this guard a single empty
  # cell in the column aborts the whole .apply() with a TypeError.
  if not isinstance(text, str):
    return ''

  cleared_text = html.unescape(text)
  # Keep only what precedes the first quoted ("\n>") line.
  cleared_text = cleared_text.split('\n>')[0]
  cleared_text = cleared_text.replace('\n', ' ')
  return cleared_text
# Clean every mail body element-wise; the raw 'content' column is overwritten.
mails['content'] = mails['content'].map(clear_text)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora behind the stopword list and the WordNet lemmatizer
# that the preprocessing cell below relies on.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Removing more unnecessary characters, symbols, and punctuation. Removing URLs as well, since without their punctuation they would otherwise stay in the content as long tokens starting with 'https'. Then removing stopwords and finally lemmatizing.

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one mail body into a space-separated string of lemmas.

    Steps, in order:
      1. URLs (``http://`` and ``https://``) are removed.
      2. Every character that is neither a word character nor whitespace
         is removed.
      3. The text is lowercased and split on whitespace.
      4. English stopwords are dropped.
      5. Each remaining token is lemmatized with WordNet.

    Args:
        text: Mail body. Non-string values (e.g. NaN from an empty CSV
            cell) are treated as an empty body.

    Returns:
        The processed tokens joined by single spaces ('' if nothing is left).
    """
    if not isinstance(text, str):
        return ''

    # URLs must go while their "://" is still intact. Stripping punctuation
    # first and then deleting words that start with "https" also deletes
    # ordinary identifiers such as "HttpServletRequest" or "HttpSession",
    # and lets plain "http://" links through as one long glued token.
    cleaned_text = re.sub(r'https?://\S+', ' ', text, flags=re.IGNORECASE)

    # Remove unnecessary characters, symbols, and punctuation
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

    cleaned_text = cleaned_text.lower()

    # Remove stopwords
    filtered_tokens = [token for token in cleaned_text.split()
                       if token not in stop_words]

    # Lemmatization.
    # NOTE(review): lemmatize() is called without a part-of-speech tag; the
    # sample outputs still contain forms like "commented" and "opened", so
    # verb inflections appear to be left untouched - confirm this is intended.
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return ' '.join(lemmatized_tokens)

mails['content'] = mails['content'].apply(preprocess_text)

# Observation

In [None]:
mails['content'][100]

'work camel11108 started luca burgazzoli'

In [None]:
mails['content'][101]

'asf github bot commented camel11108 github user lburgazzoli opened pull request wip camel11108 camelinfinispan change uri syntax infinispanhostname infinispancachename tristantarrant may check missed something infinispan side merge pull request git repository running git pull camel11108 alternatively review apply change patch close pull request make commit mastertrunk branch least following commit message close 1604 commit 14fa5e5156e3e0e0e76cfdd23427a06febc27714 author lburgazzoli lburgazzoligmailcom date 20170406t131154z camel11108 camelinfinispan change uri syntax infinispanhostname infinispancachename'

In [None]:
mails['content_url'][101]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13061654.1491378117000.225309.1491493301861%40Atlassian.JIRA%3E'

# Event log extraction

In [None]:
import spacy
from spacy.matcher import Matcher
import re
# Load spaCy's small English pipeline (used below for POS tags, lemmas and
# named entities) and create a token matcher bound to its vocabulary.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

We use natural language processing to find the most frequently repeated events/activities in the mail content by extracting the verbs. (Tokenization and lemmatization are repeated here; it would be possible to skip that part.)

In [None]:
body_text = mails['content']

# verbs maps each verb lemma to the number of times it occurs in the corpus.
# Keys are inserted on first occurrence, which later decides the order of
# verbs that end up with equal counts.
verbs = {}

for text in body_text:
    doc = nlp(text)
    # Only verbs are needed here. The noun chunks that used to be collected
    # per mail were never read, so that work is no longer done.
    for token in doc:
        if token.pos_ == "VERB":
            verbs[token.lemma_] = verbs.get(token.lemma_, 0) + 1

After inspecting the verb counts, we decided to set the minimum repetition threshold to 31. Verbs that occur less often seem less relevant.

In [None]:
# Rank the verb lemmas by descending frequency and keep only those that occur
# at least 31 times (the cut-off chosen above).
ranked_verbs = sorted(verbs.items(), key=lambda pair: pair[1], reverse=True)
sorted_verbs = {verb: count for verb, count in ranked_verbs if count >= 31}

In [None]:
sorted_verbs

{'comment': 330,
 'fix': 218,
 'create': 206,
 'send': 200,
 'update': 200,
 'resolve': 183,
 'use': 177,
 'commit': 134,
 'make': 111,
 'pull': 107,
 'add': 102,
 'close': 94,
 'run': 91,
 'take': 81,
 'need': 80,
 'get': 77,
 'follow': 76,
 'fail': 67,
 'set': 64,
 'allow': 61,
 'affect': 59,
 'open': 56,
 'apply': 56,
 'patch': 56,
 'see': 52,
 'find': 51,
 'merge': 50,
 'seem': 50,
 'reassign': 49,
 'provide': 49,
 'work': 45,
 'think': 44,
 'generate': 37,
 'try': 36,
 'warn': 36,
 'implement': 32,
 'start': 31,
 'request': 31,
 'do': 31}

We have created event patterns based on the most repeated verbs.

In [None]:
# One single-token matcher pattern per frequent verb, in frequency order.
# NOTE(review): the keys are spaCy lemmas, but "LOWER" matches the lowercase
# surface form, so inflected tokens (e.g. "commented") will not match the
# pattern for "comment" - confirm this is intended.
event_patterns = [[{"LOWER": verb}] for verb in sorted_verbs]

In [None]:
event_patterns

[[{'LOWER': 'comment'}],
 [{'LOWER': 'fix'}],
 [{'LOWER': 'create'}],
 [{'LOWER': 'send'}],
 [{'LOWER': 'update'}],
 [{'LOWER': 'resolve'}],
 [{'LOWER': 'use'}],
 [{'LOWER': 'commit'}],
 [{'LOWER': 'make'}],
 [{'LOWER': 'pull'}],
 [{'LOWER': 'add'}],
 [{'LOWER': 'close'}],
 [{'LOWER': 'run'}],
 [{'LOWER': 'take'}],
 [{'LOWER': 'need'}],
 [{'LOWER': 'get'}],
 [{'LOWER': 'follow'}],
 [{'LOWER': 'fail'}],
 [{'LOWER': 'set'}],
 [{'LOWER': 'allow'}],
 [{'LOWER': 'affect'}],
 [{'LOWER': 'open'}],
 [{'LOWER': 'apply'}],
 [{'LOWER': 'patch'}],
 [{'LOWER': 'see'}],
 [{'LOWER': 'find'}],
 [{'LOWER': 'merge'}],
 [{'LOWER': 'seem'}],
 [{'LOWER': 'reassign'}],
 [{'LOWER': 'provide'}],
 [{'LOWER': 'work'}],
 [{'LOWER': 'think'}],
 [{'LOWER': 'generate'}],
 [{'LOWER': 'try'}],
 [{'LOWER': 'warn'}],
 [{'LOWER': 'implement'}],
 [{'LOWER': 'start'}],
 [{'LOWER': 'request'}],
 [{'LOWER': 'do'}]]

We have created separate functions for extracting the elements of the event log.
* In the extract_event function we extract the action based on the event_patterns we created previously.
* In the extract_object function we use a context window (context_window) around the action to detect possible nouns (as tagged by the NLP model) that the action could refer to.
* And finally we added an extract_actor function to detect, again with NLP, the name of the actor who has taken the action.

In [None]:
def extract_event(doc):
    """Return the text of every token in ``doc`` that matches an event pattern.

    Args:
        doc: A spaCy ``Doc`` created with the same vocabulary as the
            module-level ``matcher``.

    Returns:
        A list with one entry per match (the same word appears once per
        occurrence), or ``None`` when nothing matches.
    """
    # Register the patterns a single time. Matcher.add() extends the rules
    # stored under an existing key, so adding them on every call makes the
    # matcher's pattern list grow with each processed mail and slows every
    # later match. Consequence: changes made to event_patterns after the
    # first call are not picked up unless the "Event" key is removed first.
    if event_patterns and "Event" not in matcher:
        matcher.add("Event", event_patterns)

    events = [doc[start:end].text for _match_id, start, end in matcher(doc)]

    return events if events else None

def extract_object(doc, event, context_window=2):
    """Return the first noun found next to an occurrence of ``event``.

    Every token whose lowercase form equals ``event`` is visited in document
    order. For each one, the tokens up to ``context_window`` positions before
    and after it are scanned from left to right, and the text of the first
    token tagged ``NOUN`` is returned.

    The event token itself is never returned: when the tagger labels the
    event word as a noun, the old behaviour reported the event as its own
    object (e.g. Event 'create' / Object 'create').

    Args:
        doc: Sequence of tokens exposing ``lower_``, ``pos_`` and ``text``
            (a spaCy ``Doc``).
        event: Lowercase text of the event token to look for.
        context_window: Number of tokens inspected on each side of the event.

    Returns:
        The text of the neighbouring noun, or ``None`` if the event does not
        occur or no noun lies within the window.
    """
    for i, token in enumerate(doc):
        if token.lower_ != event:
            continue

        start_index = max(0, i - context_window)
        end_index = min(len(doc), i + context_window + 1)

        for j in range(start_index, end_index):
            # Skip position i: an action cannot be its own object.
            if j != i and doc[j].pos_ == 'NOUN':
                return doc[j].text

    return None



def extract_actor(doc):
    """Return the text of the first PERSON entity in ``doc``.

    Args:
        doc: Object exposing ``ents``, an iterable of entities that carry
            ``label_`` and ``text`` (a spaCy ``Doc``).

    Returns:
        The text of the first entity labelled "PERSON", or ``None`` when the
        document contains no such entity.
    """
    person_names = (ent.text for ent in doc.ents if ent.label_ == "PERSON")
    return next(person_names, None)


We create ocel_entries to store the events with their features by detecting them with NLP, and using functions we have created.

In [None]:
# One dict per detected event. 'Id' is the position of the mail in `mails`,
# so several entries share an Id when a mail contains several events.
ocel_entries = []

# Iterate the two columns positionally instead of looking rows up with
# mails[...][i], which only works while the index is the default 0..n-1.
for mail_id, (content, timestamp) in enumerate(zip(mails['content'], mails['date'])):
    doc = nlp(content)
    events = extract_event(doc)
    if not events:
        continue

    # The actor depends only on the mail, not on the individual event, so it
    # is resolved once per mail rather than once per event.
    actor = extract_actor(doc)

    for event in events:
        ocel_entries.append({
            'Id': mail_id,
            'Event': event,
            'Object': extract_object(doc, event),
            'Actor': actor,
            'Timestamp': timestamp,
        })

Id: Referring to the mail id. There can be more than 1 element with the same id since there can be multiple events in a single email.

In [None]:
ocel_entries

[{'Id': 0,
  'Event': 'create',
  'Object': 'create',
  'Actor': 'daniel fullarton',
  'Timestamp': 'Sun, 02 Apr, 23:59'},
 {'Id': 1,
  'Event': 'pull',
  'Object': 'pull',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'request',
  'Object': 'pull',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'resolve',
  'Object': 'request',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'merge',
  'Object': 'pull',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'pull',
  'Object': 'pull',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'request',
  'Object': 'pull',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'pull',
  'Object': 'pull',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Id': 1,
  'Event': 'apply',
  'Object': 'review',
  'Actor': 'asf gith

# Create OCEL

## CSV

In [None]:
import csv

# Column order of the exported file; matches the keys of each ocel_entries dict.
ocel_columns = ['Id', 'Event', 'Object', 'Actor', 'Timestamp']
filename = 'ocel_file.csv'

# newline='' leaves line-ending handling to the csv module.
with open(filename, 'w', newline='') as out_handle:
    entry_writer = csv.DictWriter(out_handle, fieldnames=ocel_columns)
    entry_writer.writeheader()
    for entry in ocel_entries:
        entry_writer.writerow(entry)

## OCEL STANDARD

Here we use pm4py to convert our results to OCEL standard.

In [None]:
!pip install pm4py

Collecting pm4py
  Downloading pm4py-2.7.5-py3-none-any.whl (1.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.7 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m25.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecation (from pm4py)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting intervaltree (from pm4py)
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting stringdist (from pm4py)
  Downloading StringDist-1.0.9.tar.gz (7.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: intervaltree, strin

In [None]:
# One row per extracted event; the columns come from the keys of the
# ocel_entries dicts (Id, Event, Object, Actor, Timestamp).
df = pd.DataFrame(ocel_entries)

In [None]:
df.head()

Unnamed: 0,Id,Event,Object,Actor,Timestamp
0,0,create,create,daniel fullarton,"Sun, 02 Apr, 23:59"
1,1,pull,pull,asf github,"Mon, 03 Apr, 00:01"
2,1,request,pull,asf github,"Mon, 03 Apr, 00:01"
3,1,resolve,request,asf github,"Mon, 03 Apr, 00:01"
4,1,merge,pull,asf github,"Mon, 03 Apr, 00:01"


We need a datetime type in the dataframe in order to convert it to the OCEL standard. The year is not given in the timestamp column, but all the emails were sent in 2017, so we set it manually.

In [None]:
# The raw strings look like "Sun, 02 Apr, 23:59" and carry no year. They are
# parsed first and the year of every resulting timestamp is then set to 2017.
parsed_timestamps = pd.to_datetime(df['Timestamp'], format='%a, %d %b, %H:%M')
df['Timestamp'] = parsed_timestamps.map(lambda stamp: stamp.replace(year=2017))

In [None]:
df.head()

Unnamed: 0,Id,Event,Object,Actor,Timestamp
0,0,create,create,daniel fullarton,2017-04-02 23:59:00
1,1,pull,pull,asf github,2017-04-03 00:01:00
2,1,request,pull,asf github,2017-04-03 00:01:00
3,1,resolve,request,asf github,2017-04-03 00:01:00
4,1,merge,pull,asf github,2017-04-03 00:01:00


In [None]:
mails['content_url'][100]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13061654.1491378117000.223267.1491479681625%40Atlassian.JIRA%3E'

We rename the 'Object' column to 'ocel:oid' because this is also required for the conversion.

In [None]:
# Expose the extracted object under the column name used for the OCEL conversion.
df = df.rename(columns={'Object': 'ocel:oid'})

In [None]:
df.head()

Unnamed: 0,Id,Event,ocel:oid,Actor,Timestamp
0,0,create,create,daniel fullarton,2017-04-02 23:59:00
1,1,pull,pull,asf github,2017-04-03 00:01:00
2,1,request,pull,asf github,2017-04-03 00:01:00
3,1,resolve,request,asf github,2017-04-03 00:01:00
4,1,merge,pull,asf github,2017-04-03 00:01:00


In [None]:
import pm4py

In our case, we do not have different types for the objects, so we give all the objects the same type, 'ocel:oid'.

In [None]:
# Build the object-centric event log: 'Event' is the activity, 'Timestamp'
# the event time, the 'ocel:oid' column is the only object type, and 'Actor'
# is carried along as an extra event attribute.
pm4py_ocel = pm4py.convert.convert_log_to_ocel(
    log=df,
    activity_column='Event',
    timestamp_column='Timestamp',
    object_types=['ocel:oid'],
    additional_event_attributes=['Actor'],
)

In [None]:
# Persist the OCEL as two CSV files (presumably one table for the events and
# one for the objects, as the file names suggest).
ocel_events_path = "/content/ocel_events.csv"
ocel_objects_path = "/content/ocel_objects.csv"
pm4py.write.write_ocel_csv(pm4py_ocel, ocel_events_path, ocel_objects_path)