In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing

In [7]:
import pandas as pd
import html

In [8]:
mails = pd.read_csv('/content/drive/MyDrive/kluza/all_mails.csv', header=0)
keywords = pd.read_csv('/content/drive/MyDrive/kluza/keywords.csv', header=0)
metalabel = pd.read_csv('/content/drive/MyDrive/kluza/metalabel.csv', header=0)

In [9]:
def clear_text(text):
  cleared_text = html.unescape(text)
  cleared_text = cleared_text.split('\n>')[0]
  cleared_text = cleared_text.replace('\n', '')
  return cleared_text
mails['content'] = mails['content'].apply(clear_text)

# First Model

In [12]:
import spacy
from spacy.matcher import Matcher
import re
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

In [34]:
# Define your own patterns for event extraction
event_patterns = [
    [{"LOWER": "resolved"}],
    [{"LOWER": "closed"}],
    [{"LOWER": "fixed"}],
    [{"LOWER": "created"}],
    # Add more patterns as needed
]

def extract_event(doc):
    # Apply pattern matching
    for pattern in event_patterns:
        matcher.add("Event", [pattern])  # Wrap pattern in an additional list
    matches = matcher(doc)

    # Extract event if a match is found
    if matches:
        match_id, start, end = matches[0]
        return doc[start:end].text

    return None

object_pattern = r"CAMEL-\d{5}"

def extract_object(doc):
    # Extract the object from the email content using regular expressions
    matches = re.findall(object_pattern, doc.text)
    if matches:
        return matches[0]
    return None

def extract_actor(doc):
    # Extract the actor using named entity recognition
    actors = []
    for entity in doc.ents:
        if entity.label_ == "PERSON":
            actors.append(entity.text)

    # If multiple actors are extracted, choose the most relevant one based on your criteria
    if actors:
        return actors[0]

    return None


In [35]:
ocel_entries = []

# Iterate over the emails
for i in range(10):
    doc = nlp(mails['content'][i])
    event_entry = {}

    # Extract the event and object information using keywords/pattern matching
    event_entry['Event'] = extract_event(doc)
    event_entry['Object'] = extract_object(doc)

    # Extract the actor using named entity recognition
    event_entry['Actor'] = extract_actor(doc)

    # Extract the timestamp from the email date
    event_entry['Timestamp'] = mails['date'][i]

    # Add the OCEL entry to the list
    ocel_entries.append(event_entry)

In [36]:
ocel_entries

[{'Event': None,
  'Object': 'CAMEL-11099',
  'Actor': 'Daniel Fullarton',
  'Timestamp': 'Sun, 02 Apr, 23:59'},
 {'Event': None,
  'Object': 'CAMEL-11099',
  'Actor': 'Daniel Fullarton',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'resolved',
  'Object': 'CAMEL-11099',
  'Actor': 'https://issues.apache.org/jira/browse/CAMEL-11099?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel]Zoran Regvart',
  'Timestamp': 'Tue, 04 Apr, 13:33'},
 {'Event': 'closed',
  'Object': 'CAMEL-11099',
  'Actor': None,
  'Timestamp': 'Wed, 05 Apr, 01:38'},
 {'Event': 'closed', 'Object': 'CAMEL-11099', 'Actor': None, 'Timestamp': nan},
 {'Event': 'closed',
  'Object': 'CAMEL-11092',
  'Actor': None,
  'Timestamp': 'Mon, 03 Apr, 07:44'},
 {'Event': 'fixed',
  'Object': 'CAMEL-11092',
  'Actor': 'Claus Ibsen',
  'Timestamp': 'Wed, 12 Apr, 08:32'},
 {'Event': 'resolved',
  'Object': 'CAMEL-11092',
  'Actor': 'Regvart',
  'Timestamp': 'Wed, 12 Apr, 08:46'},
 {'Event': 'resolved',
  'Object': 

In [37]:
mails['content_url'][0]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13061008.1491177558000.187517.1491177581519%40Atlassian.JIRA%3E'

# Second Model

In [44]:
def extract_event(doc):
    # Apply pattern matching
    for pattern in event_patterns:
        matcher.add("Event", [pattern])  # Wrap pattern in an additional list
    matches = matcher(doc)

    # Extract event if a match is found
    if matches:
        match_id, start, end = matches[0]
        return doc[start:end].text

    return "Unknown"

object_pattern = r"CAMEL-\d{5}"

def extract_object(doc):
    # Extract the object from the email content using regular expressions
    matches = re.findall(object_pattern, doc.text)
    if matches:
        return matches[0]
    return None

def extract_actor(doc):
    # Extract the actor using named entity recognition
    actors = []
    for entity in doc.ents:
        if entity.label_ == "PERSON":
            actors.append(entity.text)

    # If multiple actors are extracted, choose the most relevant one based on your criteria
    if actors:
        actor = actors[0]
        # Split actor string if it contains additional text
        actor = re.split(r"\s|[:\[\]]", actor)[0].strip()
        return actor

    return None





In [45]:
ocel_entries = []

# Iterate over the emails
for i in range(10):
    doc = nlp(mails['content'][i])
    event_entry = {}

    # Extract the event and object information using keywords/pattern matching
    event_entry['Event'] = extract_event(doc)
    event_entry['Object'] = extract_object(doc)

    # Extract the actor using named entity recognition
    event_entry['Actor'] = extract_actor(doc)

    # Extract the timestamp from the email date
    event_entry['Timestamp'] = mails['date'][i]

    # Add the OCEL entry to the list
    ocel_entries.append(event_entry)

In [46]:
# Validate and ensure at least one event entry in the OCEL
if all(entry['Event'] == "Unknown" for entry in ocel_entries):
    ocel_entries[0]['Event'] = "Created"

# Print the OCEL entries
for entry in ocel_entries:
    print(entry)

{'Event': 'Unknown', 'Object': 'CAMEL-11099', 'Actor': 'Daniel', 'Timestamp': 'Sun, 02 Apr, 23:59'}
{'Event': 'Unknown', 'Object': 'CAMEL-11099', 'Actor': 'Daniel', 'Timestamp': 'Mon, 03 Apr, 00:01'}
{'Event': 'resolved', 'Object': 'CAMEL-11099', 'Actor': 'https', 'Timestamp': 'Tue, 04 Apr, 13:33'}
{'Event': 'closed', 'Object': 'CAMEL-11099', 'Actor': None, 'Timestamp': 'Wed, 05 Apr, 01:38'}
{'Event': 'closed', 'Object': 'CAMEL-11099', 'Actor': None, 'Timestamp': nan}
{'Event': 'closed', 'Object': 'CAMEL-11092', 'Actor': None, 'Timestamp': 'Mon, 03 Apr, 07:44'}
{'Event': 'fixed', 'Object': 'CAMEL-11092', 'Actor': 'Claus', 'Timestamp': 'Wed, 12 Apr, 08:32'}
{'Event': 'resolved', 'Object': 'CAMEL-11092', 'Actor': 'Regvart', 'Timestamp': 'Wed, 12 Apr, 08:46'}
{'Event': 'resolved', 'Object': 'CAMEL-11092', 'Actor': 'Regvart', 'Timestamp': nan}
{'Event': 'Unknown', 'Object': 'CAMEL-11019', 'Actor': None, 'Timestamp': 'Mon, 03 Apr, 07:57'}


In [42]:
mails['content_url'][7]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13060026.1490791958000.263623.1491986802052%40Atlassian.JIRA%3E'

# Third Model ********

In [None]:
import spacy
from spacy.matcher import Matcher
import re
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

In [48]:
# Define your own patterns for event extraction
event_patterns = [
    [{"LOWER": "resolved"}],
    [{"LOWER": "closed"}],
    [{"LOWER": "fixed"}],
    [{"LOWER": "created"}],
    # Add more patterns as needed
]

def extract_event(doc):
    # Apply pattern matching
    for pattern in event_patterns:
        matcher.add("Event", [pattern])  # Wrap pattern in an additional list
    matches = matcher(doc)

    # Extract events if matches are found
    events = []
    for match_id, start, end in matches:
        events.append(doc[start:end].text)

    return events if events else None


object_pattern = r"CAMEL-\d{5}"

def extract_object(doc):
    # Extract the object from the email content using regular expressions
    matches = re.findall(object_pattern, doc.text)
    if matches:
        return matches[0]
    return None

def extract_actor(doc):
    # Extract the actor using named entity recognition
    actors = []
    for entity in doc.ents:
        if entity.label_ == "PERSON":
            actors.append(entity.text)

    # If multiple actors are extracted, choose the most relevant one based on your criteria
    if actors:
        return actors[0]

    return None


In [49]:
ocel_entries = []

# Iterate over the emails
for i in range(10):
    doc = nlp(mails['content'][i])
    events = extract_event(doc)

    if events:
        for event in events:
            event_entry = {}
            event_entry['Event'] = event
            event_entry['Object'] = extract_object(doc)
            event_entry['Actor'] = extract_actor(doc)
            event_entry['Timestamp'] = mails['date'][i]

            ocel_entries.append(event_entry)

In [50]:
ocel_entries

[{'Event': 'created',
  'Object': 'CAMEL-11099',
  'Actor': 'Daniel Fullarton',
  'Timestamp': 'Sun, 02 Apr, 23:59'},
 {'Event': 'resolved',
  'Object': 'CAMEL-11099',
  'Actor': 'https://issues.apache.org/jira/browse/CAMEL-11099?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel]Zoran Regvart',
  'Timestamp': 'Tue, 04 Apr, 13:33'},
 {'Event': 'Fixed',
  'Object': 'CAMEL-11099',
  'Actor': 'https://issues.apache.org/jira/browse/CAMEL-11099?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel]Zoran Regvart',
  'Timestamp': 'Tue, 04 Apr, 13:33'},
 {'Event': 'closed',
  'Object': 'CAMEL-11099',
  'Actor': None,
  'Timestamp': 'Wed, 05 Apr, 01:38'},
 {'Event': 'closed', 'Object': 'CAMEL-11099', 'Actor': None, 'Timestamp': nan},
 {'Event': 'closed',
  'Object': 'CAMEL-11092',
  'Actor': None,
  'Timestamp': 'Mon, 03 Apr, 07:44'},
 {'Event': 'fixed',
  'Object': 'CAMEL-11092',
  'Actor': 'Claus Ibsen',
  'Timestamp': 'Wed, 12 Apr, 08:32'},
 {'Event': 'resolved',
 

In [None]:
mails['content_url'][0]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13061008.1491177558000.187517.1491177581519%40Atlassian.JIRA%3E'