In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Preprocessing

In [67]:
import pandas as pd
import html
import re

In [68]:
mails = pd.read_csv('/content/drive/MyDrive/kluza/all_mails.csv', header=0)
keywords = pd.read_csv('/content/drive/MyDrive/kluza/keywords.csv', header=0)
metalabel = pd.read_csv('/content/drive/MyDrive/kluza/metalabel.csv', header=0)

In [69]:
def clear_text(text):
  cleared_text = html.unescape(text)
  cleared_text = cleared_text.split('\n>')[0]
  cleared_text = cleared_text.replace('\n', ' ')
  return cleared_text
mails['content'] = mails['content'].apply(clear_text)

In [70]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [71]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Remove unnecessary characters, symbols, and punctuation
    cleaned_text = re.sub(r'[^\w\s]', '', text)

    cleaned_text = cleaned_text.lower()

    # Remove words starting with 'https'
    cleaned_text = re.sub(r'\bhttps\w+\b', '', cleaned_text)

    # Remove stopwords
    tokens = cleaned_text.split()
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    return ' '.join(lemmatized_tokens)

mails['content'] = mails['content'].apply(preprocess_text)

# Observation

In [60]:
mails['content'][100]

'work camel11108 started luca burgazzoli'

In [61]:
mails['content'][101]

'asf github bot commented camel11108 github user lburgazzoli opened pull request wip camel11108 camelinfinispan change uri syntax infinispanhostname infinispancachename tristantarrant may check missed something infinispan side merge pull request git repository running git pull camel11108 alternatively review apply change patch close pull request make commit mastertrunk branch least following commit message close 1604 commit 14fa5e5156e3e0e0e76cfdd23427a06febc27714 author lburgazzoli lburgazzoligmailcom date 20170406t131154z camel11108 camelinfinispan change uri syntax infinispanhostname infinispancachename'

In [62]:
mails['content_url'][101]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13061654.1491378117000.225309.1491493301861%40Atlassian.JIRA%3E'

# Event log extraction

In [79]:
import spacy
from spacy.matcher import Matcher
import re
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

In [80]:
event_patterns = [
    [{"LOWER": "resolve"}],
    [{"LOWER": "close"}],
    [{"LOWER": "fix"}],
    [{"LOWER": "create"}],
    [{"LOWER": "comment"}],
    [{"LOWER": "pull request"}],
    [{"LOWER": "push request"}],
    [{"LOWER": "merge"}],
    [{"LOWER": "commit"}],
    [{"LOWER": "customize"}],
    [{"LOWER": "start"}],
    [{"LOWER": "apply change"}],
    [{"LOWER": "update"}],
    [{"LOWER": "review"}],
    [{"LOWER": "add feature"}],
    [{"LOWER": "implement"}],
    # Add more patterns as needed
]

def extract_event(doc):
    for pattern in event_patterns:
        matcher.add("Event", [pattern])
    matches = matcher(doc)

    events = []
    for match_id, start, end in matches:
        events.append(doc[start:end].text)

    return events if events else None

def extract_object(doc, event):
    for token in doc:
        if token.lower_ == event:
            for child in token.children:
                if child.dep_ in ['dobj', 'attr'] and child.pos_ == 'NOUN':
                    subtree_tokens = [t.text for t in child.subtree]
                    for entity in doc.ents:
                        if entity.text in subtree_tokens:
                            return entity.text
    return None



def extract_actor(doc):
    actors = []
    for entity in doc.ents:
        if entity.label_ == "PERSON":
            actors.append(entity.text)

    if actors:
        return actors[0]

    return None


In [81]:
ocel_entries = []

for i in range(10):
    doc = nlp(mails['content'][i])
    events = extract_event(doc)

    if events:
        for event in events:
            event_entry = {}
            event_entry['Event'] = event
            event_entry['Object'] = extract_object(doc, event)
            event_entry['Actor'] = extract_actor(doc)
            event_entry['Timestamp'] = mails['date'][i]

            ocel_entries.append(event_entry)

In [82]:
ocel_entries

[{'Event': 'create',
  'Object': None,
  'Actor': 'daniel fullarton',
  'Timestamp': 'Sun, 02 Apr, 23:59'},
 {'Event': 'resolve',
  'Object': None,
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'merge',
  'Object': None,
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'review',
  'Object': None,
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'close',
  'Object': None,
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'commit',
  'Object': '34edd2cd3edc83b10c4c9bae0518fcf8ca144735',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'commit',
  'Object': '34edd2cd3edc83b10c4c9bae0518fcf8ca144735',
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'close',
  'Object': None,
  'Actor': 'asf github',
  'Timestamp': 'Mon, 03 Apr, 00:01'},
 {'Event': 'commit',
  'Object': '34edd2cd3edc83b10c4c9bae0518fcf8ca144735',
  'Actor': 'asf github',
  

# More Observations

In [25]:
mails['content_url'][1]

'http://mail-archives.apache.org/mod_mbox/camel-issues/201704.mbox/ajax/%3CJIRA.13061008.1491177558000.187519.1491177687915%40Atlassian.JIRA%3E'

In [50]:
mails['content'][1]

'asf github bot commented camel11099 github user linead opened pull request camel11099 resolve issue caused unresolve element hanging tâ merge pull request git repository running git pull soapfaulterrorhandling alternatively review apply change patch close pull request make commit mastertrunk branch least following commit message close 1589 commit 34edd2cd3edc83b10c4c9bae0518fcf8ca144735 author daniel fullarton danielfullartonnabcomau date 20170402t233213z camel11099 resolve issue caused unresolve element hanging faul detail'

In [26]:
mails.head()

Unnamed: 0.1,Unnamed: 0,id,author,subject,date,content_url,dtype,content
0,0,1,Daniel Fullarton (JIRA),[jira] [Created] (CAMEL-11099) Unhandled Cla...,"Sun, 02 Apr, 23:59",http://mail-archives.apache.org/mod_mbox/camel...,issues,daniel fullarton created camel11099 summary un...
1,1,2,ASF GitHub Bot (JIRA),[jira] [Commented] (CAMEL-11099) Unhandled C...,"Mon, 03 Apr, 00:01",http://mail-archives.apache.org/mod_mbox/camel...,issues,asf github bot commented camel11099github user...
2,2,3,Zoran Regvart (JIRA),[jira] [Resolved] (CAMEL-11099) Unhandled Cl...,"Tue, 04 Apr, 13:33",http://mail-archives.apache.org/mod_mbox/camel...,issues,regvart resolved camel11099 resolution fixed a...
3,3,4,ASF GitHub Bot (JIRA),[jira] [Commented] (CAMEL-11099) Unhandled C...,"Wed, 05 Apr, 01:38",http://mail-archives.apache.org/mod_mbox/camel...,issues,asf github bot commented camel11099github user...
4,4,5,,[jira] [Commented] (CAMEL-11092) If setting Ex...,,http://mail-archives.apache.org/mod_mbox/camel...,issues,asf github bot commented camel11099github user...
