## Tabula

Installation
https://pypi.org/project/tabula-py/

pip install tabula-py

tabula-py docs
https://www.pydoc.io/pypi/tabula-py-0.9.0/autoapi/wrapper/index.html

**Read the Documentation! Useful parameters like multiple_tables, lattice, pandas_options are available**

### Emotet

In [29]:
from tabula import read_pdf

def extract_tables(pdf_filepath):
    # Specify dict as pandas option so that first row will not be treated as labels
    df = read_pdf(pdf_filepath, pages='all', multiple_tables=False, stream=True)
    return df[0]

In [30]:
df = extract_tables('Mirai1.pdf')

In [50]:
# Group by IDs?
df.sample(30)

Unnamed: 0,ID,Name,Identified Sentence
212,,,!
169,,,
142,,,
229,,,
164,,,Existing Mirai Botnet
263,,,saw a while ago on
96,,,"On the other hand, from the perspective of the..."
21,,,after on
344,,Scanning,11% scan both port
248,,,The New Variant Shares Some of the Infrastruct...


In [34]:
df.head(20)

Unnamed: 0,ID,Name,Identified Sentence
0,T1046,Network Service,Much of the new mirai variant that scans port ...
1,,Scanning,covered by various
2,,,sources.
3,T1065,Uncommonly Used,The following table lists first seen time of “...
4,,Port,“new” ones
5,,,that hit our honeypot.
6,,,
7,,,You can see the variant on port 7547 first sho...
8,,,"2016-11-26 21:27:23,"
9,,,and first observed for the variant on port 555...


In [38]:
tids = list(df['ID'])

In [41]:
type(tids)

list

In [36]:
def extract_tids(df):
    tids = list(df['ID'])    # Convert series to list to allow easier indexing
    
    indexes = []
    valid_tids = []
    for id in tids:
        # Checking i == np.nan doesn't work
        if isinstance(id, str):
            index = tids.index(id)
            indexes.append(index)
            valid_tids.append(id)
    
    # Remember to add first and last indexes (0 and 29)
    # So first batch from 0 to 7 exclusive and last batch will be from 13 to 29 exclusive
    if indexes[0] != 0:
        indexes.insert(0, 0)
    indexes.append(len(tids))    
    return valid_tids, indexes

In [42]:
valid_tids, indexes = extract_tids(df)
valid_tids, indexes

(['T1046',
  'T1065',
  'T1043',
  'T1065',
  'T1046',
  'T1043',
  'T1048',
  'T1065',
  'T1046',
  'T1008',
  'T1043',
  'T1008',
  'T1065',
  'T1046',
  'T1008',
  'T1043',
  'T1065',
  'T1046',
  'T1043',
  'T1046',
  'T1046'],
 [0,
  3,
  14,
  3,
  0,
  14,
  43,
  3,
  0,
  171,
  14,
  171,
  3,
  0,
  171,
  14,
  3,
  0,
  14,
  0,
  0,
  364])

In [43]:
def extract_names(df):
    names = list(df['Name'])
    valid_names = []
    condition = False
    for i in range(len(names)):
        if condition:
            i+=1
        if i == len(names):
            break
        # Checking i == np.nan doesn't work
        if isinstance(names[i], str):
            n = names[i]
            if isinstance(names[i+1], str):
                n = names[i] + ' ' + names[i+1]
                condition = True
            valid_names.append(n)
    return valid_names

In [44]:
valid_names = extract_names(df)
valid_names

['Network Service Scanning',
 'Uncommonly Used Port',
 'Port',
 'Commonly Used Port',
 'Uncommonly Used Port',
 'Port',
 'Network Service Scanning',
 'Scanning',
 'Commonly Used Port',
 'Exfiltration Over Alternative Protocol',
 'Alternative Protocol',
 'Uncommonly Used Port',
 'Port',
 'Network Service Scanning',
 'Scanning',
 'Fallback Channels',
 'Commonly Used Port',
 'Fallback Channels',
 'Uncommonly Used Port',
 'Port',
 'Network Service Scanning',
 'Scanning',
 'Fallback Channels',
 'Commonly Used Port',
 'Uncommonly Used Port',
 'Port',
 'Network Service Scanning',
 'Scanning',
 'Commonly Used Port',
 'Network Service Scanning',
 'Scanning Network Service',
 'Network Service Scanning',
 'Scanning']

In [47]:
sentences = df['Identified Sentence']
sentences.values

array(['Much of the new mirai variant that scans port 7547 has been',
       'covered by various', 'sources.',
       'The following table lists first seen time of “old” mirai and the',
       '“new” ones', 'that hit our honeypot.', nan,
       'You can see the variant on port 7547 first shown up on',
       '2016-11-26 21:27:23,',
       'and first observed for the variant on port 5555 was one day',
       'after on', '2016-11-27 17:04:02(all GMT +8).', nan, '!',
       'The following table lists first seen time of “old” mirai and the',
       '“new” ones', 'that hit our honeypot.', nan,
       'You can see the variant on port 7547 first shown up on',
       '2016-11-26 21:27:23,',
       'and first observed for the variant on port 5555 was one day',
       'after on', '2016-11-27 17:04:02(all GMT +8).', nan, '!',
       '[](/content/images/2016/11/03-bot-current-growth-rate-on-all-',
       'port.jpg)', nan,
       'Currently, the growth rate of the bot on port 7547 has far',
       

In [None]:
def extract_sentences(df, indexes):
    sentences = df['Identified Sentence']
    
    all_sents = []

    for i in range(0, len(indexes)): 
        # If reached last available index, break to avoid IndexError
        if i == len(indexes) - 1:
            print("NO MORE BATCHES, BREAKING")
            break

        # Define start and end indexes
        start = indexes[i]
        end = indexes[i+1]        
        print(f"STARTING ON BATCH {i+1} in range ({start}, {end})")
        
        # Deal with each subsents 1 at a time
        subsents = sentences[start:end]
#         print(subsents, '\n\n')

        batch_sents = []
        full_sent = ''

        # Range: (0,7), (7,10), (10,13), (13,29)
        for i in range(start, end):
            if isinstance(subsents[i], str):
                full_sent += ' ' + subsents[i]

            if i+1==end or isinstance(subsents[i+1], float):
                batch_sents.append(full_sent)
                full_sent = ''
            else: 
                # Do nothing if current sentence is nan
                pass

        all_sents.append(batch_sents)
        
    return all_sents

all_sents = extract_sentences(df[0], indexes)

### Implementation without lattice=True

##### Splitting

##### Handle Threat IDs

##### Handle Names

##### Handle Sentences

In [21]:
def extract_sentences(df, indexes):
    sentences = df['Identified Sentence']
    
    all_sents = []

    for i in range(0, len(indexes)): 
        # If reached last available index, break to avoid IndexError
        if i == len(indexes) - 1:
            print("NO MORE BATCHES, BREAKING")
            break

        # Define start and end indexes
        start = indexes[i]
        end = indexes[i+1]        
        print(f"STARTING ON BATCH {i+1} in range ({start}, {end})")
        
        # Deal with each subsents 1 at a time
        subsents = sentences[start:end]
#         print(subsents, '\n\n')

        batch_sents = []
        full_sent = ''

        # Range: (0,7), (7,10), (10,13), (13,29)
        for i in range(start, end):
            if isinstance(subsents[i], str):
                full_sent += ' ' + subsents[i]

            if i+1==end or isinstance(subsents[i+1], float):
                batch_sents.append(full_sent)
                full_sent = ''
            else: 
                # Do nothing if current sentence is nan
                pass

        all_sents.append(batch_sents)
        
    return all_sents

all_sents = extract_sentences(df[0], indexes)

In [23]:
count = 1
for batch in all_sents:
    print(f"BATCH {count}")
    for s in batch:
        print(s)
    print()
    count += 1

BATCH 1
 Later versions of the software saw the addition of spamming and malware delivery services—including other banking Trojans.
 Emotet uses functionality that helps the software evade detection by some anti-malware products.

BATCH 2
 The infection may arrive either via malicious script, macro- enabled document files, or malicious link.

BATCH 3
 The infection may arrive either via malicious script, macro- enabled document files, or malicious link.

BATCH 4
 This allows the attackers to install updated versions of the software, install additional malware such as other banking Trojans, or to act as a dumping ground for stolen information such as financial credentials, usernames and passwords, and email addresses.
 Latest Emotet news
 Emotet is back: botnet springs back to life with new spam campaign
 Emotet on the rise with heavy spam campaign
 Malware analysis: decoding Emotet, part 2



In [24]:
print(valid_tids, '\n')
print(valid_names, '\n')
print(all_sents)

['T1063', 'T1064', 'T1204', 'T1003'] 

['Security Software Discovery', 'Scripting', 'User Execution', 'Credential Dumping'] 

[[' Later versions of the software saw the addition of spamming and malware delivery services—including other banking Trojans.', ' Emotet uses functionality that helps the software evade detection by some anti-malware products.'], [' The infection may arrive either via malicious script, macro- enabled document files, or malicious link.'], [' The infection may arrive either via malicious script, macro- enabled document files, or malicious link.'], [' This allows the attackers to install updated versions of the software, install additional malware such as other banking Trojans, or to act as a dumping ground for stolen information such as financial credentials, usernames and passwords, and email addresses.', ' Latest Emotet news', ' Emotet is back: botnet springs back to life with new spam campaign', ' Emotet on the rise with heavy spam campaign', ' Malware analysi

for every batch, insert each sentence with name_i and id_i

i = 0: all_sents[0], name[0], id[0]
i += 1

.

.

.

#### For last page's table

In [12]:
df[1]

Unnamed: 0,ID,Name,Identified Sentence
0,,,"Malware analysis: decoding Emotet, part 1"
1,,,
2,,,How does Emotet spread?
3,,,
4,,,The primary distribution method for Emotet is ...
5,,,malspam.
6,T1192,Spearphishing Link,Since these emails are coming from your hijack...
7,,,"account, the emails"
8,,,"look less like spam and the recipients, feelin..."
9,,,inclined to


In [13]:
valid_tids1, indexes1 = extract_tids(df[1])
valid_tids1, indexes1

(['T1192'], [0, 6, 17])

In [14]:
valid_names1 = extract_names(df[1])
valid_names1

['Spearphishing Link']

In [15]:
all_sents1 = extract_sentences(df[1], indexes1)

STARTING ON BATCH 1 in range (0, 6)
STARTING ON BATCH 2 in range (6, 17)
NO MORE BATCHES, BREAKING


In [16]:
count = 1
for batch in all_sents1:
    print(f"BATCH {count}")
    for s in batch:
        print(s)
    print()
    count += 1

BATCH 1
 Malware analysis: decoding Emotet, part 1
 How does Emotet spread?
 The primary distribution method for Emotet is through malspam.

BATCH 2
 Since these emails are coming from your hijacked email account, the emails look less like spam and the recipients, feeling safe, are more inclined to click bad URLs and download infected files.
 If a connected network is present, Emotet spreads using a list of common passwords, guessing its way onto other connected systems in a brute-force attack.



In [17]:
print(valid_tids1, '\n')
print(valid_names1, '\n')
print(all_sents1)

['T1192'] 

['Spearphishing Link'] 

[[' Malware analysis: decoding Emotet, part 1', ' How does Emotet spread?', ' The primary distribution method for Emotet is through malspam.'], [' Since these emails are coming from your hijacked email account, the emails look less like spam and the recipients, feeling safe, are more inclined to click bad URLs and download infected files.', ' If a connected network is present, Emotet spreads using a list of common passwords, guessing its way onto other connected systems in a brute-force attack.']]


### Final pipeline

In [24]:
df = extract_tables('Emotet.pdf')
for d in df:
    # Ensure each table has standardised column names
    d.columns = ['ID', 'Name', 'Identified Sentence']
    valid_tids, indexes = extract_tids(d)
    valid_names = extract_names(d)
    all_sents = extract_sentences(d, indexes)

    print('\nVALID T_IDS:', valid_tids)
    print('VALID NAMES:', valid_names)
    print('SENTENCES:')
    count = 1
    for batch in all_sents:
        print(f"Batch {count}")
        for s in batch:
            print(s)
        print()
        count += 1
    print('-------------------------------------------------------------------\n')

STARTING ON BATCH 1 in range (0, 7)
STARTING ON BATCH 2 in range (7, 10)
STARTING ON BATCH 3 in range (10, 13)
STARTING ON BATCH 4 in range (13, 29)
NO MORE BATCHES, BREAKING

VALID T_IDS: ['T1063', 'T1064', 'T1204', 'T1003']
VALID NAMES: ['Security Software Discovery', 'Scripting', 'User Execution', 'Credential Dumping']
SENTENCES:
Batch 1
 Later versions of the software saw the addition of spamming and malware delivery services—including other banking Trojans.
 Emotet uses functionality that helps the software evade detection by some anti-malware products.

Batch 2
 The infection may arrive either via malicious script, macro- enabled document files, or malicious link.

Batch 3
 The infection may arrive either via malicious script, macro- enabled document files, or malicious link.

Batch 4
 This allows the attackers to install updated versions of the software, install additional malware such as other banking Trojans, or to act as a dumping ground for stolen information such as financi

In [132]:
df = extract_tables('Mirai1.pdf')
for d in df:
    # Ensure each table has standardised column names
    d.columns = ['ID', 'Name', 'Identified Sentence']
    
    # If 1st row consists of labels
    if d['ID'].iloc[0] == 'ID':
        # Remove that row permanently with inplace=True
        d.drop([0], inplace=True)
        
    valid_tids, indexes = extract_tids(d)
    valid_names = extract_names(d)
    all_sents = extract_sentences(d, indexes)

    print('\nVALID T_IDS:', valid_tids)
    print('VALID NAMES:', valid_names)
    print('SENTENCES:')
    count = 1
    for batch in all_sents:
        print(f"Batch {count}")
        for s in batch:
            print(s)
        print()
        count += 1
    print('-------------------------------------------------------------------\n')

STARTING ON BATCH 1 in range (0, 3)


KeyError: 0

In [133]:
df = extract_tables('Mirai1.pdf')
for d in df:
     # Ensure each table has standardised column names
    d.columns = ['ID', 'Name', 'Identified Sentence']
    
    # If 1st row consists of labels
    if d['ID'].iloc[0] == 'ID':
        # Remove that row permanently with inplace=True
        d.drop([0], inplace=True)
        
    print(d)

       ID                Name  \
1   T1046     Network Service   
2     NaN            Scanning   
3     NaN                 NaN   
4   T1065     Uncommonly Used   
5     NaN                Port   
6     NaN                 NaN   
7     NaN                 NaN   
8     NaN                 NaN   
9     NaN                 NaN   
10    NaN                 NaN   
11    NaN                 NaN   
12    NaN                 NaN   
13    NaN                 NaN   
14    NaN                 NaN   
15  T1043  Commonly Used Port   
16    NaN                 NaN   
17    NaN                 NaN   
18    NaN                 NaN   
19    NaN                 NaN   
20    NaN                 NaN   
21    NaN                 NaN   
22    NaN                 NaN   
23    NaN                 NaN   
24    NaN                 NaN   
25    NaN                 NaN   
26  T1065     Uncommonly Used   
27    NaN                Port   
28    NaN                 NaN   
29    NaN                 NaN   
30    NaN 

In [76]:
df[6]

Unnamed: 0,ID,Name,Identified Sentence
0,,,captured in our
1,,,honeypot that have scanned port 23/2323/5555/7...
2,,,
3,,,We can see that:
4,,,
5,,,96.4% of the Mirai Bots scan port 23 or port 2...
6,T1046,Network Service,[](/content/images/2016/11/07-two-c2-server-an...
7,,Scanning,server-in-one-
8,,,marai-sample.jpg)
9,,,


In [72]:
df[5]

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,geographical
0,,,distribution of the existing mirai botnet.
1,,,
2,,,
3,,,
4,,,We provide various statistics and data downloa...
5,,,infected devices at
6,,,http://data.netlab.360.com/mirai-scanner for r...
7,,,
8,,,For those who have been using API to access ou...
9,,,please re-download


In [66]:
# ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
if (df[1].columns != ['ID', 'Name', 'Identified Sentence']).any():
    df[1].columns = ['ID', 'Name', 'Identified Sentence']
df[1]

Unnamed: 0,ID,Name,Identified Sentence
0,,Scanning,port.jpg)
1,,,
2,,,"Currently, the growth rate of the bot on port ..."
3,,,exceeded the number
4,,,of bots on port 23/2323.
5,T1043,Commonly Used Port,[](/content/images/2016/11/03-bot-current-grow...
6,,,port.jpg)
7,,,
8,,,"Currently, the growth rate of the bot on port ..."
9,,,exceeded the number


In [11]:
valid_tids, indexes = extract_tids(df[0])


In [13]:
valid_names = extract_names(df[0])
valid_names

['Network Service Scanning',
 'Uncommonly Used Port',
 'Port',
 'Commonly Used Port',
 'Uncommonly Used Port',
 'Port']

In [30]:
df[7].columns = ['a', 'b', 'c']

In [31]:
df[7]

Unnamed: 0,a,b,c
0,,,
1,,,The following diagram shows the overlap of all...
2,,,captured in our
3,,,honeypot that have scanned port 23/2323/5555/7...
4,,,
5,,,We can see that:
6,,,
7,,,96.4% of the Mirai Bots scan port 23 or port 2...
8,T1065,Uncommonly Used,"Among them, 79% only scan port 23 and 6.4% onl..."
9,,Port,11% scan both port
