 # 1 WordSegment Installation

In [3]:
# Install into the running kernel's environment (%pip rather than !pip) and pin
# the version so that a fresh run resolves the same package.
%pip install wordsegment==1.3.1

Collecting wordsegment
  Using cached wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
Installing collected packages: wordsegment
Successfully installed wordsegment-1.3.1


# 2 Co-occurrence Matrix

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict

def co_occurrence(sentences, window_size):
    """Build a symmetric word co-occurrence matrix.

    Each sentence is lower-cased and split on whitespace. For every token,
    the up-to-``window_size`` tokens that follow it in the same sentence are
    counted as co-occurring with it. Pairs are unordered, so the resulting
    matrix is symmetric.

    Parameters
    ----------
    sentences : iterable of str
        The texts to scan; each item is tokenised independently.
    window_size : int
        Number of following tokens paired with each token. Must be >= 0.

    Returns
    -------
    pandas.DataFrame
        Square frame of int64 counts whose index and columns are the sorted
        vocabulary. Empty (0 x 0) when no tokens are found.

    Raises
    ------
    ValueError
        If ``window_size`` is negative. A negative value would otherwise turn
        the look-ahead slice into one with a negative end index and silently
        count unrelated tokens.
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    pair_counts = defaultdict(int)
    vocab = set()
    for sentence in sentences:
        # preprocessing (use tokenizer instead)
        tokens = sentence.lower().split()
        vocab.update(tokens)
        for i, token in enumerate(tokens):
            for neighbour in tokens[i + 1 : i + 1 + window_size]:
                # Sort the pair so (a, b) and (b, a) share one counter.
                pair_counts[tuple(sorted((neighbour, token)))] += 1

    vocab = sorted(vocab)
    position = {word: idx for idx, word in enumerate(vocab)}

    # int64 rather than int16: counts above 32767 do not fit in int16.
    # Filling a NumPy array first avoids one DataFrame write per cell.
    counts = np.zeros((len(vocab), len(vocab)), dtype=np.int64)
    for (first, second), count in pair_counts.items():
        row, col = position[first], position[second]
        counts[row, col] = count
        counts[col, row] = count
    return pd.DataFrame(counts, index=vocab, columns=vocab)

# 3 Read Textfile

In [19]:
from wordsegment import load, segment

def read_testfile(path="./dga_testing.txt"):
    """Read a file of domain names and split each one into words.

    For every line the surrounding whitespace is stripped, only the text
    before the first ``.`` is kept (dropping the TLD and anything after the
    first label), and hyphens are removed. The remaining label is passed to
    ``wordsegment.segment`` and the resulting words are joined with single
    spaces.

    Parameters
    ----------
    path : str, optional
        File to read, one domain per line. Defaults to the original
        hard-coded ``"./dga_testing.txt"``.

    Returns
    -------
    list of str
        One space-separated string per input line, in file order.
    """
    with open(path) as f:
        # you may also want to remove whitespace characters like `\n` at the end of each line
        # also remove TLDs and annoying hyphens in AGDs
        labels = [line.strip().split(".")[0].replace('-', '') for line in f]

    # wordsegment needs its corpus data loaded before segment() is called.
    load()

    return [' '.join(segment(label)) for label in labels]

# 4 Testing

In [27]:
# Segment every domain in the test file into a space-separated string of words.
text = read_testfile()
# Count co-occurrences, pairing each word with the (up to) 2 words that follow it.
df = co_occurrence(text, 2)
# Display the resulting matrix.
df

Unnamed: 0,a,afraid,airplane,alan,albertson,alexandrina,already,although,always,anger,...,winter,with,within,woman,women,wonder,would,wright,wrote,your
a,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
afraid,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
airplane,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
alan,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
albertson,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
wonder,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
would,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wright,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
wrote,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 5 Extra steps

In [32]:
# Keep only the words that co-occur with some word more than MIN_COUNT times.
# The mask is deliberately not called `filter`, so the builtin `filter` stays
# usable in the rest of the notebook.
MIN_COUNT = 5
frequent = (df > MIN_COUNT).any()
# `df` has identical row and column labels, so one mask selects both axes.
sub_df = df.loc[frequent, frequent]
sub_df

Unnamed: 0,a,bella,bot,calan,christian,dulci,e,huddles,in,lrs,maria,on,re,sever,the,tn,to,wright
a,0,0,1,1,7,0,0,1,0,0,0,1,0,6,1,0,0,1
bella,0,0,2,2,0,7,0,2,0,0,8,2,0,0,2,0,0,2
bot,1,2,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,6
calan,1,2,0,0,1,1,0,0,0,0,1,0,0,0,13,0,0,0
christian,7,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
dulci,0,7,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
e,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0
huddles,1,2,0,0,1,1,0,0,0,0,1,6,0,0,0,0,0,0
in,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0
lrs,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12,0,0
