In [327]:
# Import relevant packages
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [329]:
# Read the data set: every non-blank line is a "fn|txt|chunk" record.

dataset = []  # txt field of every record
fns = []      # fn field of every record (same order as dataset)
chunks = []   # chunk field of every record (same order as dataset)
# Use a context manager so the file handle is closed even if parsing fails.
with open('dataset-example.csv', encoding='utf-8') as infile:
    for line in infile:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at EOF) holds no record;
            # without this guard the three-way unpack below would raise.
            continue
        # Exactly three '|'-separated fields are expected; anything else
        # still raises ValueError so malformed records are not hidden.
        fn, txt, chunk = line.split('|')
        fns.append(fn)
        dataset.append(txt)
        chunks.append(chunk)

In [330]:
# Perform tf-idf vectorization of the individual documents

# Raw term counts per document.
# NOTE(review): X_train_counts is not read by any cell visible here — confirm it is still needed.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset)
# norm=None keeps the raw tf-idf weights (no per-row normalisation).
tf = TfidfVectorizer(norm=None)
tfidf_matrix = tf.fit_transform(dataset)

In [331]:
# Feature score for a document
def tfidfscores(doc):
    """Return the non-zero tf-idf features of one document, best first.

    Reads the module-level ``tf`` vectorizer and ``tfidf_matrix``.

    NOTE: a later cell in this notebook redefines ``tfidfscores`` with the
    signature ``(doc, tf, tfidf_matrix)``; once that cell has run, this
    one-argument form is shadowed.

    Parameters
    ----------
    doc : int
        Row index into ``tfidf_matrix``.

    Returns
    -------
    list of (feature_name, score) tuples sorted by descending score.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older installations.
    get_names = getattr(tf, "get_feature_names_out", None) or tf.get_feature_names
    feature_names = np.array(get_names())
    # Column indices of the features that occur in this document.
    feature_index = tfidf_matrix[doc,:].nonzero()[1]
    tfidf_scores = list(zip(feature_names[feature_index], [tfidf_matrix[doc, x] for x in feature_index]))
    return sorted(tfidf_scores, reverse=True, key=lambda x:x[1])

In [374]:
# sum of feature scores for multiple documents
def tfidfsum(docs, tf, tfidf_matrix, filenames=None):
    """Average the tf-idf rows of ``docs`` and pick the most central document.

    Parameters
    ----------
    docs : sequence of int
        Row indices into ``tfidf_matrix``; must not be empty.
    tf : fitted vectorizer
        Provides the feature names (``get_feature_names_out`` or, on older
        scikit-learn versions, ``get_feature_names``).
    tfidf_matrix : matrix
        Row-indexable score matrix whose columns line up with the feature names.
    filenames : sequence, optional
        Looked up with the index of the most central document. Defaults to
        the module-level ``fns`` list.

    Returns
    -------
    (scores, filename)
        ``scores`` is a list of (feature_name, mean_score) tuples for every
        feature with a non-zero mean, sorted by descending score.
        ``filename`` is the entry of ``filenames`` for the document whose row
        has the smallest Euclidean distance to the mean row.

    Raises
    ------
    ValueError
        If ``docs`` is empty (previously this silently returned ``fns[-1]``).
    """
    docs = list(docs)
    if not docs:
        raise ValueError("tfidfsum() needs at least one document index")
    if filenames is None:
        filenames = fns

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older installations.
    get_names = getattr(tf, "get_feature_names_out", None) or tf.get_feature_names
    feature_names = np.asarray(get_names())

    # Mean row as a flat dense vector. Summing the selected rows in one call
    # avoids in-place "dense += sparse" arithmetic, whose result type depends
    # on the NumPy/SciPy versions in use.
    row_total = np.asarray(tfidf_matrix[docs, :].sum(axis=0), dtype=float).ravel()
    feature_sum = row_total / len(docs)

    # Find the document closest to the mean; start from +inf so the first
    # candidate always wins and no sentinel index can leak out.
    mindist, mindoc = float("inf"), docs[0]
    for doc in docs:
        row = tfidf_matrix[doc, :]
        row = row.toarray() if hasattr(row, "toarray") else np.asarray(row)
        dist = np.linalg.norm(np.asarray(row, dtype=float).ravel() - feature_sum)
        if dist < mindist:
            mindist = dist
            mindoc = doc

    feature_index = np.flatnonzero(feature_sum)
    tfidf_scores = list(zip(feature_names[feature_index], [float(feature_sum[x]) for x in feature_index]))
    return sorted(tfidf_scores, reverse=True, key=lambda x:x[1]), filenames[mindoc]

In [375]:
def chunk_tfidf_sum(chunk, tf, tfidf_matrix):
    """Run ``tfidfsum`` over every document whose entry in ``chunks`` equals ``chunk``.

    The positions in the module-level ``chunks`` list are used as the
    document (row) indices handed to ``tfidfsum``.
    """
    member_rows = [row for row, label in enumerate(chunks) if label == chunk]
    return tfidfsum(member_rows, tf, tfidf_matrix)

In [376]:
# Averaged tf-idf scores and closest-to-mean filename for the documents labelled '1546453200'.
chunk_tfidf_sum('1546453200',tf,tfidf_matrix)

([('1025147', 15.338628807492807),
  ('file', 13.001481650332128),
  ('filename', 12.482726004388498),
  ('timestamps', 12.482726004388498),
  ('ntfs', 12.481775998446496),
  ('data', 10.999999999999996),
  ('flash', 10.887314066739807),
  ('raw', 9.0),
  ('flashutil_activex', 7.4149398803498645),
  ('exe', 7.228721000959564),
  ('dll', 6.2006491911796795),
  ('executable', 6.03850337032318),
  ('ocx', 5.913876909203672),
  ('vch', 5.913876909203672),
  ('unsigned', 4.538033291244043),
  ('windows_31bf3856ad364e35_10', 4.53400207748759),
  ('dotnet', 4.4506128754977485),
  ('net', 4.448430063775185),
  ('assembly', 4.418480042117481),
  ('microsoft', 4.087833413662127),
  ('activex', 3.942584606135781),
  ('macromed', 3.7074699401749323),
  ('hash', 3.5999999999999996),
  ('for', 3.56758304374361),
  ('1831038044', 3.0532468129084323),
  ('1853292631', 3.0532468129084323),
  ('2271478464', 3.0532468129084323),
  ('3418522649', 3.0532468129084323),
  ('956008885', 3.0532468129084323),
 

In [None]:
def labelvecchunks():
    """Print one summary line per entry of ``chunknrs``.

    Each line holds the chunk id, the filename returned by
    ``chunk_tfidf_sum`` and the first ten feature names of its score list.
    """
    for chunknr in chunknrs:
        scored, representative = chunk_tfidf_sum(chunknr, tf, tfidf_matrix)
        top_terms = [term for term, _ in scored][:10]
        print (chunknr+"|"+representative+"|", ",".join(top_terms))

# NOTE(review): labelvecchunks reads chunknrs, which is assigned in a cell further down
# this notebook — that cell has to be executed before this one.
labelvecchunks()

# NOTE(review): hard-coded row index; presumably a leftover spot check of one filename — confirm.
fns[34976]

1546309200|../data/export/15/46\30\92\00\1546309232-cc20147a-fa86-4c59-bff4-5d13fc63ae5b_0-2-0-16-5e-38e0-1| decoded,ins,1168795,295,install,utf,data,file,filename,timestamps
1546309500|../data/export/15/46\30\95\00\1546309679-cc20147a-fa86-4c59-bff4-5d13fc63ae5b_0-2-0-16-1f-2e| decoded,inf,utf,data,file,filename,timestamps,ntfs,text,16
1546310400|../data/export/15/46\31\04\00\1546310515-cc20147a-fa86-4c59-bff4-5d13fc63ae5b_0-2-0-16-5e-1025-1| decoded,ins,1167512,295,install,utf,data,file,filename,timestamps
1546315800|../data/export/15/46\31\58\00\1546316071-cc20147a-fa86-4c59-bff4-5d13fc63ae5b_0-2-0-16-50-208| dll,file,filename,timestamps,ntfs,18658,data,raw,windows,dotnet
1546316100|../data/export/15/46\31\61\00\1546316253-cc20147a-fa86-4c59-bff4-5d13fc63ae5b_0-2-0-16-50-80d| dll,file,filename,timestamps,ntfs,data,raw,dotnet,net,assembly
1546316400|../data/export/15/46\31\64\00\1546316647-cc20147a-fa86-4c59-bff4-5d13fc63ae5b_0-2-0-16-50-5a9| dll,file,filename,timestamps,ntfs,data,ra

In [354]:
# Group the documents by chunk: chunk id -> list of row indices into dataset,
# in order of first appearance.
chunkdocs = dict()
for i, c in enumerate(chunks):
    chunkdocs.setdefault(c, []).append(i)

# One concatenated text per chunk. Every document is prefixed with a single
# space (so each text starts with a space, as before); joining once per chunk
# avoids re-copying the growing string for every document.
chunktxt = {c: ''.join(' ' + dataset[i] for i in rows) for c, rows in chunkdocs.items()}

chunknrs = list(chunktxt.keys())
chunkdataset = list(chunktxt.values())
# Reverse lookup: chunk id -> row index in chunkdataset / chunktfidf_matrix.
revchunknrs = {chunknr: pos for pos, chunknr in enumerate(chunknrs)}

# Perform tf-idf vectorization on the per-chunk texts
# (norm=None keeps the raw tf-idf weights, no per-row normalisation).
chunktf = TfidfVectorizer(norm=None)
chunktfidf_matrix = chunktf.fit_transform(chunkdataset)

In [338]:
# Feature score for a document
def tfidfscores(doc,tf, tfidf_matrix):
    """Return the non-zero tf-idf features of one row, best first.

    Parameters
    ----------
    doc : int
        Row index into ``tfidf_matrix``.
    tf : fitted vectorizer
        Provides the feature names (``get_feature_names_out`` or, on older
        scikit-learn versions, ``get_feature_names``).
    tfidf_matrix : sparse matrix
        Score matrix whose columns line up with the feature names.

    Returns
    -------
    list of (feature_name, score) tuples sorted by descending score;
    empty when the row has no non-zero entries.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only on older installations.
    get_names = getattr(tf, "get_feature_names_out", None) or tf.get_feature_names
    feature_names = np.array(get_names())
    # Column indices of the features that are non-zero in this row.
    feature_index = tfidf_matrix[doc,:].nonzero()[1]
    tfidf_scores = list(zip(feature_names[feature_index], [tfidf_matrix[doc, x] for x in feature_index]))
    return sorted(tfidf_scores, reverse=True, key=lambda x:x[1])

In [340]:
# Ten highest-scoring feature names for chunk '1546404900' in the per-chunk tf-idf matrix.
[x for x,y in tfidfscores(revchunknrs['1546404900'],chunktf, chunktfidf_matrix)][:10]

['file',
 'data',
 'raw',
 'manifest',
 'hash',
 '15063',
 'xml',
 'lost',
 'files',
 '18']

In [341]:
def labelchunks():
    """Print every chunk id followed by its ten highest-scoring feature names.

    Scores come from ``tfidfscores`` applied to the matching row of
    ``chunktfidf_matrix``.
    """
    for row, chunknr in enumerate(chunknrs):
        ranked = tfidfscores(row, chunktf, chunktfidf_matrix)
        print (chunknr, [term for term, _ in ranked][:10])

# Print the top-ten feature names for every chunk.
labelchunks()

1546309200 ['decoded', 'data', 'file', 'ins', 'text', 'raw', 'utf', '1168795', 'install', '295']
1546309500 ['decoded', 'data', 'inf', 'file', 'text', 'raw', 'utf', 'hash', 'filename', 'ntfs']
1546310400 ['decoded', 'data', 'file', 'ins', 'text', 'raw', 'utf', '1167512', 'install', '295']
1546315800 ['file', 'raw', 'data', 'dll', 'filename', 'ntfs', 'timestamps', '18658', 'hash', 'windows']
1546316100 ['file', 'raw', 'data', 'dll', 'filename', 'ntfs', 'timestamps', 'hash', 'createdon', 'changedon']
1546316400 ['file', 'raw', 'data', 'dll', 'filename', 'ntfs', 'timestamps', 'hash', 'createdon', 'changedon']
1546316700 ['file', 'raw', 'data', '1161202', 'edgehtml', 'dll', '18765', 'filename', 'ntfs', 'timestamps']
1546317600 ['file', 'raw', 'data', 'dll', 'filename', 'ntfs', 'timestamps', 'hash', 'createdon', 'changedon']
1546317900 ['file', 'mui', 'raw', 'data', '1160011', 'bootmgr', '18731', 'exe', 'filename', 'ntfs']
1546318200 ['file', 'raw', 'data', '1159817', '18716', 'bootmgr', 'f

In [319]:
# Display every chunk id.
# NOTE(review): the saved output below shows a trailing '"' on each id, unlike the ids
# printed by the other cells — presumably stale output from an earlier run; re-run to confirm.
chunknrs


['1546309200"',
 '1546309500"',
 '1546310400"',
 '1546315800"',
 '1546316100"',
 '1546316400"',
 '1546316700"',
 '1546317600"',
 '1546317900"',
 '1546318200"',
 '1546318500"',
 '1546318800"',
 '1546319400"',
 '1546319700"',
 '1546320300"',
 '1546320600"',
 '1546320900"',
 '1546321200"',
 '1546321500"',
 '1546322100"',
 '1546322400"',
 '1546324200"',
 '1546324500"',
 '1546324800"',
 '1546347300"',
 '1546347600"',
 '1546347900"',
 '1546357200"',
 '1546374600"',
 '1546404900"',
 '1546414500"',
 '1546420500"',
 '1546420800"',
 '1546422600"',
 '1546422900"',
 '1546423200"',
 '1546424100"',
 '1546424400"',
 '1546424700"',
 '1546425000"',
 '1546453200"',
 '1546461600"',
 '1546465800"',
 '1546466100"',
 '1546466400"',
 '1546603800"',
 '1546605900"',
 '1546867800"',
 '1546873200"',
 '1546884600"',
 '1546943400"',
 '1546950900"',
 '1546954800"',
 '1546986300"']