In [215]:
# Import relevant packages
import sklearn
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [216]:
# read the data set

dataset = []
fns = []
chunks = []
for line in open('dataset-example.csv'):
    fn, txt, chunk = line.strip().split('|')
    fns.append(fn)
    dataset.append(txt)
    chunks.append(chunk)

In [253]:
# Perform tfidf vecotrization

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(dataset)
tf = TfidfVectorizer(norm=None)
tfidf_matrix = tf.fit_transform(dataset)

In [254]:
# Feature score for a document
def tfidfscores(doc):
    feature_names = np.array(tf.get_feature_names())
    feature_index = tfidf_matrix[doc,:].nonzero()[1]
    tfidf_scores = list(zip(feature_names[feature_index], [tfidf_matrix[doc, x] for x in feature_index]))
    return sorted(tfidf_scores, reverse=True, key=lambda x:x[1])

In [255]:
# sum of feature scores for multiple documents
def tfidfsum(docs, tf, tfidf_matrix):
    feature_names = np.array(tf.get_feature_names())
    feature_sum = np.zeros(tfidf_matrix[0,:].shape)
    for doc in docs:
        feature_sum += tfidf_matrix[doc,:]
    feature_index = feature_sum.nonzero()[1]
    tfidf_scores = list(zip(feature_names[feature_index], [float(feature_sum[:,x]) for x in feature_index]))
    return sorted(tfidf_scores, reverse=True, key=lambda x:x[1])

In [256]:
def chunk_tfidf_sum(chunk, tf, tfidf_matrix):
    docs = []
    for i,c in enumerate(chunks):
        if chunks[i] == c:
            docs.append(i)
    return tfidfsum(docs,tf,tfidf_matrix)

In [257]:
chunk_tfidf_sum('1546405141',tf,tfidf_matrix)

[('file', 10643.446503155095),
 ('data', 10593.0),
 ('raw', 9095.0),
 ('hash', 3459.0),
 ('unsigned', 3187.436367554707),
 ('ntfs', 3114.5274718571304),
 ('filename', 3106.986243111714),
 ('timestamps', 3106.986243111714),
 ('dll', 2343.6105493788805),
 ('manifest', 2308.6996671969123),
 ('cat', 1706.1780635007015),
 ('signed', 1593.7181837773535),
 ('octet', 1568.7508563567767),
 ('stream', 1568.7508563567767),
 ('picture', 1492.5468556612684),
 ('15063', 1424.8742422227142),
 ('modifiedon', 1419.0),
 ('accessedon', 1416.8280009426321),
 ('createdon', 1416.8280009426321),
 ('changedon', 1413.6376451695442),
 ('exif', 1382.4409400796994),
 ('windows', 1262.8466050903323),
 ('size', 1154.0),
 ('entropy', 1153.0),
 ('filetype', 1153.0),
 ('md5', 1153.0),
 ('mimeclass', 1153.0),
 ('mimetype', 1153.0),
 ('sha1', 1153.0),
 ('sha256', 1153.0),
 ('application', 1145.1694194501172),
 ('xml', 1079.5672774640743),
 ('nsrl', 1075.0),
 ('10', 1061.9124969405414),
 ('31bf3856ad364e35', 1054.9838784

In [266]:
chunktxt = dict()
chunkdocs = dict()
revchunknrs = dict()
for i,c in enumerate(chunks):
    chunktxt[c] = chunktxt.get(c,'') + ' ' + dataset[i]
    l = chunkdocs.get(c)
    if l:
        l.append(i)
    else:
        chunkdocs[c] = [i]
        
chunknrs = list(chunktxt.keys())
chunkdataset = list(chunktxt.values())
revchunknrs = dict([(v,k) for k,v in enumerate(chunknrs)] )

# Perform tfidf vecotrization
chunktf = TfidfVectorizer(norm=None)
chunktfidf_matrix = chunktf.fit_transform(chunkdataset)

In [267]:
# Feature score for a document
def tfidfscores(doc,tf, tfidf_matrix):
    feature_names = np.array(tf.get_feature_names())
    feature_index = tfidf_matrix[doc,:].nonzero()[1]
    tfidf_scores = list(zip(feature_names[feature_index], [tfidf_matrix[doc, x] for x in feature_index]))
    return sorted(tfidf_scores, reverse=True, key=lambda x:x[1])

In [268]:
tfidfscores(revchunknrs['1546405141'],chunktf, chunktfidf_matrix)

[('file', 254.3333500040248),
 ('raw', 252.0),
 ('data', 252.0),
 ('14928', 249.68014962102845),
 ('manifest', 247.45645252064634),
 ('hash', 84.0),
 ('xml', 77.44619926883591),
 ('lost', 67.46895231967346),
 ('files', 66.94557259533319),
 ('18', 66.43179671262368),
 ('stream', 58.57183718087957),
 ('octet', 58.57183718087957),
 ('text', 44.517853043397245),
 ('14917', 29.961617954523412),
 ('extension', 29.456593173024597),
 ('other', 28.87318551235024),
 ('path', 28.25926111155831),
 ('owner', 28.25926111155831),
 ('name', 28.25926111155831),
 ('entryid', 28.25926111155831),
 ('createdon', 28.25926111155831),
 ('changedon', 28.25926111155831),
 ('accessedon', 28.25926111155831),
 ('application', 28.143700393656825),
 ('modifiedon', 28.0),
 ('size', 28.0),
 ('mimetype', 28.0),
 ('mimeclass', 28.0),
 ('lkpdb_kp', 28.0),
 ('hveicse', 28.0),
 ('nsrl', 28.0),
 ('hashmisses', 28.0),
 ('sha256', 28.0),
 ('sha1', 28.0),
 ('md5', 28.0),
 ('filetype', 28.0),
 ('entropy', 28.0),
 ('694', 21.125

In [237]:
chunknrs[100]

'1546322188'

In [238]:
chunkdocs['1546322188']

[753, 754]

In [239]:
chunkdocs['1546405141']

[112,
 113,
 116,
 122,
 130,
 133,
 175,
 201,
 213,
 227,
 236,
 255,
 285,
 302,
 331,
 340,
 341,
 376,
 400,
 417,
 423,
 429,
 436,
 452,
 457,
 462,
 478,
 479]

In [244]:
revchunknrs = dict([(v,k) for k,v in enumerate(chunknrs)] )

In [251]:
revchunknrs['1546405141']

48

In [247]:
chunknrs[48]

'1546405141'