In [71]:
import ast
import locale
import os

import pandas as pd

In [2]:
# Starts empty; later filled with one entry per unique triple (key) and its timestamp (value)
first_auth = dict()

In [3]:
# Creates a generator object to iterate through the file in reverse order
# from srohde, here: https://stackoverflow.com/questions/2301789/read-a-file-in-reverse-order-using-python

def reverse_readline(filename, buf_size=8192, encoding=None):
    """Yield the non-empty lines of a text file from last to first.

    The file is read backwards in chunks of ``buf_size`` bytes, so only one
    chunk plus one partial line is held in memory at a time.  The file is
    opened in binary mode: seeking to arbitrary byte offsets is only well
    defined for binary files, and joining the raw bytes of a line *before*
    decoding means a multi-byte character that straddles a chunk boundary
    can never be cut in half.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    buf_size : int, optional
        Number of bytes fetched per read.  Must be positive.
    encoding : str, optional
        Encoding used to decode each line.  ``None`` (the default) falls back
        to ``locale.getpreferredencoding(False)``.  The encoding must encode
        a newline as the single byte 0x0A (ASCII, UTF-8, Latin-1, cp1252 ...).

    Yields
    ------
    str
        Each non-empty line without its ``\\n`` / ``\\r\\n`` terminator,
        starting with the last line of the file.  Empty lines are skipped.

    Raises
    ------
    ValueError
        If ``buf_size`` is not positive (raised when iteration starts).
    """
    if buf_size <= 0:
        # A non-positive chunk size would never make progress through the file.
        raise ValueError("buf_size must be a positive integer")
    if encoding is None:
        encoding = locale.getpreferredencoding(False)

    def to_text(raw):
        # Drop the CR of a CRLF terminator, then decode the complete line.
        if raw.endswith(b'\r'):
            raw = raw[:-1]
        return raw.decode(encoding)

    with open(filename, 'rb') as fh:
        # Bytes of the (possibly incomplete) first line of the chunk read last;
        # None until the first chunk has been processed.
        segment = None
        fh.seek(0, os.SEEK_END)
        position = fh.tell()  # everything before this offset is still unread
        while position > 0:
            read_size = min(buf_size, position)
            position -= read_size
            fh.seek(position)
            lines = fh.read(read_size).split(b'\n')
            if segment is not None:
                # The carried-over bytes are the tail of this chunk's last
                # line.  When the chunk ends exactly on a newline, lines[-1]
                # is b'' and this simply restores the carried-over line.
                lines[-1] += segment
            # The first piece may continue into the previous (earlier) chunk,
            # so hold it back until that chunk has been read.
            segment = lines[0]
            for index in range(len(lines) - 1, 0, -1):
                text = to_text(lines[index])
                if text:
                    yield text
        # Whatever is left is the first line of the file (None if it was empty)
        if segment is not None:
            text = to_text(segment)
            if text:
                yield text

In [4]:
%%time
# Since the data are ordered by time, this code loops backward through auth.txt
#
# The time, source_user@domain, source_computer, and destination_computer values
# are read on each row.  A dictionary (first_auth) is populated with the results
# where the triple (source_user@domain,source_computer,destination_computer) is the key
# and the timestamp is the value.  The key is stored as the repr() string of the triple's list.
#
# Since a dictionary's keys need to be unique, when a duplicate triple is introduced
# to the dictionary, the timestamp is replaced.  At the end of the program
# the first timestamp for each unique triple should remain, since each event is introduced
# in reverse chronological order.

retain_index = [(0,2),(3,5)] # Index for time, source_user@domain, source_computer, destination_computer only
min_fields = max(end for _, end in retain_index) # A usable row needs at least this many comma-separated fields

for line in reverse_readline("auth.txt"):
    fields = line.split(',') # Split once per line instead of once per slice
    if len(fields) < min_fields:
        # Short/malformed rows would store a partial key and break the later
        # DataFrame construction, so they are skipped.
        continue
    triple = [item for start, end in retain_index for item in fields[start:end]]
    # Key: repr() of [source_user@domain, source_computer, destination_computer]; value: time.
    # Rows arrive newest-first, so each overwrite leaves the earliest time seen for the key.
    first_auth[repr(triple[1:])] = triple[0]

Wall time: 56min 52s


In [5]:
# Number of keys in first_auth, i.e. one per unique triple collected above
len(first_auth)

1629247

In [111]:
# Each key of first_auth is the repr() of a [source_user, source_computer,
# destination_computer] list and each value is the timestamp string stored for it.
# Turn every entry into one [time, source_user, source_computer, destination_computer] row.
first_auth_rows = [
    [stamp] + list(ast.literal_eval(key)) # Convert the key from string back to list
    for key, stamp in first_auth.items()
]
first_auth_df = (
    pd.DataFrame(
        first_auth_rows,
        columns=['time','source_user','source_computer','destination_computer'],
    )
    .astype({'time': int}) # Change the time values from strings to integers
    .sort_values('time') # Sort by ascending time values
    .reset_index(drop=True) # Renumber rows 0..n-1 in sorted order
)

In [114]:
# Show the first five rows (the frame is sorted by ascending time)
first_auth_df.head(n=5)

Unnamed: 0,time,source_user,source_computer,destination_computer
0,1,C625$@DOM1,C2052,C625
1,1,C1678$@DOM1,C625,C625
2,1,C1678$@DOM1,C1065,C1065
3,1,C922$@DOM1,C586,C586
4,1,C1678$@DOM1,C457,C457


In [115]:
# Show the last five rows (the frame is sorted by ascending time)
first_auth_df.tail(n=5)

Unnamed: 0,time,source_user,source_computer,destination_computer
1629242,5010104,U9424@DOM1,C15753,C612
1629243,5010259,U8224@DOM1,C1385,C2327
1629244,5010626,U747@DOM1,C5785,C743
1629245,5010835,U8500@DOM1,C1128,C529
1629246,5010890,U8500@DOM1,C1128,C457


In [116]:
# Save file: comma-separated, no header row; the row index is written as the first column
# (header is documented as bool or list of str, so pass False explicitly rather than None)
first_auth_df.to_csv('first_auth.txt', header=False, sep=',')