# Alejandro Sánchez Monzón - Assesment Clarity AI (04/06/2024)

> Please read the README.md file to understand more about my feelings and thoughts during the assesment development. 

In [206]:
## Uncomment this line and install pandas in case you don't have it i your local environment.
## pip install pandas

In [207]:
# This line won't work if you didn't install pandas using the above command.
# https://pandas.pydata.org/docs/

import pandas as pd

In [208]:
# Fisrt of all, I found usefull to check if the timestamp received is valid or not.
def is_timestamp_valid(timestamp):
    try:
        pd.to_datetime(timestamp, unit='ms')
        return True
    except ValueError:
        return False

In [209]:
# Let's create the function with the logic required.

def parsing_tool(file_name, init_datetime, end_datetime, hostname):
    # I will check first if the timestamp received is valid or not.
    if not is_timestamp_valid(init_datetime) or not is_timestamp_valid(end_datetime):
        raise ValueError('The timestamp received is not valid.')

    # I need to parse to datetime object the init_datetime and end_datetime variables.
    init_datetime = pd.to_datetime(init_datetime, unit='ms')
    end_datetime = pd.to_datetime(end_datetime, unit='ms')

    # I will read all the lines of the file, naming the columns as the format given.
    df = pd.read_csv(file_name, sep=' ', names=['unix_timestamp', 'hostname 1', 'hostname 2'])

    time_parsed = pd.to_datetime(df['unix_timestamp'], unit='ms')

    # I am going to filter the dataframe based on the init_datetime and end_datetime variables.
    df_filtered = df[(time_parsed >= init_datetime) & (time_parsed <= end_datetime)]

    # Now, I continue filtering the dataframe based on the hostname variable.
    df_filtered = df_filtered[(df_filtered['hostname 1'] == hostname) | (df_filtered['hostname 2'] == hostname)]

    return df_filtered

In [210]:
# At the end, we call the function with all the required parameters.

hostname = 'Taffani'
file_name = 'input-file-10000.txt'
init_datetime = '1565647518147'
end_datetime = '1565649996495'

In [211]:
df_result = parsing_tool(file_name, init_datetime, end_datetime, hostname)

df_result

  pd.to_datetime(timestamp, unit='ms')
  init_datetime = pd.to_datetime(init_datetime, unit='ms')
  end_datetime = pd.to_datetime(end_datetime, unit='ms')


Unnamed: 0,unix_timestamp,hostname 1,hostname 2
79,1565647824566,Chessica,Taffani
188,1565648881102,Shaunee,Taffani
280,1565649679572,Mehret,Taffani


In [212]:
# We show just a list, as is said in the requirements.
# In order to avoid duplicated values, I decided to use a set, which will trate the values as unique. I show the complete dataframe in the above cell with the rest of the information.

set(df_result['hostname 1'].to_list())

{'Chessica', 'Mehret', 'Shaunee'}

## Alternative: using user inputs by console

In [213]:
hostname_input = input("Enter the hostname: ")
end_datetime_input = input("Enter the end datetime (ms): ")
init_datetime_input = input("Enter the init datetime (ms): ")
file_name_input = input("Enter the file name: ")

df_result_inputs = parsing_tool(file_name_input, init_datetime_input, end_datetime_input, hostname_input)

df_result_inputs

  pd.to_datetime(timestamp, unit='ms')
  init_datetime = pd.to_datetime(init_datetime, unit='ms')
  end_datetime = pd.to_datetime(end_datetime, unit='ms')


Unnamed: 0,unix_timestamp,hostname 1,hostname 2
79,1565647824566,Chessica,Taffani
188,1565648881102,Shaunee,Taffani
280,1565649679572,Mehret,Taffani


# Testing for this two methods

In [214]:
# I import the testing library I am going to use to test my functions.

import unittest

In [221]:
# I create a class with my functions to test the original ones.

class TestParsingTool(unittest.TestCase):
    def test_is_timestamp_valid(self):
        test_file_name = 'input-file-10000.txt'
        test_end_datetime = '1565649996495'
        test_init_datetime = 'INVALID_TIMESTAMP'
        test_hostname = 'Taffani'

        with self.assertRaises(ValueError):
            parsing_tool(test_file_name, test_init_datetime, test_end_datetime, test_hostname)

    def test_parsing_tool(self):
        test_file_name = 'input-file-10000.txt'
        test_end_datetime = '1565649996495'
        test_init_datetime = '1565647518147'
        test_hostname = 'Taffani'

        test_result = parsing_tool(test_file_name, test_init_datetime, test_end_datetime, test_hostname)
        self.assertIn('Shaunee', test_result['hostname 1'].to_list())


unittest.main(argv=[''], exit=False)

  pd.to_datetime(timestamp, unit='ms')
  init_datetime = pd.to_datetime(init_datetime, unit='ms')
  end_datetime = pd.to_datetime(end_datetime, unit='ms')
.
----------------------------------------------------------------------
Ran 2 tests in 0.035s

OK


<unittest.main.TestProgram at 0x7f37f1ca7210>

# Optional requirement

In [242]:
# I will need some more libraries this time in order to use threading correctly and work with timestamps.

import pandas as pd
import time
from datetime import datetime, timedelta
import threading

In [243]:
class Monitor:
    def __init__(self, hostname, file_name):
        self.hostname = hostname
        self.file_name = file_name
        self.last_timestamp = datetime.now() - timedelta(days=1)
        self.df = pd.DataFrame(columns=['unix_timestamp', 'hostname 1', 'hostname 2'])

    # This function will be called in a thread and continuously update the dataframe.
    def update_data(self):
        while True:
                # I read the file and create a new dataframe with only the new information.
                new_df = pd.read_csv(self.file_name, sep=' ', names=['unix_timestamp', 'hostname 1', 'hostname 2'], skiprows=len(self.df))
                new_df['unix_timestamp'] = pd.to_datetime(new_df['unix_timestamp'], unit='ms')

                # I update the dataframe with the new information.
                self.df = pd.concat([self.df, new_df])

                time.sleep(60)

    # This function will be called in a thread and will generate the report where I collect data such as hostnames connected, received and most connected.
    def generate_report(self):
        while True:
            actual_time = datetime.now()

            if actual_time - self.last_timestamp >= timedelta(minutes=1):
                self.last_timestamp = actual_time
                hour_aux = actual_time - timedelta(hours=1)

                last_hour_df = self.df[(self.df['unix_timestamp'] >= hour_aux) & (self.df['unix_timestamp'] <= actual_time)]

                hostnames_connected = last_hour_df[last_hour_df['hostname 1'] == self.hostname]['hostname 2'].unique().tolist()
                hostnames_received = last_hour_df[last_hour_df['hostname 2'] == self.hostname]['hostname 1'].unique().tolist()

                if not last_hour_df.empty:
                    most_connections = last_hour_df['hostname 1'].value_counts().idxmax()
                else:
                    most_connections = None

                pd.DataFrame({'hostname 1': hostnames_connected, 'hostname 2': hostnames_received, 'most_connected': most_connections, 'last_timestamp': actual_time}).to_csv(f'output/report_{actual_time.timestamp()}.csv', index=False)

            time.sleep(60 * 60)

In [244]:
# I declared the variables for the monitor object.
file_name_realtime = 'input-file-10000.txt'
hostname_realtime = 'Taffani'

monitor = Monitor(hostname_realtime, file_name_realtime)

In [245]:
# Now, I am ready to call the threads and generate the reports.
monitor_thread = threading.Thread(target=monitor.update_data)
monitor_thread.start()

monitor.generate_report()

  self.df = pd.concat([self.df, new_df])


KeyboardInterrupt: 