# Alejandro Sánchez Monzón - Assesment Clarity AI (04/06/2024)

> Please read the README.md file to understand more about my intentions and thoughts during the assesment development. 

In [1]:
## Uncomment this line and install pandas in case you don't have it i your local environment.
## pip install pandas

In [2]:
# This line won't work if you didn't install pandas using the above command.
# https://pandas.pydata.org/docs/

import pandas as pd

In [3]:
# Fisrt of all, I found usefull to check if the timestamp received is valid or not.
def is_timestamp_valid(timestamp):
    try:
        pd.to_datetime(timestamp, unit='ms')
        return True
    except ValueError:
        return False

In [4]:
# I can check too if the file received is valid or not.
def is_file_valid(file_name):
    try:
        pd.read_csv(file_name, sep=' ', names=['unix_timestamp', 'hostname_1', 'hostname_2'])
        return True
    except Exception:
        return False

In [5]:
# Let's create the function with the logic required.

def parsing_tool(file_name, init_datetime, end_datetime, hostname):
    # I will check first if the timestamp received is valid or not.
    if not is_timestamp_valid(init_datetime) or not is_timestamp_valid(end_datetime):
        raise ValueError('The timestamp received is not valid.')

    # I will check if the file received is valid or not.
    if not is_file_valid(file_name):
        raise Exception('The file received is not valid.')

    # I need to parse to datetime object the init_datetime and end_datetime variables.
    init_datetime = pd.to_datetime(init_datetime, unit='ms')
    end_datetime = pd.to_datetime(end_datetime, unit='ms')

    # I will read all the lines of the file, naming the columns as the format given.
    df = pd.read_csv(file_name, sep=' ', names=['unix_timestamp', 'hostname_1', 'hostname_2'])

    time_parsed = pd.to_datetime(df['unix_timestamp'], unit='ms')

    # I am going to filter the dataframe based on the init_datetime and end_datetime variables.
    df_filtered = df[(time_parsed >= init_datetime) & (time_parsed <= end_datetime)]

    # Now, I continue filtering the dataframe based on the hostname variable.
    df_filtered = df_filtered[(df_filtered['hostname_2'] == hostname)]

    # We apply the unique() and tolist() functions to get a list with the unique hostnames (no duplicates).
    return df_filtered['hostname_1'].unique().tolist()

In [6]:
# At the end, we call the function with all the required parameters.

hostname = 'Taffani'
file_name = 'input-file-10000.txt'
init_datetime = '1565647518147'
end_datetime = '1565649996495'

In [7]:
df_result = parsing_tool(file_name, init_datetime, end_datetime, hostname)

df_result

  pd.to_datetime(timestamp, unit='ms')
  init_datetime = pd.to_datetime(init_datetime, unit='ms')
  end_datetime = pd.to_datetime(end_datetime, unit='ms')


['Chessica', 'Shaunee', 'Mehret']

## Alternative: using user inputs by console

In [8]:
hostname_input = input("Enter the hostname: ") #Alizander
end_datetime_input = input("Enter the end datetime (ms): ") #1565727297921
init_datetime_input = input("Enter the init datetime (ms): ") #1565656183599
file_name_input = input("Enter the file name: ") #input-file-10000.txt

df_result_inputs = parsing_tool(file_name_input, init_datetime_input, end_datetime_input, hostname_input)

df_result_inputs

  pd.to_datetime(timestamp, unit='ms')
  init_datetime = pd.to_datetime(init_datetime, unit='ms')
  end_datetime = pd.to_datetime(end_datetime, unit='ms')


['Kaitley',
 'Rashawna',
 'Zebastian',
 'Lacinda',
 'Jaylen',
 'Hashem',
 'Chasten',
 'Karletta',
 'Rukaya',
 'Giany',
 'Jitesh',
 'Arisleidy',
 'Edrin']

# Testing for this two methods

In [9]:
# As I said in the README.md, I am using Anaconda and I am not sure if it preinstall the unittest librery so y add the install command below.
# pip install unittest

In [10]:
# I import the testing library I am going to use to test my functions.

import unittest

In [11]:
# I create a class with my functions to test the original ones.

class TestParsingTool(unittest.TestCase):
    def test_is_timestamp_valid(self):
        test_file_name = 'input-file-10000.txt'
        test_end_datetime = '1565649996495'
        test_init_datetime = 'INVALID_TIMESTAMP'
        test_hostname = 'Taffani'

        with self.assertRaises(ValueError):
            parsing_tool(test_file_name, test_init_datetime, test_end_datetime, test_hostname)

    def test_is_file_valid(self):
        test_file_name = 'INVALID_FILE.txt'
        test_end_datetime = '1565649996495'
        test_init_datetime = '1565647518147'
        test_hostname = 'Taffani'

        with self.assertRaises(Exception):
            parsing_tool(test_file_name, test_init_datetime, test_end_datetime, test_hostname)

    def test_parsing_tool(self):
        test_file_name = 'input-file-10000.txt'
        test_end_datetime = '1565649996495'
        test_init_datetime = '1565647518147'
        test_hostname = 'Taffani'

        test_result = parsing_tool(test_file_name, test_init_datetime, test_end_datetime, test_hostname)
        self.assertIn('Shaunee', test_result)


unittest.main(argv=[''], exit=False)

  pd.to_datetime(timestamp, unit='ms')
  init_datetime = pd.to_datetime(init_datetime, unit='ms')
  end_datetime = pd.to_datetime(end_datetime, unit='ms')
.
----------------------------------------------------------------------
Ran 3 tests in 0.041s

OK


<unittest.main.TestProgram at 0x7f6856a17190>

# Optional requirement

In [75]:
# I will need some more libraries this time in order to use threading correctly and work with timestamps.

import pandas as pd
import time
from datetime import datetime, timedelta
import threading

In [76]:
file_name_realtime = 'input-file-10000.txt'
df_realtime = pd.DataFrame(columns=['unix_timestamp', 'hostname_1', 'hostname_2'])

In [77]:
# This function will be called in a thread and continuously update the dataframe.
def update_data():
    while True:
        # I had to use the global variable to access and update the dataframe in the thread.
        global df_realtime

        # I read the file and create a new dataframe with only the new information.
        new_df = pd.read_csv(file_name_realtime, sep=' ', names=['unix_timestamp', 'hostname_1', 'hostname_2'], skiprows=len(df_realtime))
        new_df['unix_timestamp'] = pd.to_datetime(new_df['unix_timestamp'], unit='ms')

        # I update the dataframe with the new information.
        df_realtime = pd.concat([df_realtime, new_df])


  df_realtime = pd.concat([df_realtime, new_df])


In [81]:
# This function will be called in a thread and will generate the report where I collect data such as hostnames connected, received and most connected.
def generate_report(last_timestamp=last_timestamp, df=df_realtime, hostname=hostname_realtime):
    while True:
        actual_time = datetime.now()

        # I check here if he hour is passed or not.
        if actual_time - last_timestamp >= timedelta(hours=1):
            last_timestamp = actual_time
            hour_aux = actual_time - timedelta(hours=1)

            # I create a Dataframe with the information of the last hour.
            last_hour_df = df[(df['unix_timestamp'] >= hour_aux) & (df['unix_timestamp'] <= actual_time)]

            hostnames_connected = last_hour_df[last_hour_df['hostname_1'] == hostname]['hostname_2'].unique().tolist()
            hostnames_received = last_hour_df[last_hour_df['hostname_2'] == hostname]['hostname_1'].unique().tolist()

            # idxmax() returns the index of the maximum value, but I need to ensure there are values.
            if not last_hour_df.empty:
                most_connections = last_hour_df['hostname_1'].value_counts().idxmax()
            else:
                most_connections = None

            print(f'Hostnames connected in the last hour: {hostnames_connected}')
            print(f'Hostnames received in the last hour: {hostnames_received}')
            print(f'Most connected in the last hour: {most_connections}')
            print('-------------------------------------------')

            # I decided to save the report in a csv file as well as I print the information.
            csv_title_time = int(actual_time.timestamp() * 1000)
            pd.DataFrame({'hostnames_connected': hostnames_connected, 'hostnames_received': hostnames_received, 'most_connected': most_connections}).to_csv(f'output/report_{csv_title_time}.csv', index=False)

        time.sleep(60 * 60)

In [82]:
# Now, I am ready to call the threads and generate the reports.
monitor_thread = threading.Thread(target=update_data)

# I start the threads.
monitor_thread.start()

In [83]:
last_timestamp = datetime.now() - timedelta(hours=1)
hostname_realtime = 'Taffani'

generate_report(last_timestamp, df_realtime, hostname_realtime)

Hostnames connected in the last hour: []
Hostnames received in the last hour: []
Most connected in the last hour: None
-------------------------------------------
