In [None]:
import re
from datetime import datetime

import math
import pandas as pd

In [None]:
# Read the Excel workbook and export it as CSV (without the index column);
# the later cells read this CSV file.
data = pd.read_excel('./dataset/Coffee Shop Sales.xlsx')
data.to_csv('./dataset/Coffee Shop Sales.csv', index=False)

In [None]:
# Load the exported CSV; the column-width computation below uses this frame.
data2 = pd.read_csv('./dataset/Coffee Shop Sales.csv')

In [None]:
# Widest textual representation per column: for every column, the length of
# the longest str() of any distinct value it contains.
columns = data2.columns
size_per_column = {
    column: max(len(str(value)) for value in data2[column].unique())
    for column in columns
}
column_widths = list(size_per_column.values())
print(size_per_column)
print(column_widths)
# Total record width: all field widths plus one separator character per field.
print(sum(column_widths) + len(size_per_column))

In [None]:
def infer_type(value):
    """Classify a text value as 'int', 'float', 'date', 'time' or 'string'.

    The checks run in this order and the first one that succeeds wins:

    * 'int'   - optional sign followed by digits only;
    * 'float' - optional sign, optional digits, a dot, then at least one digit;
    * 'date'  - ``YYYY-MM-DD`` that also parses as a real calendar date;
    * 'time'  - ``HH:MM:SS`` that also parses as a real time of day;
    * 'string' - anything else, including date/time look-alikes that fail
      to parse (e.g. month 13 or hour 25).

    Args:
        value: The text to classify (a ``str``).

    Returns:
        One of the type names listed above.
    """
    # re.fullmatch requires the whole value to match. With re.match, a
    # pattern ending in `$` also accepts a value followed by a trailing
    # newline, so an unstripped '42\n' used to be reported as 'int'.
    if re.fullmatch(r'[+-]?\d+', value):
        return 'int'

    if re.fullmatch(r'[+-]?\d*\.\d+', value):
        return 'float'

    # The pattern only checks the shape; strptime validates the actual values.
    if re.fullmatch(r'\d{4}-\d{2}-\d{2}', value):
        try:
            datetime.strptime(value, '%Y-%m-%d')
            return 'date'
        except ValueError:
            return 'string'  # Shaped like a date but not a valid one

    if re.fullmatch(r'\d{2}:\d{2}:\d{2}', value):
        try:
            datetime.strptime(value, '%H:%M:%S')
            return 'time'
        except ValueError:
            return 'string'  # Shaped like a time but not a valid one

    # If none of the above, treat it as a string
    return 'string'

def infer_types_from_record(record, record_length):
    """Infer the type name of the first ``record_length`` fields of a record.

    The record is stripped, split on ``','`` and each piece is stripped
    before being classified with ``infer_type``.

    Args:
        record: One comma-separated record as text.
        record_length: Number of leading fields to report.

    Returns:
        A list of type names, one per reported field.
    """
    pieces = record.strip().split(',')
    # Classification is per field, so cutting the list before classifying
    # gives the same result as classifying everything and cutting afterwards.
    wanted = pieces[:record_length]
    return list(map(lambda piece: infer_type(piece.strip()), wanted))

def range_between_integers(start, end):
    """Return the integers from ``int(start)`` up to, but not including, ``int(end)``.

    Args:
        start: First value of the range; converted with ``int()``.
        end: Exclusive upper bound; converted with ``int()``.

    Returns:
        A list of ints (empty when ``start >= end``).
    """
    first, stop = int(start), int(end)
    return list(range(first, stop))

def range_between_dates(start, end):
    """Return every calendar day produced by ``pd.date_range(start, end)``.

    Args:
        start: First date, in a form ``pd.date_range`` accepts.
        end: Last date, in a form ``pd.date_range`` accepts.

    Returns:
        A list of ``'YYYY-MM-DD'`` strings, one per generated day.
    """
    days = pd.date_range(start=start, end=end)
    return [day.strftime('%Y-%m-%d') for day in days]

def range_between_times(start, end):
    """Return one ``'HH:MM:SS'`` string per second produced by ``pd.date_range``.

    Args:
        start: First time of day, in a form ``pd.date_range`` accepts.
        end: Last time of day, in a form ``pd.date_range`` accepts.

    Returns:
        A list of ``'HH:MM:SS'`` strings at one-second steps.
    """
    seconds = pd.date_range(start, end, freq="1s")
    return [moment.strftime('%H:%M:%S') for moment in seconds]

def generate_range(range_type, start, end):
    """Expand ``start``..``end`` into a list of values for the given type.

    Args:
        range_type: ``'int'``, ``'date'`` or ``'time'``.
        start: Lower bound, passed through to the type-specific helper.
        end: Upper bound, passed through to the type-specific helper.

    Returns:
        The list built by the matching ``range_between_*`` helper, or ``-1``
        when ``range_type`` is none of the supported names.
    """
    if range_type == 'int':
        return range_between_integers(start, end)
    if range_type == 'date':
        return range_between_dates(start, end)
    if range_type == 'time':
        return range_between_times(start, end)
    # Unsupported type: signal with the -1 sentinel.
    return -1

def check_interval(interval_type, start, end):
    """Check that ``start`` lies strictly before ``end`` for the given type.

    Args:
        interval_type: ``'int'``, ``'date'`` (``YYYY-MM-DD``) or ``'time'``
            (``HH:MM:SS``).
        start: Lower bound (text, or an int for ``'int'``).
        end: Upper bound (text, or an int for ``'int'``).

    Returns:
        ``0`` when ``start < end``; ``-1`` when the interval is empty or
        reversed, when ``interval_type`` is not supported, or when an
        ``'int'`` bound cannot be converted to an integer.

    Raises:
        ValueError: If a ``'date'``/``'time'`` bound does not match its format.
    """
    if interval_type == 'int':
        # Compare numerically. The bounds usually arrive as text, and text
        # comparison is lexicographic ('9' > '10'), which used to reject
        # valid intervals and accept reversed ones.
        try:
            lower, upper = int(start), int(end)
        except (TypeError, ValueError):
            return -1
    elif interval_type == 'date':
        lower = datetime.strptime(start, '%Y-%m-%d')
        upper = datetime.strptime(end, '%Y-%m-%d')
    elif interval_type == 'time':
        lower = datetime.strptime(start, '%H:%M:%S')
        upper = datetime.strptime(end, '%H:%M:%S')
    else:
        return -1

    return 0 if lower < upper else -1

In [None]:
class Fixed_Size_Heap:
    """Heap file of fixed-width records kept in memory as a list of block strings.

    Every CSV record is rewritten so that field ``k`` is right-padded with
    spaces to ``field_sizes[k]`` characters and followed by a ``','``
    separator. Records are appended one after another into strings
    ("blocks"); a new block is started when the next record would no longer
    fit within ``block_size`` characters. A deleted record is overwritten
    with spaces and its position is remembered in ``deleted_records``.
    """

    def __init__(self, block_size, field_sizes, filename):
        """Load ``filename`` (CSV with a header line) into blocks.

        Args:
            block_size: Maximum number of characters stored in one block.
            field_sizes: Width, in characters, reserved for each field.
            filename: Path of the CSV file to load.
        """
        self.block_size = block_size
        self.field_sizes = field_sizes
        self._set_record_size()
        self.blocks = []
        # Initialised before loading so the object is complete even if
        # reading the file fails part-way.
        self.deleted_records = []
        self._read_file(filename)

    def _set_field_names(self, line):
        """Store the column names found in the CSV header ``line``."""
        # Strip the line ending (and stray blanks); otherwise the last name
        # keeps its '\n' and can never be found with field_names.index().
        self.field_names = [name.strip() for name in line.strip().split(',')]

    def _set_record_size(self):
        """Compute the stored record width: all field widths plus one separator each."""
        self.record_size = sum(self.field_sizes) + len(self.field_sizes)

    def _set_field_types(self):
        """Infer one type name per column from the first stored record."""
        if not self.blocks:
            # No records were loaded, so there is nothing to infer from.
            self.field_types = []
            return
        record = self.blocks[0][:self.record_size]
        self.field_types = infer_types_from_record(record, len(self.field_names))

    def _padding(self, field, field_id):
        """Right-pad ``field`` with spaces up to the width reserved for ``field_id``."""
        return field.ljust(self.field_sizes[field_id])

    def _format_record(self, record):
        """Turn one CSV line into its stored form.

        Each field is padded to its reserved width and followed by ``','``.
        A field longer than its reserved width is kept whole (not truncated).
        """
        fields = record.strip().split(',')
        return ''.join(
            self._padding(field, field_id) + ','
            for field_id, field in enumerate(fields)
        )

    def _write_record(self, record):
        """Append ``record`` to the last block, or start a new block if it is full."""
        formatted = self._format_record(record)
        # `<=` lets a record fill a block exactly, which matches the
        # floor(block_size / record_size) records per block used by _search.
        if self.blocks and len(self.blocks[-1]) + self.record_size <= self.block_size:
            self.blocks[-1] += formatted
        else:
            self.blocks.append(formatted)

    def _read_file(self, filename):
        """Read the header and every record of ``filename`` into blocks."""
        with open(filename, 'r') as file:
            self._set_field_names(file.readline())
            for record in file:
                self._write_record(record)
            self._set_field_types()

    def _search(self, field_id, value):
        """Yield ``[block_id, record_id]`` for each record whose field equals ``value``.

        The stored field is stripped before comparison; blank fields (empty
        slots and deleted records) are skipped. When ``field_id`` is 0 the
        scan stops after the first match. Nothing is yielded when there is
        no match, so callers detect "not found" by seeing no iterations.

        Args:
            field_id: Index of the field to compare.
            value: Text the stripped field must equal.
        """
        field_size = self.field_sizes[field_id]
        records_per_block = math.floor(self.block_size / self.record_size)
        # Position of the field inside a record: widths of the preceding
        # fields plus one separator character for each of them.
        field_offset = sum(self.field_sizes[:field_id]) + field_id
        for block_id in range(len(self.blocks)):
            for record_id in range(records_per_block):
                offset = self.record_size * record_id + field_offset
                # Index self.blocks each time: callers may rewrite a block
                # (deletion) while this generator is suspended.
                field_value = self.blocks[block_id][offset:offset + field_size].strip()
                if field_value == '':
                    continue
                if field_value == value:
                    yield [block_id, record_id]
                    if field_id == 0:
                        return

    def _select(self, select_container, block_id, record_id):
        """Append the stored text of the addressed record to ``select_container``."""
        offset = self.record_size * record_id
        record = self.blocks[block_id][offset:offset + self.record_size]
        select_container.append(record)

    def select_by_single_primary_key(self, key):
        """Return the record whose first field equals ``key``.

        Returns:
            A one-element list holding the stored record text.

        Raises:
            Exception: If no record has that key.
        """
        select_container = []
        for (i, j) in self._search(field_id=0, value=key):
            self._select(select_container=select_container, block_id=i, record_id=j)
        if not select_container:
            raise Exception('SelectionError: Primary Key nonexistent.')
        return select_container

    def select_by_multiple_primary_key(self, keys):
        """Return the records whose first field equals any of ``keys``.

        Keys without a matching record are skipped.

        Raises:
            Exception: If none of the keys matches a record.
        """
        select_container = []
        exception_counter = 0
        for key in keys:
            found = False
            for (i, j) in self._search(field_id=0, value=key):
                found = True
                self._select(select_container=select_container, block_id=i, record_id=j)
            if not found:
                exception_counter += 1
        if exception_counter == len(keys):
            raise Exception('SelectionError: Primary Keys nonexistent.')
        return select_container

    def select_by_field_interval(self, field, start, end):
        """Return the records whose ``field`` equals any value generated for the interval.

        The interval is validated with ``check_interval`` and expanded with
        ``generate_range`` according to the inferred type of ``field``; the
        file is then searched once per generated value.

        Raises:
            Exception: If the interval is invalid or the field type has no
                range generator, or if no generated value matches a record.
        """
        field_id = self.field_names.index(field)
        field_type = self.field_types[field_id]
        possible_field_interval = check_interval(interval_type=field_type, start=start, end=end)
        if possible_field_interval == -1:
            raise Exception('SelectionError: Field Interval incomputable.')
        value_range = generate_range(range_type=field_type, start=start, end=end)
        if value_range == -1:
            raise Exception('SelectionError: Field Interval incomputable.')
        select_container = []
        exception_counter = 0
        for value in value_range:
            found = False
            for (i, j) in self._search(field_id=field_id, value=str(value)):
                found = True
                self._select(select_container=select_container, block_id=i, record_id=j)
            if not found:
                exception_counter += 1
        if exception_counter == len(value_range):
            raise Exception('SelectionError: Requested Records nonexistent.')
        return select_container

    def _delete_record(self, block_id, record_id):
        """Overwrite the addressed record with spaces and remember its position."""
        offset = self.record_size * record_id
        head = self.blocks[block_id][:offset]
        body = ' ' * self.record_size
        tail = self.blocks[block_id][offset + self.record_size:]
        self.blocks[block_id] = head + body + tail
        self.deleted_records.append([block_id, record_id])

    def delete_record_by_primary_key(self, key):
        """Delete the record whose first field equals ``key``.

        Raises:
            Exception: If no record has that key.
        """
        found = False
        for (i, j) in self._search(field_id=0, value=key):
            found = True
            self._delete_record(block_id=i, record_id=j)
        if not found:
            raise Exception('DeleteError: Primary Key nonexistent.')

    def delete_record_by_criterion(self, field, value):
        """Delete every record whose ``field`` equals ``value``.

        Raises:
            Exception: If no record matches.
        """
        field_id = self.field_names.index(field)
        found = False
        for (i, j) in self._search(field_id=field_id, value=value):
            found = True
            self._delete_record(block_id=i, record_id=j)
        if not found:
            raise Exception('DeleteError: Field Value nonexistent.')

In [None]:
# CSV file (written by the export cell above) that the heap file is built from.
filename = './dataset/Coffee Shop Sales.csv'

In [None]:
# Build the in-memory heap file: 512-character blocks, with one field width
# per column taken from size_per_column computed earlier.
myfile = Fixed_Size_Heap(
                    block_size=512,
                    field_sizes=list(size_per_column.values()),
                    filename=filename)

In [None]:
# Display the first block as the cell's result.
myfile.blocks[0]

In [None]:
# Blank out the record whose first field equals '3'.
myfile.delete_record_by_primary_key('3')

In [None]:
# Display the second block as the cell's result.
myfile.blocks[1]

In [None]:
# Blank out every record whose product_category field equals 'Coffee'.
myfile.delete_record_by_criterion(field='product_category', value='Coffee')

In [None]:
# Blank out every record whose transaction_date field equals '2023-01-01'.
myfile.delete_record_by_criterion(field='transaction_date', value='2023-01-01')

In [None]:
# Look up the record whose first field equals '3' and print what was selected.
select_container = myfile.select_by_single_primary_key('3')
print(select_container)

In [None]:
# Look up several primary keys in one call and print what was selected.
select_container = myfile.select_by_multiple_primary_key(['3', '5', '6'])
print(select_container)

In [None]:
# Interval query on store_id between '5' and '8'; show the first five
# selected records and how many were selected in total.
select_container = myfile.select_by_field_interval(field='store_id', start='5', end='8')
print(select_container[:5])
print(len(select_container))

In [None]:
# Interval query on transaction_date; show the first five selected records
# and how many were selected in total.
select_container = myfile.select_by_field_interval(field='transaction_date', start='2023-01-01', end='2023-01-02')
print(select_container[:5])
print(len(select_container))

In [None]:
# Interval query on transaction_time; show the first five selected records
# and how many were selected in total.
select_container = myfile.select_by_field_interval(field='transaction_time', start='07:06:11', end='07:09:11')
print(select_container[:5])
print(len(select_container))

In [None]:
# Scratch check of in-place writes: open an existing file for read/update,
# move to position 10 and write 'Testing' starting there.
with open('test.txt', 'r+') as file:
    file.seek(10)
    file.write('Testing')

In [None]:
# Scratch check: join the list items into one comma-separated string.
l = ['ada', 'aba', 'aca']

s = ','.join(str(elem) for elem in l)

print(s)

In [None]:
import gzip
import shutil

def comprimir_arquivo(nome_arquivo):
    """Write a gzip-compressed copy of a file next to it.

    The copy is named ``nome_arquivo + '.gz'``; the source file is left in
    place. A confirmation message is printed when the copy is done.

    Args:
        nome_arquivo: Path of the file to compress.
    """
    destino = nome_arquivo + '.gz'
    with open(nome_arquivo, 'rb') as origem, gzip.open(destino, 'wb') as saida:
        shutil.copyfileobj(origem, saida)
    print(f'Arquivo {nome_arquivo} comprimido para {nome_arquivo}.gz')

# Example usage (call left disabled)
# comprimir_arquivo('./dataset/Coffee Shop Sales.txt')

In [None]:
def descomprimir_arquivo(nome_arquivo):
    """Decompress a gzip file into a sibling file.

    The output path is ``nome_arquivo`` without its last three characters,
    i.e. the name is assumed to end in ``'.gz'``. A confirmation message is
    printed when the copy is done.

    Args:
        nome_arquivo: Path of the gzip file to decompress.
    """
    destino = nome_arquivo[:-3]
    with gzip.open(nome_arquivo, 'rb') as origem, open(destino, 'wb') as saida:
        shutil.copyfileobj(origem, saida)
    print(f'Arquivo {nome_arquivo} descomprimido para {nome_arquivo[:-3]}')

# Example usage
descomprimir_arquivo('./dataset/Coffee Shop Sales.txt.gz')

In [None]:
import zipfile

def comprimir_arquivo_txt(arquivo_txt, arquivo_zip):
    """Create a zip archive containing a single, deflate-compressed file.

    Args:
        arquivo_txt: Path of the file to store; also used as its name
            inside the archive.
        arquivo_zip: Path of the zip archive to create (opened in ``'w'`` mode).
    """
    pacote = zipfile.ZipFile(arquivo_zip, mode='w', compression=zipfile.ZIP_DEFLATED)
    with pacote:
        pacote.write(arquivo_txt, arcname=arquivo_txt)

# Example usage
comprimir_arquivo_txt('./dataset/Coffee Shop Sales 2.txt', 'exemplo.zip')

In [None]:
import zipfile

def recuperar_arquivo_txt(arquivo_zip, arquivo_destino):
    """Write the first member of a zip archive to ``arquivo_destino``.

    Only the first entry listed in the archive is read, and its bytes are
    streamed straight into the destination file. Nothing else is unpacked,
    so the current directory is left untouched, and an existing destination
    file is overwritten.

    Args:
        arquivo_zip: Path of the zip archive to read.
        arquivo_destino: Path of the file to create with the member's content.

    Raises:
        IndexError: If the archive has no members.
    """
    import shutil  # local import, as the original block did with `os`

    with zipfile.ZipFile(arquivo_zip, 'r') as zipf:
        # Name of the first entry in the archive
        primeiro_membro = zipf.namelist()[0]
        # Copy just that entry; the previous extractall() unpacked every
        # member into the working directory before renaming one of them.
        with zipf.open(primeiro_membro) as origem, open(arquivo_destino, 'wb') as destino:
            shutil.copyfileobj(origem, destino)

# Example usage
recuperar_arquivo_txt('dataset/Coffee Shop Sales.zip', 'exemplo_recuperado.txt')

In [6]:
import tarfile

def compress_text_file(input_file, output_file):
    """Pack one file into a tar archive.

    The archive is opened in plain ``"w"`` mode, so despite the function
    name the result is an uncompressed tar.

    Args:
        input_file: Path of the file to add; also used as its name inside
            the archive.
        output_file: Path of the tar archive to create.
    """
    with tarfile.open(name=output_file, mode="w") as archive:
        archive.add(input_file, arcname=input_file)

# Example usage
input_txt_file = "exemplo.txt"  # Name of your .txt file
output_tar_file = "exemplo.tar"  # Name of the archive file to create
compress_text_file(input_txt_file, output_tar_file)
print(f"Arquivo {input_txt_file} comprimido em {output_tar_file}.")

Arquivo exemplo.txt comprimido em exemplo.tar.


In [2]:
import tarfile

def decompress_tar_file(input_file, output_dir):
    """Extract every member of a tar archive into ``output_dir``.

    When the running Python provides tar extraction filters, the ``'data'``
    filter is used: members that would land outside ``output_dir`` (absolute
    paths, ``..`` components, escaping links) are refused. It also avoids
    the deprecation warning that an unfiltered ``extractall`` emits on
    Python 3.12 and 3.13. On older interpreters the call is unfiltered, as
    before.

    Args:
        input_file: Path of the tar archive to read.
        output_dir: Directory the members are extracted into.

    Raises:
        tarfile.FilterError: If the ``'data'`` filter is in effect and
            rejects a member.
    """
    with tarfile.open(input_file, "r") as tar:
        # Feature detection recommended by the tarfile documentation for
        # code that must also run where filters do not exist.
        if hasattr(tarfile, 'data_filter'):
            tar.extractall(path=output_dir, filter='data')
        else:
            tar.extractall(path=output_dir)
        print(f"Arquivos extraídos para {output_dir}")

# Example usage
input_tar_file = "dataset/Coffee Shop Sales.tar"  # Name of the .tar file you generated
output_directory = "./test"  # Directory the .txt file will be extracted into
decompress_tar_file(input_tar_file, output_directory)

Arquivos extraídos para ./test


In [1]:
# Text file to inspect; presumably the one extracted from the tar archive
# into ./test by the cell above - confirm.
txt_filepath = './test/dataset/Coffee Shop Sales.txt'

In [5]:
# Print ten lines of the file, starting at the fourth (indices 3 to 12).
# Each line keeps its own newline, so print() leaves a blank line after it.
with open(txt_filepath, 'r') as file:
    lines = file.readlines()
    for i in range(0, 10):
        print(lines[i+3])

int,date,time,int,int,string,int,float,string,string,string

149116

4

213019

63905

2024-09-22 12:55:07

2024-09-22 12:55:07

################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################################

#####################################################################################################################################################################################################################################################################################################################################################################