In [5]:
import numpy as np
import numpy as np
from IPython.display import display
import pandas as pd

class FileDAT:

    def __init__(self, filepath: str):
        self.filepath = filepath
        self.file = None

        self.header = pd.DataFrame
        self.x = np.array([])
        self.y = np.array([])
        
        self.content = ""
        self.name = ""
        self.num_rows = 0
        self.num_attr = 0

    def _extract_data(self, raw_data: list[str]):
        data = np.array([x.replace("\n", "").split(",") for x in raw_data])
        self.x = data[:,:-1]
        #Para ser possível concatenar (hstack) x e y adicionar uma dimensão ao y
        self.y = np.expand_dims(data[:,-1], 1)

    def _extract_header(self, raw_header: list[list[str]]):
        self.header = pd.DataFrame(raw_header, columns=["Attribute Name", "Domain", "Range"])

    def get_data(self):
        if not (self.file and self.content) or (self.num_rows == 0):
            raise AttributeError("File is not open")

        return np.hstack((self.x, self.y))

    def open(self):
        self.file = open(self.filepath, 'r')
        self.content = self.file.readlines()

    def interpret(self):
        if not (self.file and self.content):
            raise AttributeError("File is not open")
        
        self.name = self.content[0].split()[-1]
        raw_header: list[list[str]] = []

        for i, line in enumerate(self.content):
            if ("@attribute" in line):
                self.num_attr += 1
                line_aux = line.replace(", ", ",").replace("@attribute", "").split()
                
                if(len(line_aux) < 3):
                    line_aux.insert(1, "categorical")

                raw_header.append(line_aux)
        
            if("@data" in line):
                self.num_rows = len(self.content) - i - 1
                self._extract_data(self.content[i + 1:])
                break

        self._extract_header(raw_header)
        self.file.close()

    def info(self):
        if(self.num_attr == 0):
            raise ValueError("File was not interpreted")
            
        print(f"Name of the dataset: {self.name}")
        display(self.header)
        print(f"Number of features: {self.num_attr - 1}")
        print(f"Number of instances: {self.num_rows}")
        print(f"Number of classes: {len(np.unique(self.y))}")

    def to_csv(self, path: str):
        if(self.num_attr == 0):
            raise ValueError("File was not interpreted")

        csv_format = np.vstack((np.array(self.header["Attribute Name"]), self.get_data()))
        np.savetxt(path, np.array(csv_format),  delimiter = ",", fmt="%s")

        return csv_format


In [6]:
dat = FileDAT("./datasets/german.dat")
dat.open()
dat.interpret()
print(dat.get_data())
dat.info()
dat.to_csv("./datasets/dataset.csv")

[['A11' ' 6' ' A34' ... ' A192' ' A201' ' 1']
 ['A12' ' 48' ' A32' ... ' A191' ' A201' ' 2']
 ['A14' ' 12' ' A34' ... ' A191' ' A201' ' 1']
 ...
 ['A12' ' 60' ' A32' ... ' A191' ' A201' ' 2']
 ['A14' ' 24' ' A32' ... ' A191' ' A201' ' 2']
 ['A12' ' 30' ' A34' ... ' A191' ' A201' ' 2']]
Name of the dataset: german


Unnamed: 0,Attribute Name,Domain,Range
0,StatusAccount,categorical,"{A11,A12,A14,A13}"
1,DurationMonth,integer,"[4,72]"
2,CreditHistory,categorical,"{A34,A32,A33,A30,A31}"
3,Purpose,categorical,"{A43,A46,A42,A40,A41,A49,A44,A45,A410,A48}"
4,CreditAmount,integer,"[250,18424]"
5,SavingsAccount,categorical,"{A65,A61,A63,A64,A62}"
6,EmploymentSince,categorical,"{A75,A73,A74,A71,A72}"
7,InstallmentRate,integer,"[1,4]"
8,StatusAndSex,categorical,"{A93,A92,A91,A94}"
9,Guarantors,categorical,"{A101,A103,A102}"


Number of features: 20
Number of instances: 1000
Number of classes: 2


array([['StatusAccount', 'DurationMonth', 'CreditHistory', ...,
        'Telephone', 'ForeignWorker', 'Customer'],
       ['A11', ' 6', ' A34', ..., ' A192', ' A201', ' 1'],
       ['A12', ' 48', ' A32', ..., ' A191', ' A201', ' 2'],
       ...,
       ['A12', ' 60', ' A32', ..., ' A191', ' A201', ' 2'],
       ['A14', ' 24', ' A32', ..., ' A191', ' A201', ' 2'],
       ['A12', ' 30', ' A34', ..., ' A191', ' A201', ' 2']], dtype=object)