# Utility library

The **CSV** class parses a CSV file and dumps its contents into a JSON file.

## Initialization

The class is initialized with the *filepath* of the CSV file. It splits each line into tokens on ',' while also matching '"' characters, so that a comma inside a quoted field does not split the field. If a line is ill-formed (its number of tokens differs from the header's), it is ignored.

## Serialization

The member function **serialize()** creates a Python dictionary from the parsed context. It iterates through the header of the CSV file and, for each header token, collects all non-empty cells belonging to that column into a list. When written by **to_json()**, each list becomes an object keyed by position. For example, <code>'{h1:{0:a, 1:b}, h2:{0:c, 1:d}}'</code>.

## Jsonizing

The member function **to_json(filepath)** creates a JSON file that stores the serialized dictionary (in this case, obtained from a CSV file).

In [1]:
class CSV:
    """Quote-aware CSV reader that can dump its columns to a JSON file.

    Attributes:
        header: Column names taken from the first line, split on ','. The
            last name keeps its trailing newline until serialize() runs.
        context: One list of raw tokens per well-formed data line. Tokens
            are stored verbatim: surrounding double quotes and the line's
            trailing newline are NOT removed at parse time.
        n_lines: Number of data lines read, well-formed or not.
    """

    def __init__(self, filepath):
        """Parse the CSV file at *filepath*.

        A data line is kept only when it splits into exactly as many tokens
        as the header; every other line is silently ignored.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        self.header = []
        self.context = []
        self.n_lines = 0

        with open(filepath, 'r', encoding='utf-8') as file:
            self.header = file.readline().split(',')

            for line in file:
                self.n_lines += 1  # counts every data line, kept or not
                tokens = self._split_line(line)
                # A token count that differs from the header marks the line
                # as ill-formed, so it is dropped.
                if len(tokens) == len(self.header):
                    self.context.append(tokens)

    @staticmethod
    def _split_line(line):
        """Split one raw line on commas that lie outside double quotes.

        A candidate token holding an odd number of '"' ends inside a quoted
        field, so it is extended to the next comma until the quotes balance.
        If no further '"' exists on the line, the quote is treated as a
        literal character and the token is accepted as is.
        """
        length = len(line)
        tokens = []
        beg = 0
        while beg < length:
            end = line.find(',', beg)
            if end == -1:
                end = length

            quotes = line.count('"', beg, end)
            while quotes % 2 == 1:
                if line.count('"', end) == 0:
                    break  # stray quote: nothing left to close it
                nxt = line.find(',', end + 1)
                if nxt == -1:
                    # No later comma: the quoted field runs to the end of
                    # the line. (Using -1 as an index here would mis-slice
                    # the token and restart parsing from position 0.)
                    nxt = length
                # Accumulate the count so the parity check always reflects
                # the whole token [beg, nxt).
                quotes += line.count('"', end, nxt)
                end = nxt

            tokens.append(line[beg:end])
            beg = end + 1
        return tokens

    def serialize(self):
        """Return the parsed rows as a ``{column name: [cells]}`` dictionary.

        Newlines are removed from header names (``self.header`` is updated
        in place) and from cells. Cells that are empty strings are skipped,
        so a position inside a column's list is not necessarily the row
        number.
        """
        self.header[:] = [name.replace('\n', '') for name in self.header]
        dictionary = {name: [] for name in self.header}

        for row in self.context:
            for name, token in zip(self.header, row):
                # NOTE: emptiness is tested before the newline is stripped,
                # so a last-column cell holding only '\n' is stored as ''.
                if len(token) > 0:
                    dictionary[name].append(token.replace('\n', ''))

        return dictionary

    def to_json(self, filepath):
        """Write the serialized columns to *filepath* as compact JSON.

        Each column becomes an object keyed by the cell's position in the
        column, e.g. ``{"h1":{"0":"a","1":"b"}}``. Nothing is written when
        *filepath* is an empty string.
        """
        import json  # local import: the file has no top-level import block

        dictionary = self.serialize()
        payload = {
            key: {str(j): value for j, value in enumerate(values)}
            for key, values in dictionary.items()
        }
        # json.dumps escapes quotes, backslashes and control characters that
        # plain string concatenation would emit raw (yielding invalid JSON).
        # ensure_ascii=False and the compact separators keep the previous
        # on-disk layout for values that need no escaping.
        string = json.dumps(payload, ensure_ascii=False, separators=(',', ':'))

        if len(filepath) > 0:
            with open(filepath, 'w', encoding='utf-8') as file:
                file.write(string)

In [2]:
# Parse the training CSV, report how many data lines were kept
# (len(context)) out of all data lines read (n_lines), then export to JSON.
train = CSV('train.csv')
print('Number of parsed lines:',len(train.context),'out of',train.n_lines)
train.to_json('train.json')

# Same steps for the test CSV.
# NOTE(review): ' out of' below starts with a space, and print() already
# separates arguments with one, so this message shows a double space.
test = CSV('test.csv')
print('Number of parsed lines:',len(test.context),' out of',test.n_lines)
test.to_json('test.json')

Number of parsed lines: 12104 out of 12120
Number of parsed lines: 5186  out of 5195
