In [1]:
import os
import re
import glob
import sqlite3
import pandas as pd
import numpy
import json
from unidecode import unidecode
from functools import lru_cache
import uuid
from Levenshtein import distance

In [2]:
# https://data.gov.ro/dataset/firme-inregistrate-la-registrul-comertului-pana-la-data-de-07-iulie-2024
csv_paths = glob.glob("firme-inregistrate-pana-la-07-iulie-2024/*")

In [3]:
# csv_nomeclator = csv_paths[1]
# nomeclator_df = pd.read_csv(csv_nomeclator, sep="|", encoding='latin2', encoding_errors='replace', on_bad_lines='skip')
# lista_nomeclator = nomeclator_df["COD"].tolist()

In [4]:
csvpath = csv_paths[3]
csvpath

'firme-inregistrate-pana-la-07-iulie-2024/4firme_radiate_cu_sediu_2024-07-07.bigcsv'

In [5]:
with open("localitati.json", "r") as f:
    localitati = json.load(f)

In [6]:
toate_judetele = list({i["judet"] for i in localitati})
toate_judetele_pattern = f"{'|'.join([t.replace(" ", "-") for t in toate_judetele])}"

In [7]:
def get_cui(line):
    
    cui = ""

    cui_8 = re.search(r"\d{8}", line)
    if cui_8:
        cui = cui_8.group(0)

    if not cui:
        cui_srl = re.search(r"SRL.*(\d+).J\d+", line)
        if cui_srl:
            cui = cui_srl.group(0)

    if not cui:
        cui_pfa = re.search(r"PERSOANA FIZICA AUTORIZATA.(\d+).F\d+", line)
        if cui_pfa:
            cui = cui_pfa.group(0)
    
    if not cui:
        cui_pfa_short = re.search(r"PFA.(\d+).F\d+", line)
        if cui_pfa_short:
            cui = cui_pfa_short.group(0)

    return cui

In [8]:
def get_nr_reg_com(line):
    
    nr_reg_com = re.search(r"ROONRC.*/\d+/\d\d\d\d", line)
    if nr_reg_com:
        nr_reg_com = nr_reg_com.group(0).replace("ROONRC.", "")
    else:
        nr_reg_com = ""

    return nr_reg_com
    

In [9]:
def get_nume(line):
    nume = ""
    
    nume_srl = re.search(r".* SRL", line)
    if nume_srl:
        nume = nume_srl.group(0)

    if not nume:
        nume_pfa_long = re.search(r".* PERSOANA FIZICA AUTORIZATA", line)
        if nume_pfa_long:
            nume = nume_pfa_long.group(0)

    if not nume:
        nume_pfa_short = re.search(r".* PFA", line)
        if nume_pfa_short:
            nume = nume_pfa_short.group(0)
    

    return nume

In [17]:
def get_code_location(adresa):
    
    # Get judet
    latest_match_loc = None
    latest_match_com = None
    latest_match_jud = None
    for loc in localitati:    
        pattern_judet = re.sub(r'\s+', r'[-\\s]+', loc["judet"].replace("-", " "))
        match_judet = re.search(pattern_judet, adresa, re.IGNORECASE)
        if not match_judet: continue
        latest_match_jud = loc
            
        if loc["comuna"]:
            match_comuna = re.search(loc["comuna"], adresa, re.IGNORECASE)
            if not match_comuna: continue
            latest_match_com = loc
        
        localitate = loc["localitate"]
        if "SECTOR" in localitate:
            localitate = localitate.replace("SECTOR", "SECTORUL")
            
        match_localitate = re.search(localitate, adresa, re.IGNORECASE)
        if match_localitate: 
            latest_match_loc = loc
            
    # print("judet", latest_match_jud)
    # print("comuna", latest_match_com)
    # print("localitate", latest_match_loc)
    
    if latest_match_loc:
        return latest_match_loc
    
    if latest_match_com:
        return latest_match_com
    
    if latest_match_jud:
        return latest_match_jud

    return None

In [18]:
def get_adresa(line, nr_reg_com):
    adresa = ""
    cod = {
        'value': '', 
        'cod': '', 
        'judet': '', 
        'comuna': '', 
        'localitate': ''
    }
    if nr_reg_com:
        adresa = line.split("ROONRC." + nr_reg_com)[1].replace("^", " ").strip()
        adresa = adresa.replace("***localitatea negasita***,", "").replace("***localitatea negasita***", "")
        coduri_nomeclator = adresa.split(" ")[0]
        doar_coduri = all([item.isdigit() for item in coduri_nomeclator.split(",")])
        if doar_coduri:
            adresa = adresa.replace(coduri_nomeclator, "").strip()
        adresa = re.sub(r"\s{2,}", " ", adresa)
        cod = get_code_location(adresa)
    
    return adresa, cod

In [12]:
def get_line_data(line):

    line = unidecode(line.decode("utf-8", errors="replace"))
    
    if "SRL^0^J" in line or "PFA^0^F" in line or "PERSOANA FIZICA AUTORIZATA^0^F" in line:
        return None

    cui = get_cui(line)

    if not cui.isdigit():
        return None
    
    nr_reg_com = get_nr_reg_com(line)
    nume = get_nume(line)
    adresa, cod = get_adresa(line, nr_reg_com)

    row = {
        "cui": cui,
        "nume": nume,
        "nr_reg_com": nr_reg_com, 
        "adresa": adresa,
    }

    if all(row.values()):
        if cod is not None: 
            row = {**row, **cod}
        return row

    return None

In [19]:
%%time

conn = sqlite3.connect('date_firme_nume_cui.db')

chunk_size = 50000
for csvpath in csv_paths:
    if "nomenclator_stari_firma" in csvpath:
        continue

    print(csvpath)
    
    with open(csvpath, "rb") as f:
        lines = f.readlines()

        rows = []
        
        for idx, line in enumerate(lines):
            if idx == 0: continue

            line_data = get_line_data(line)
    
            if line_data:
                rows.append(line_data)
            
            if len(rows) >= chunk_size:
                df = pd.DataFrame.from_records(rows)
                df.set_index('cui', inplace=True)
                df.to_sql('date_firme_nume_cui', conn, if_exists='append', index=True)
                rows = []
                
conn.close()


firme-inregistrate-pana-la-07-iulie-2024/1firme_neradiate_fara_sediu_2024-07-07.csv
firme-inregistrate-pana-la-07-iulie-2024/2firme_radiate_fara_sediu_2024-07-07.csv
firme-inregistrate-pana-la-07-iulie-2024/4firme_radiate_cu_sediu_2024-07-07.bigcsv
firme-inregistrate-pana-la-07-iulie-2024/3firme_neradiate_cu_sediu_2024-07-07_.bigcsv
CPU times: user 1d 6h 43min 55s, sys: 16.5 s, total: 1d 6h 44min 11s
Wall time: 1d 6h 44min 37s
