In [11]:
import re
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

class AresAPI:
    def __init__(self):
        self.base_url = "https://ares.gov.cz/ekonomicke-subjekty-v-be/rest/ekonomicke-subjekty/vyhledat"
        self.session = requests.Session()
        self.cookie = None
        self.cookie_expiry = None

    def refresh_cookie(self):
        response = self.session.get("https://ares.gov.cz/ekonomicke-subjekty")
        if response.status_code == 200:
            self.cookie = self.session.cookies.get("GN-TOKEN-CSP")
            self.cookie_expiry = datetime.now() + timedelta(hours=1)
            print("Cookie refreshed successfully")
        else:
            print(f"Failed to refresh cookie. Status code: {response.status_code}")

    def check_cookie(self):
        if not self.cookie or (self.cookie_expiry and datetime.now() > self.cookie_expiry):
            self.refresh_cookie()

    def search_subjects(self, payload):
        self.check_cookie()
        headers = {
            "Host": "ares.gov.cz",
            "Cookie": f"GN-TOKEN-CSP={self.cookie}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "en-US",
            "User-Agent": "Mozilla/5.0",
            "Origin": "https://ares.gov.cz",
            "Referer": "https://ares.gov.cz/ekonomicke-subjekty"
        }

        max_retries = 3
        for attempt in range(max_retries):
            response = self.session.post(self.base_url, json=payload, headers=headers)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 401:
                print("Unauthorized. Refreshing cookie and retrying...")
                self.refresh_cookie()
            else:
                print(f"Error: {response.status_code}. Retrying in 5 seconds...")
                import time
                time.sleep(5)

        print(f"Failed to get data after {max_retries} attempts")
        return None

def extract_subject_data(result, address):
    subjects = result.get('ekonomickeSubjekty', [])
    return [{'Name': subject.get('obchodniJmeno', ''), 
             'IČO': subject.get('ico', ''),
             'Address': address} for subject in subjects]

def format_address(payload):
    sidlo = payload['sidlo']
    address = f"{sidlo['cisloDomovni']}"
    if 'cisloOrientacni' in sidlo:
        address += f"/{sidlo['cisloOrientacni']}"
    if 'cisloOrientacniPismeno' in sidlo:
        address += sidlo['cisloOrientacniPismeno']
    return address

api = AresAPI()

payloads = [
    {"sidlo":{"cisloDomovni":1442,"cisloOrientacni":1,"cisloOrientacniPismeno":"b","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1422,"cisloOrientacni":1,"cisloOrientacniPismeno":"a","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1138,"cisloOrientacni":1,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":449661},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1552,"cisloOrientacni":58,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":456225},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1525,"cisloOrientacni":1,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":717592},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1461,"cisloOrientacni":2,"cisloOrientacniPismeno":"a","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1481,"cisloOrientacni":4,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1559,"cisloOrientacni":5,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":730700},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1561,"cisloOrientacni":4,"cisloOrientacniPismeno":"a","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1448,"cisloOrientacni":7,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":717592},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1449,"cisloOrientacni":9,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":717592},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1100,"cisloOrientacni":2,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":266,"cisloOrientacni":2,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":730700},"pocet":200,"start":0,"razeni":[]}
    
    # New payload added here
]

all_subjects = []

for payload in payloads:
    address = format_address(payload)
    result = api.search_subjects(payload)
    if result:
        subjects = extract_subject_data(result, address)
        all_subjects.extend(subjects)
        print(f"Found {len(subjects)} subjects for address {address}")
    else:
        print(f"No data retrieved for address {address}")
    time.sleep(1)  # Add a 5-second delay between requests

df_ares = pd.DataFrame(all_subjects)
print(df_ares)

Cookie refreshed successfully
Found 5 subjects for address 1442/1b
Found 10 subjects for address 1422/1a
Found 14 subjects for address 1138/1
Found 11 subjects for address 1552/58
Found 58 subjects for address 1525/1
Found 12 subjects for address 1461/2a
Found 16 subjects for address 1481/4
Found 13 subjects for address 1559/5
Found 8 subjects for address 1561/4a
Found 2 subjects for address 1448/7
Found 22 subjects for address 1449/9
Found 5 subjects for address 1100/2
Found 10 subjects for address 266/2
                                             Name       IČO  Address
0                         Nadace MONETA Clementia  10730443  1442/1b
1                         MONETA Money Bank, a.s.  25672720  1442/1b
2                MONETA Stavební Spořitelna, a.s.  47115289  1442/1b
3                             MONETA Auto, s.r.o.  60112743  1442/1b
4                          MONETA Leasing, s.r.o.  60751606  1442/1b
..                                            ...       ...      ...
181  O

In [12]:
df_ares.to_csv("AKTUALNI_ares_data.csv", index=False)

In [49]:
df_ares_modified

Unnamed: 0,Name,IČO,Address
0,Nadace MONETA Clementia,10730443,1442/1b
1,"MONETA Money Bank, a.s.",25672720,1442/1b
2,"MONETA Stavební Spořitelna, a.s.",47115289,1442/1b
3,"MONETA Auto, s.r.o.",60112743,1442/1b
4,"MONETA Leasing, s.r.o.",60751606,1442/1b
...,...,...,...
181,"O2 IT Services s.r.o., odštěpný závod mluvii",21752109,266/2
182,Nadace O2,26700000,266/2
183,INTENS Corporation s.r.o.,28435575,266/2
184,O2 Czech Republic a.s.,60193336,266/2


In [5]:

# Save to CSV
df.to_csv('subjects_data.csv', index=False)
print("Data saved to subjects_data.csv")
print("Comparing the files to find any differences...")


# Load the CSV file
file_path = 'sidla_nas_prehled.csv'
nas_originalni_prehled = pd.read_csv(file_path, delimiter=';')

# Load the data from the API (assuming it's stored in a DataFrame called 'df')
# If it's not already in a DataFrame, we need to create it first
#df = pd.DataFrame(all_subjects)

# Function to clean and standardize company names

#def clean_name(name):
 #   name = str(name).lower().strip()
  #  name = re.sub(r'[^\w\s]', '', name)  # Remove punctuation
   # name = re.sub(r'\s+', ' ', name)     # Replace multiple spaces with single space
    #return name

# Clean company names in both DataFrames

#nas_originalni_prehled['Název_clean'] = nas_originalni_prehled['Název'].apply(clean_name)
#df['Name_clean'] = df['Name'].apply(clean_name)

# Ensure IČO is treated as string in both DataFrames
nas_originalni_prehled['IČO'] = nas_originalni_prehled['IČO'].astype(str)
df['IČO'] = df['IČO'].astype(str)

# Find companies in the CSV file that are not in the API data
in_csv_not_in_ares = nas_originalni_prehled[~nas_originalni_prehled['IČO'].isin(df['IČO'])]

# Find companies in the API data that are not in the CSV file
platne_ares_export = df[~df['IČO'].isin(nas_originalni_prehled['IČO'])]

# Find companies with matching IČO but different names
matching_ico_diff_name = pd.merge(nas_originalni_prehled, df, on='IČO', how='inner')
matching_ico_diff_name = matching_ico_diff_name[matching_ico_diff_name['Název'] != matching_ico_diff_name['Name']]

# Additional check: Find companies with matching names but different IČO
matching_name_diff_ico = pd.merge(nas_originalni_prehled, df, left_on='Název', right_on='Name', how='inner')
matching_name_diff_ico = matching_name_diff_ico[matching_name_diff_ico['IČO_x'] != matching_name_diff_ico['IČO_y']]

# Save the results to CSV files
in_csv_not_in_ares.to_csv('in_csv_not_in_ares.csv', index=False)
platne_ares_export.to_csv('platne_ares_export.csv', index=False)
matching_ico_diff_name.to_csv('matching_ico_diff_name.csv', index=False)
matching_name_diff_ico.to_csv('matching_name_diff_ico.csv', index=False)

# Print summary and examples
print(f"Total companies in original CSV: {len(nas_originalni_prehled)}")
print(f"Total companies in API data: {len(df)}")
print(f"Companies in CSV but not in API: {len(in_csv_not_in_ares)}")
print(f"Companies in API but not in CSV: {len(platne_ares_export)}")
print(f"Companies with matching IČO but different names: {len(matching_ico_diff_name)}")
print(f"Companies with matching names but different IČO: {len(matching_name_diff_ico)}")

print("\nExamples of companies in CSV but not in Ares:")
print(in_csv_not_in_ares[['IČO', 'Název']].head())

print("\nExamples of companies in Ares but not in CSV:")
print(platne_ares_export[['IČO', 'Name']].head())

print("\nExamples of companies with matching IČO but different names:")
print(matching_ico_diff_name[['IČO', 'Název', 'Name']].head())

print("\nExamples of companies with matching names but different IČO:")
print(matching_name_diff_ico[['IČO_x', 'IČO_y', 'Název', 'Name']].head())

# Additional data quality checks
print("\nDuplicate IČO in original CSV:")
print(nas_originalni_prehled['IČO'].duplicated().sum())

print("\nDuplicate IČO in API data:")
print(df['IČO'].duplicated().sum())

print("\nMissing values in original CSV:")
print(nas_originalni_prehled.isnull().sum())

print("\nMissing values in API data:")
print(df.isnull().sum())

Data saved to subjects_data.csv
Comparing the files to find any differences...
Total companies in original CSV: 202
Total companies in API data: 186
Companies in CSV but not in API: 74
Companies in API but not in CSV: 61
Companies with matching IČO but different names: 19
Companies with matching names but different IČO: 40

Examples of companies in CSV but not in Ares:
        IČO                            Název
5    175439  G4S Secure Solutions (CZ), a.s.
6   1558676              G4S Services s.r.o.
10  2376211                 NEWPS HOLDING SE
11  3136108           NEWPS MARKETING s.r.o.
12  5205000            INEXAD Fashion s.r.o.

Examples of companies in Ares but not in CSV:
        IČO                             Name
5  00175439  G4S Secure Solutions (CZ), a.s.
6  01558676              G4S Services s.r.o.
7  02376211                 NEWPS HOLDING SE
8  03136108           NEWPS MARKETING s.r.o.
9  05205000            INEXAD Fashion s.r.o.

Examples of companies with matching IČO 

In [6]:
# trying to verify the results
# Load the CSV file
file_path = 'sidla_nas_prehled.csv'
nas_originalni_prehled = pd.read_csv(file_path, delimiter=';')

# Ensure IČO is treated as string in both DataFrames
nas_originalni_prehled['IČO'] = nas_originalni_prehled['IČO'].astype(str).str.strip()
df['IČO'] = df['IČO'].astype(str).str.strip()

# Remove any rows with empty or NaN IČO
nas_originalni_prehled = nas_originalni_prehled.dropna(subset=['IČO'])
df = df.dropna(subset=['IČO'])

# Check for duplicates
print("Duplicate IČO in original CSV:", nas_originalni_prehled['IČO'].duplicated().sum())
print("Duplicate IČO in API data:", df['IČO'].duplicated().sum())

# Remove duplicates if any
nas_originalni_prehled = nas_originalni_prehled.drop_duplicates(subset=['IČO'])
df = df.drop_duplicates(subset=['IČO'])

# Find companies in the CSV file that are not in the API data
in_csv_not_in_ares = nas_originalni_prehled[~nas_originalni_prehled['IČO'].isin(df['IČO'])]

# Find companies in the API data that are not in the CSV file
platne_ares_export = df[~df['IČO'].isin(nas_originalni_prehled['IČO'])]

# Print summary
print(f"\nTotal companies in original CSV: {len(nas_originalni_prehled)}")
print(f"Total companies in API data: {len(df)}")
print(f"Companies in CSV but not in API: {len(in_csv_not_in_ares)}")
print(f"Companies in API but not in CSV: {len(platne_ares_export)}")

# Verify the results
total_diff = len(in_csv_not_in_ares) + len(platne_ares_export)
expected_diff = abs(len(nas_originalni_prehled) - len(df))
print(f"\nTotal differences: {total_diff}")
print(f"Expected difference: {expected_diff}")

if total_diff == expected_diff:
    print("The results are consistent.")
else:
    print("There's still an inconsistency in the results.")
    
    # Additional diagnostics
    all_ico = set(nas_originalni_prehled['IČO']).union(set(df['IČO']))
    for ico in all_ico:
        in_csv = ico in set(nas_originalni_prehled['IČO'])
        in_api = ico in set(df['IČO'])
        if in_csv != (not in_api):
            print(f"Inconsistency found for IČO {ico}: In CSV: {in_csv}, In API: {in_api}")

Duplicate IČO in original CSV: 12
Duplicate IČO in API data: 1

Total companies in original CSV: 190
Total companies in API data: 185
Companies in CSV but not in API: 66
Companies in API but not in CSV: 61

Total differences: 127
Expected difference: 5
There's still an inconsistency in the results.
Inconsistency found for IČO 28200250: In CSV: True, In API: True
Inconsistency found for IČO 17070805: In CSV: True, In API: True
Inconsistency found for IČO 28200870: In CSV: True, In API: True
Inconsistency found for IČO 27452751: In CSV: True, In API: True
Inconsistency found for IČO 15272028: In CSV: True, In API: True
Inconsistency found for IČO 28200799: In CSV: True, In API: True
Inconsistency found for IČO 28199588: In CSV: True, In API: True
Inconsistency found for IČO 60456639: In CSV: True, In API: True
Inconsistency found for IČO 28444914: In CSV: True, In API: True
Inconsistency found for IČO 27080951: In CSV: True, In API: True
Inconsistency found for IČO 14030390: In CSV: True

In [8]:
## ALTERNATIVE APPROACH
import pandas as pd
import numpy as np

# Load the original CSV file
original_df = pd.read_csv('sidla_nas_prehled.csv', delimiter=';')

# Assuming df is already created from all_subjects
# df = pd.DataFrame(all_subjects)

# Ensure IČO is treated as string in both DataFrames and remove any leading/trailing whitespace
original_df['IČO'] = original_df['IČO'].astype(str).str.strip()
df['IČO'] = df['IČO'].astype(str).str.strip()

# Function to clean company names
def clean_name(name):
    return ' '.join(str(name).lower().split())

# Clean and standardize company names
original_df['Název_clean'] = original_df['Název'].apply(clean_name)
df['Name_clean'] = df['Name'].apply(clean_name)

# Merge DataFrames on IČO to compare all fields
merged_df = pd.merge(original_df, df, on='IČO', how='outer', indicator=True)

# Companies in original CSV but not in API data
in_csv_not_in_api = merged_df[merged_df['_merge'] == 'left_only']

# Companies in API data but not in original CSV
in_api_not_in_csv = merged_df[merged_df['_merge'] == 'right_only']

# Companies with matching IČO but different names
matching_ico_diff_name = merged_df[
    (merged_df['_merge'] == 'both') & 
    (merged_df['Název_clean'] != merged_df['Name_clean'])
]

# Print summary
print(f"Total companies in original CSV: {len(original_df)}")
print(f"Total companies in API data: {len(df)}")
print(f"Companies in CSV but not in API: {len(in_csv_not_in_api)}")
print(f"Companies in API but not in CSV: {len(in_api_not_in_csv)}")
print(f"Companies with matching IČO but different names: {len(matching_ico_diff_name)}")

# Save results to CSV files
in_csv_not_in_api[['IČO', 'Název']].to_csv('in_csv_not_in_api.csv', index=False)
in_api_not_in_csv[['IČO', 'Name']].to_csv('in_api_not_in_csv.csv', index=False)
matching_ico_diff_name[['IČO', 'Název', 'Name']].to_csv('matching_ico_diff_name.csv', index=False)

# Display a few examples from each category
print("\nExamples of companies in CSV but not in API:")
print(in_csv_not_in_api[['IČO', 'Název']].head())

print("\nExamples of companies in API but not in CSV:")
print(in_api_not_in_csv[['IČO', 'Name']].head())

print("\nExamples of companies with matching IČO but different names:")
print(matching_ico_diff_name[['IČO', 'Název', 'Name']].head())

# Additional data quality checks
print("\nDuplicate IČO in original CSV:")
print(original_df['IČO'].duplicated().sum())

print("\nDuplicate IČO in API data:")
print(df['IČO'].duplicated().sum())

print("\nMissing values in original CSV:")
print(original_df.isnull().sum())

print("\nMissing values in API data:")
print(df.isnull().sum())

Total companies in original CSV: 202
Total companies in API data: 186
Companies in CSV but not in API: 73
Companies in API but not in CSV: 61
Companies with matching IČO but different names: 18

Examples of companies in CSV but not in API:
           IČO                      Název
5   027 38 252           Nej Kanál s.r.o.
11    03623068          Půdy Libeň s.r.o.
49     1051717  MUDr. Monika Vondráková\t
63     1519506    Affirmed Networks Czech
66     1558676        G4S Services s.r.o.

Examples of companies in API but not in CSV:
        IČO                             Name
0  00175439  G4S Secure Solutions (CZ), a.s.
1  01051717          MUDr. Monika Vondráková
2  01558676              G4S Services s.r.o.
3  01838989      Accalio s.r.o., v likvidaci
4  02376211                 NEWPS HOLDING SE

Examples of companies with matching IČO but different names:
          IČO                                              Název  \
55   10988157  Český Gastronomický Institut servisní s.r.o./F.

In [9]:
in_api_not_in_csv

Unnamed: 0,Building,IČO,Název,Adresa,Unnamed: 4,Název_clean,Name,Address,Name_clean,_merge
0,,00175439,,,,,"G4S Secure Solutions (CZ), a.s.",1422/1a,"g4s secure solutions (cz), a.s.",right_only
1,,01051717,,,,,MUDr. Monika Vondráková,1138/1,mudr. monika vondráková,right_only
2,,01558676,,,,,G4S Services s.r.o.,1422/1a,g4s services s.r.o.,right_only
3,,01838989,,,,,"Accalio s.r.o., v likvidaci",1481/4,"accalio s.r.o., v likvidaci",right_only
4,,02376211,,,,,NEWPS HOLDING SE,1422/1a,newps holding se,right_only
...,...,...,...,...,...,...,...,...,...,...
125,,26196581,,,,,"ARAMARK - zařízení školního stravování, s.r.o.",1138/1,"aramark - zařízení školního stravování, s.r.o.",right_only
154,,27737586,,,,,Medac Gesellschaft für klinische Spezialpräpar...,1525/1,medac gesellschaft für klinische spezialpräpar...,right_only
155,,27898253,,,,,"OpenMIND Networks, organizační složka",1449/9,"openmind networks, organizační složka",right_only
179,,28238362,,,,,"BRÝLE - ČOČKY, s.r.o.",1100/2,"brýle - čočky, s.r.o.",right_only


In [44]:
# ONLY COMPARE IČO

# Load the original CSV file
file_path = 'MODIFIED_sidla_nas_prehled.csv'
original_df = pd.read_csv(file_path, delimiter=';')

# Load the API response data (assuming it's already in a DataFrame called 'df')
# If it's not, uncomment the following line:
# df = pd.DataFrame(all_subjects)

# Ensure IČO is treated as string in both DataFrames and remove any leading/trailing whitespace
original_df_modified = original_df.copy()
original_df_modified['IČO'] = original_df_modified['IČO'].astype(str).str.strip()
df_ares_modified = df_ares.copy()
df_ares_modified['IČO'] = df_ares_modified['IČO'].astype(str).str.strip()

# Find IČO numbers in the API data that are not in the original CSV
ico_in_api_not_in_csv = df[~df_ares_modified['IČO'].isin(original_df['IČO'])]

# Select only the IČO and Name columns
ico_in_api_not_in_csv = ico_in_api_not_in_csv[['IČO', 'Name']]

# Save the results to a CSV file
ico_in_api_not_in_csv.to_csv('ico_in_api_not_in_original_csv.csv', index=False)

# Print summary
print(f"Total IČO numbers in original CSV: {len(original_df)}")
print(f"Total IČO numbers in API data: {len(df)}")
print(f"IČO numbers in API but not in original CSV: {len(ico_in_api_not_in_csv)}")

# Display a few examples
print("\nExamples of IČO numbers in API but not in original CSV:")
print(ico_in_api_not_in_csv.head())

print("\nResults saved to 'ico_in_api_not_in_original_csv.csv'")

Total IČO numbers in original CSV: 202
Total IČO numbers in API data: 186
IČO numbers in API but not in original CSV: 61

Examples of IČO numbers in API but not in original CSV:
        IČO                             Name
5  00175439  G4S Secure Solutions (CZ), a.s.
6  01558676              G4S Services s.r.o.
7  02376211                 NEWPS HOLDING SE
8  03136108           NEWPS MARKETING s.r.o.
9  05205000            INEXAD Fashion s.r.o.

Results saved to 'ico_in_api_not_in_original_csv.csv'


In [30]:
df_ares_modified

Unnamed: 0,Name,IČO,Address
0,Nadace MONETA Clementia,10730443,1442/1b
1,"MONETA Money Bank, a.s.",25672720,1442/1b
2,"MONETA Stavební Spořitelna, a.s.",47115289,1442/1b
3,"MONETA Auto, s.r.o.",60112743,1442/1b
4,"MONETA Leasing, s.r.o.",60751606,1442/1b
...,...,...,...
181,"O2 IT Services s.r.o., odštěpný závod mluvii",21752109,266/2
182,Nadace O2,26700000,266/2
183,INTENS Corporation s.r.o.,28435575,266/2
184,O2 Czech Republic a.s.,60193336,266/2


In [34]:
# if the "IČO" number is less than 8 characters, add leading zeros to make it 8 characters long

df_ares_modified['IČO'] = df_ares_modified['IČO'].str.zfill(8)
# remove all thle " characters from the "Name" column
df_ares_modified['Name'] = df_ares_modified['Name'].str.replace('"', '')

original_df_modified['IČO'] = original_df_modified['IČO'].str.zfill(8)
# remove all thle " characters from the "Name" column
original_df_modified['Název'] = original_df_modified['Název'].str.replace('"', '')

In [38]:
original_df_modified

Unnamed: 0,Building,IČO,Název,Adresa,Unnamed: 4
0,A,25672720,"MONETA Money Bank, a.s.","Praha 4, Michle, Vyskočilova 1442/1b",
1,A,47115289,"MONETA Stavební Spořitelna, a.s.","Praha 4, Michle, Vyskočilova 1442/1b",
2,A,60112743,"MONETA Auto, s.r.o.","Praha 4, Michle, Vyskočilova 1442/1b",
3,A,60751606,"MONETA Leasing, s.r.o.","Praha 4, Michle, Vyskočilova 1442/1b",
4,A,10730443,Nadace MONETA Clementia,"Praha 4, Michle, Vyskočilova 1442/1b",
...,...,...,...,...,...
197,Villas,00000nan,DFG Hotel Management Praha,"Praha 4, Michle, Želetavská 1449/9",
198,D,01051717,MUDr. Monika Vondráková\t,"Jemnická 1138/1, Michle, 14000 Praha 4",
199,D,10988157,"FBF Czech, s.r.o.","Jemnická 1138/1, Michle, 14000 Praha 4",
200,Filadelfie,03623068,Půdy Libeň s.r.o.,Želetavská 1525/1,


In [45]:
ico_in_api_not_in_csv = df[~df_ares_modified['IČO'].isin(original_df_modified['IČO'])]

# Select only the IČO and Name columns
ico_in_api_not_in_csv = ico_in_api_not_in_csv[['IČO', 'Name']]

# Save the results to a CSV file
ico_in_api_not_in_csv.to_csv('ico_in_api_not_in_original_csv.csv', index=False)

# Print summary
print(f"Total IČO numbers in original CSV: {len(original_df_modified)}")
print(f"Total IČO numbers in API data: {len(df)}")
print(f"IČO numbers in API but not in original CSV: {len(ico_in_api_not_in_csv)}")

# Display a few examples
print("\nExamples of IČO numbers in API but not in original CSV:")
print(ico_in_api_not_in_csv.head())

print("\nResults saved to 'ico_in_api_not_in_original_csv.csv'")

Total IČO numbers in original CSV: 202
Total IČO numbers in API data: 186
IČO numbers in API but not in original CSV: 61

Examples of IČO numbers in API but not in original CSV:
        IČO                             Name
5  00175439  G4S Secure Solutions (CZ), a.s.
6  01558676              G4S Services s.r.o.
7  02376211                 NEWPS HOLDING SE
8  03136108           NEWPS MARKETING s.r.o.
9  05205000            INEXAD Fashion s.r.o.

Results saved to 'ico_in_api_not_in_original_csv.csv'


In [42]:
ico_in_api_not_in_csv

Unnamed: 0,IČO,Name
20,21842469,GreatIT s.r.o.
23,26196581,"ARAMARK - zařízení školního stravování, s.r.o."
48,7575696,We hate ironing s.r.o.
77,27737586,Medac Gesellschaft für klinische Spezialpräpar...
94,63984211,Emerson Automation Fluid Control & Pneumatics ...
100,19333650,"Grid Design, s.r.o."
133,19568053,SYNTECA a.s.
142,18631975,"ELECTROLUX, s.r.o."
154,8757461,AETON Reventures s.r.o.
156,11729295,Straight North s.r.o.


In [47]:
## FULL WORKING CODE

import re
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

class AresAPI:
    def __init__(self):
        self.base_url = "https://ares.gov.cz/ekonomicke-subjekty-v-be/rest/ekonomicke-subjekty/vyhledat"
        self.session = requests.Session()
        self.cookie = None
        self.cookie_expiry = None

    def refresh_cookie(self):
        response = self.session.get("https://ares.gov.cz/ekonomicke-subjekty")
        if response.status_code == 200:
            self.cookie = self.session.cookies.get("GN-TOKEN-CSP")
            self.cookie_expiry = datetime.now() + timedelta(hours=1)
            print("Cookie refreshed successfully")
        else:
            print(f"Failed to refresh cookie. Status code: {response.status_code}")

    def check_cookie(self):
        if not self.cookie or (self.cookie_expiry and datetime.now() > self.cookie_expiry):
            self.refresh_cookie()

    def search_subjects(self, payload):
        self.check_cookie()
        headers = {
            "Host": "ares.gov.cz",
            "Cookie": f"GN-TOKEN-CSP={self.cookie}",
            "Content-Type": "application/json",
            "Accept": "application/json, text/plain, */*",
            "Accept-Language": "en-US",
            "User-Agent": "Mozilla/5.0",
            "Origin": "https://ares.gov.cz",
            "Referer": "https://ares.gov.cz/ekonomicke-subjekty"
        }

        max_retries = 3
        for attempt in range(max_retries):
            response = self.session.post(self.base_url, json=payload, headers=headers)
            if response.status_code == 200:
                return response.json()
            elif response.status_code == 401:
                print("Unauthorized. Refreshing cookie and retrying...")
                self.refresh_cookie()
            else:
                print(f"Error: {response.status_code}. Retrying in 5 seconds...")
                import time
                time.sleep(5)

        print(f"Failed to get data after {max_retries} attempts")
        return None

def extract_subject_data(result, address):
    subjects = result.get('ekonomickeSubjekty', [])
    return [{'Name': subject.get('obchodniJmeno', ''), 
             'IČO': subject.get('ico', ''),
             'Address': address} for subject in subjects]

def format_address(payload):
    sidlo = payload['sidlo']
    address = f"{sidlo['cisloDomovni']}"
    if 'cisloOrientacni' in sidlo:
        address += f"/{sidlo['cisloOrientacni']}"
    if 'cisloOrientacniPismeno' in sidlo:
        address += sidlo['cisloOrientacniPismeno']
    return address

api = AresAPI()

payloads = [
    {"sidlo":{"cisloDomovni":1442,"cisloOrientacni":1,"cisloOrientacniPismeno":"b","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1422,"cisloOrientacni":1,"cisloOrientacniPismeno":"a","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1138,"cisloOrientacni":1,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":449661},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1552,"cisloOrientacni":58,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":456225},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1525,"cisloOrientacni":1,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":717592},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1461,"cisloOrientacni":2,"cisloOrientacniPismeno":"a","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1481,"cisloOrientacni":4,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1559,"cisloOrientacni":5,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":730700},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1561,"cisloOrientacni":4,"cisloOrientacniPismeno":"a","kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1448,"cisloOrientacni":7,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":717592},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1449,"cisloOrientacni":9,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":717592},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":1100,"cisloOrientacni":2,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":478652},"pocet":200,"start":0,"razeni":[]},
    {"sidlo":{"cisloDomovni":266,"cisloOrientacni":2,"kodObce":554782,"kodMestskeCastiObvodu":500119,"kodUlice":730700},"pocet":200,"start":0,"razeni":[]}
    
    # New payload added here
]

all_subjects = []

for payload in payloads:
    address = format_address(payload)
    result = api.search_subjects(payload)
    if result:
        subjects = extract_subject_data(result, address)
        all_subjects.extend(subjects)
        print(f"Found {len(subjects)} subjects for address {address}")
    else:
        print(f"No data retrieved for address {address}")
    time.sleep(1)  # Add a 5-second delay between requests

df_ares = pd.DataFrame(all_subjects)
print(df_ares)



# ONLY COMPARE IČO

# Load the original CSV file
file_path = 'MODIFIED_sidla_nas_prehled.csv'
original_df = pd.read_csv(file_path, delimiter=';')

# Load the API response data (assuming it's already in a DataFrame called 'df')
# If it's not, uncomment the following line:
# df = pd.DataFrame(all_subjects)

# Ensure IČO is treated as string in both DataFrames and remove any leading/trailing whitespace
original_df_modified = original_df.copy()
original_df_modified['IČO'] = original_df_modified['IČO'].astype(str).str.strip()
df_ares_modified = df_ares.copy()
df_ares_modified['IČO'] = df_ares_modified['IČO'].astype(str).str.strip()

# Find IČO numbers in the API data that are not in the original CSV
ico_in_api_not_in_csv = df[~df_ares_modified['IČO'].isin(original_df['IČO'])]

# Select only the IČO and Name columns
ico_in_api_not_in_csv = ico_in_api_not_in_csv[['IČO', 'Name']]

# Save the results to a CSV file
ico_in_api_not_in_csv.to_csv('ico_in_api_not_in_original_csv.csv', index=False)

# Print summary
print(f"Total IČO numbers in original CSV: {len(original_df)}")
print(f"Total IČO numbers in API data: {len(df)}")
print(f"IČO numbers in API but not in original CSV: {len(ico_in_api_not_in_csv)}")

# Display a few examples
print("\nExamples of IČO numbers in API but not in original CSV:")
print(ico_in_api_not_in_csv.head())

print("\nResults saved to 'ico_in_api_not_in_original_csv.csv'")


# if the "IČO" number is less than 8 characters, add leading zeros to make it 8 characters long

df_ares_modified['IČO'] = df_ares_modified['IČO'].str.zfill(8)
# remove all thle " characters from the "Name" column
df_ares_modified['Name'] = df_ares_modified['Name'].str.replace('"', '')

original_df_modified['IČO'] = original_df_modified['IČO'].str.zfill(8)
# remove all thle " characters from the "Name" column
original_df_modified['Název'] = original_df_modified['Název'].str.replace('"', '')



ico_in_api_not_in_csv = df[~df_ares_modified['IČO'].isin(original_df_modified['IČO'])]

# Select only the IČO and Name columns
ico_in_api_not_in_csv = ico_in_api_not_in_csv[['IČO', 'Name']]

# Save the results to a CSV file
ico_in_api_not_in_csv.to_csv('ico_in_api_not_in_original_csv.csv', index=False)

# Print summary
print(f"Total IČO numbers in original CSV: {len(original_df_modified)}")
print(f"Total IČO numbers in API data: {len(df)}")
print(f"IČO numbers in API but not in original CSV: {len(ico_in_api_not_in_csv)}")

# Display a few examples
print("\nExamples of IČO numbers in API but not in original CSV:")
print(ico_in_api_not_in_csv.head())

print("\nResults saved to 'ico_in_api_not_in_original_csv.csv'")



Cookie refreshed successfully
Found 5 subjects for address 1442/1b
Found 10 subjects for address 1422/1a
Found 14 subjects for address 1138/1
Found 11 subjects for address 1552/58
Found 58 subjects for address 1525/1
Found 12 subjects for address 1461/2a
Found 16 subjects for address 1481/4
Found 13 subjects for address 1559/5
Found 8 subjects for address 1561/4a
Found 2 subjects for address 1448/7
Found 22 subjects for address 1449/9
Found 5 subjects for address 1100/2
Found 10 subjects for address 266/2
                                             Name       IČO  Address
0                         Nadace MONETA Clementia  10730443  1442/1b
1                         MONETA Money Bank, a.s.  25672720  1442/1b
2                MONETA Stavební Spořitelna, a.s.  47115289  1442/1b
3                             MONETA Auto, s.r.o.  60112743  1442/1b
4                          MONETA Leasing, s.r.o.  60751606  1442/1b
..                                            ...       ...      ...
181  O