### 1 - Import Libraries and Load Dataset

In [None]:
# 1. Import Libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# 2. Read the raw company extract from the working directory
df = pd.read_csv(filepath_or_buffer="Company.csv")

# 3. Report the frame's dimensions, then its column dtypes and non-null counts
print("📊 Dataset Shape:", df.shape)
df.info()

### 2 - Profiling

In [None]:
# 4. Preview Data
# Display the first rows of the frame (the cell's last expression is what the
# notebook renders).
df.head()

In [None]:
# 5. Summary Statistics (Including Strings)
# include='all' asks describe() to summarise every column rather than only
# the numeric ones.
df.describe(include='all')

In [None]:
# 6. Missing values: count nulls per column and keep only the affected
#    columns, worst first
null_counts = df.isna().sum()
missing = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print("Columns with Missing Values:\n", missing)

In [None]:
# 7. Visualize missing data: one heatmap cell per (row, column) of the
#    boolean null mask
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(df.isnull(), cbar=False, cmap="YlOrRd", ax=ax)
ax.set_title("Missing Values Heatmap")
fig.tight_layout()
plt.show()

In [None]:
# 8. Most frequent company names (candidates for duplicate records)
name_counts = df["CompanyName"].value_counts()
print("Top Repeated Company Names:\n", name_counts.head(10))

# 9. Number of distinct values per column, fewest first
unique_counts = df.nunique().sort_values()
print("Unique Values per Column:\n", unique_counts)

# 10. Spot-check a single column: the ten most common post towns
town_counts = df["RegAddress.PostTown"].value_counts()
print(town_counts.head(10))


### 3 - Cleansing 

In [None]:
# 1. Remove columns that hold no data at all
df = df.dropna(axis=1, how='all')

# 2. Trim surrounding whitespace from the column labels
df.columns = df.columns.str.strip()

# 3. Trim surrounding whitespace from the values of every object-dtype column
str_cols = df.select_dtypes(include=['object']).columns
for text_col in str_cols:
    df[text_col] = df[text_col].str.strip()

# 4. Cast count-like columns to nullable integers; values that cannot be
#    parsed as numbers become <NA>
cols_to_int = [
    'Accounts.AccountRefDay',
    'Accounts.AccountRefMonth',
    'Mortgages.NumMortCharges',
    'Mortgages.NumMortOutstanding',
    'Mortgages.NumMortPartSatisfied',
    'Mortgages.NumMortSatisfied',
    'LimitedPartnerships.NumGenPartners',
    'LimitedPartnerships.NumLimPartners',
]
for col in [name for name in cols_to_int if name in df.columns]:
    numeric_values = pd.to_numeric(df[col], errors='coerce')
    df[col] = numeric_values.astype('Int64')

# 5. Standardize casing: company names are compared in upper case
if 'CompanyName' in df.columns:
    upper_names = df['CompanyName'].str.upper()
    df['CompanyName'] = upper_names

# 6. Give the two Returns date columns dot-free names
returns_renames = {
    'Returns.NextDueDate': 'ReturnsNextDueDate',
    'Returns.LastMadeUpDate': 'ReturnsLastMadeUpDate',
}
df = df.rename(columns=returns_renames)

# 7. Parse every column whose name contains 'Date' as day/month/year;
#    values that do not fit the format become NaT
date_columns = [col for col in df.columns if 'Date' in col]
for col in date_columns:
    raw_dates = df[col]
    df[col] = pd.to_datetime(raw_dates, format='%d/%m/%Y', errors='coerce')

# 8. Show result
print("Cleansing completed. New shape:", df.shape)


### 4 - Deduplication

In [None]:
# Step 2: Group by CompanyNumber and merge fields
def merge_group(group):
    """Collapse the rows of one group into a single row.

    For every column the first non-null value (in row order) is kept; a
    column that is null in every row stays null.

    Args:
        group: DataFrame holding the rows that share one key. Must contain
            at least one row.

    Returns:
        A Series with one entry per column of ``group``.
    """
    # Back-filling pulls the earliest non-null value of each column up into
    # the first row. A preceding forward-fill never changes the first row,
    # so it is not needed.
    return group.bfill().iloc[0]

# Collapse duplicate CompanyNumber rows into one record per company.
# Only possible when the key column exists.
if 'CompanyNumber' in df.columns:
    # GroupBy.first() keeps, for every column, the first non-null value of
    # each group. That is the same merge rule as merge_group(), but it is
    # evaluated by pandas itself instead of through one Python call per group,
    # and it does not rely on apply() receiving the grouping column.
    # as_index=False keeps CompanyNumber as a regular column on a fresh
    # 0..n-1 index, so no reset_index() is needed afterwards.
    df_unique = df.groupby('CompanyNumber', as_index=False).first()

    print(f"Deduplication done. Reduced from {df.shape[0]} to {df_unique.shape[0]} rows.")
else:
    print("CompanyNumber column not found for deduplication.")

### 4b - Fetch data via REST API and preprocess it

In [None]:
import base64
from requests import get

# The API key is read from the environment so it never lives in the notebook
api_key = os.environ.get("MY_API_KEY")

# Headers sent with every API request; JSON responses are requested
headers = {
    "Accept": "application/json"
}

if api_key:
    # HTTP Basic auth with the key as user name and an empty password:
    # the header value is base64("<key>:")
    auth_string = f"{api_key}:"
    auth_base64 = base64.b64encode(auth_string.encode("utf-8")).decode("utf-8")
    headers["Authorization"] = f"Basic {auth_base64}"
else:
    # Without this guard an unset variable would be formatted into the
    # literal credential "None:" and sent as if it were a real key.
    print("MY_API_KEY is not set; requests will be sent without an Authorization header.")

In [None]:
import requests
import time

# Seconds to wait on one request before giving up; without a timeout
# requests.get() can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT_SECONDS = 10
# Pause after every request (be nice to the server)
REQUEST_PAUSE_SECONDS = 0.2

# One lookup per distinct, non-null CompanyNumber of the deduplicated frame
company_numbers = df_unique['CompanyNumber'].dropna().unique()

# Raw JSON payloads of all successful lookups
api_results = []

# NOTE(review): the URL below uses plain http while `headers` may carry an
# Authorization value -- confirm whether the endpoint can be called over https.
for number in company_numbers:
    url = f"http://data.companieshouse.gov.uk/doc/company/{number}"
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)

        if response.status_code != 200:
            print(f"Failed to fetch {number} — Status Code: {response.status_code}")
            continue

        json_data = response.json()

        # Only keep payloads that are JSON objects carrying 'primaryTopic'
        if isinstance(json_data, dict) and 'primaryTopic' in json_data:
            api_results.append(json_data)
            print(f"Fetched data for CompanyNumber: {number}")
        else:
            print(f"No 'primaryTopic' found in response for {number}")

    except (requests.RequestException, ValueError) as e:
        # RequestException: connection problems and timeouts.
        # ValueError: the response body was not valid JSON.
        print(f"Error fetching {number}: {e}")

    finally:
        # Pause after every attempt, including failed ones, so a run of
        # errors does not turn into a burst of back-to-back requests.
        time.sleep(REQUEST_PAUSE_SECONDS)


In [None]:
# Flatten the collected API payloads into one flat record per company and
# clean the result the same way as the CSV data.

# Output columns, in order. Declared up front so df_api has the same columns
# even when no payload was collected (an empty list of rows would otherwise
# produce a frame without any columns).
API_COLUMNS = [
    'CompanyName',
    'CompanyNumber',
    'AddressLine1',
    'PostTown',
    'Country',
    'Postcode',
    'CompanyCategory',
    'CompanyStatus',
    'CountryOfOrigin',
    'IncorporationDate',
    'AccountRefDay',
    'AccountRefMonth',
    'NextAccountsDueDate',
    'LastAccountsMadeUpDate',
    'ReturnsNextDueDate',
    'ReturnsLastMadeUpDate',
    'SICCodes',
]


def _flatten_company(topic):
    """Turn one 'primaryTopic' mapping into a flat dict keyed by API_COLUMNS.

    Nested sections that are absent or null are treated as empty, so the
    corresponding fields come out as None instead of raising.
    """
    address = topic.get('RegAddress') or {}
    accounts = topic.get('Accounts') or {}
    returns = topic.get('Returns') or {}

    sic_text = (topic.get('SICCodes') or {}).get('SicText') or []
    # A bare string would otherwise be joined character by character
    if isinstance(sic_text, str):
        sic_text = [sic_text]

    return {
        'CompanyName': topic.get('CompanyName'),
        'CompanyNumber': topic.get('CompanyNumber'),
        'AddressLine1': address.get('AddressLine1'),
        'PostTown': address.get('PostTown'),
        'Country': address.get('Country'),
        'Postcode': address.get('Postcode'),
        'CompanyCategory': topic.get('CompanyCategory'),
        'CompanyStatus': topic.get('CompanyStatus'),
        'CountryOfOrigin': topic.get('CountryOfOrigin'),
        'IncorporationDate': topic.get('IncorporationDate'),
        'AccountRefDay': accounts.get('AccountRefDay'),
        'AccountRefMonth': accounts.get('AccountRefMonth'),
        'NextAccountsDueDate': accounts.get('NextDueDate'),
        'LastAccountsMadeUpDate': accounts.get('LastMadeUpDate'),
        'ReturnsNextDueDate': returns.get('NextDueDate'),
        'ReturnsLastMadeUpDate': returns.get('LastMadeUpDate'),
        'SICCodes': ", ".join(sic_text),
    }


# Flatten into list of rows
rows = [_flatten_company(item['primaryTopic']) for item in api_results]

# Create DataFrame
df_api = pd.DataFrame(rows, columns=API_COLUMNS)

# Strip whitespace from column names
df_api.columns = df_api.columns.str.strip()

# Trim whitespace in all string/object columns
str_cols = df_api.select_dtypes(include=['object']).columns
df_api[str_cols] = df_api[str_cols].apply(lambda x: x.str.strip())

# Date columns to parse as day/month/year; values that do not fit become NaT
date_columns = [
    'IncorporationDate',
    'NextAccountsDueDate',
    'LastAccountsMadeUpDate',
    'ReturnsNextDueDate',
    'ReturnsLastMadeUpDate'
]
for col in date_columns:
    if col in df_api.columns:
        df_api[col] = pd.to_datetime(df_api[col], format='%d/%m/%Y', errors='coerce')

# Columns to cast to nullable integers; unparseable values become <NA>
cols_to_int = [
    'AccountRefDay',
    'AccountRefMonth',
]
for col in cols_to_int:
    if col in df_api.columns:
        df_api[col] = pd.to_numeric(df_api[col], errors='coerce').astype('Int64')

# Mark every API column except the merge key with a suffix so it cannot
# collide with a column of the original data after merging
suffix = '_api'

df_api = df_api.rename(columns={
    col: col + suffix for col in df_api.columns if col != 'CompanyNumber'
})


### 5 - Merge both dfs to perform Matching and Validation

In [None]:
# The deduplicated frame is the "original" side of the comparison. This is the
# same object as df_unique, so the key normalisation below shows through both
# names.
df_original = df_unique

# Make CompanyNumber a whitespace-trimmed string on both sides so the merge
# keys compare equal
for frame in (df_original, df_api):
    frame['CompanyNumber'] = frame['CompanyNumber'].astype(str).str.strip()

# Left join on CompanyNumber: every original record is kept, and API columns
# are null where no API row matches
merged_df = df_original.merge(df_api, how='left', on='CompanyNumber')

### 6 - Matching

In [None]:
# Pairs of (column in the original data, matching column in the API data).
# The order of the pairs is the order in which fields are processed.
_COLUMN_PAIRS = (
    ('CompanyName', 'CompanyName_api'),
    ('RegAddress.AddressLine1', 'AddressLine1_api'),
    ('RegAddress.PostTown', 'PostTown_api'),
    ('RegAddress.Country', 'Country_api'),
    ('RegAddress.PostCode', 'Postcode_api'),
    ('CompanyCategory', 'CompanyCategory_api'),
    ('CompanyStatus', 'CompanyStatus_api'),
    ('CountryOfOrigin', 'CountryOfOrigin_api'),
    ('IncorporationDate', 'IncorporationDate_api'),
    ('Accounts.AccountRefDay', 'AccountRefDay_api'),
    ('Accounts.AccountRefMonth', 'AccountRefMonth_api'),
    ('Accounts.NextDueDate', 'NextAccountsDueDate_api'),
    ('Accounts.LastMadeUpDate', 'LastAccountsMadeUpDate_api'),
    ('ReturnsNextDueDate', 'ReturnsNextDueDate_api'),
    ('ReturnsLastMadeUpDate', 'ReturnsLastMadeUpDate_api'),
)

# Mapping: original column name -> API column name
column_map = dict(_COLUMN_PAIRS)


In [None]:
# Work on a copy so merged_df stays untouched
validated_df = merged_df.copy()


def _compare_values(val1, val2):
    """Classify one (original value, API value) pair.

    Returns one of: 'Missing in both sources', 'Missing in original source',
    'Missing in api source', 'Matched', 'Not Matched', or 'Error' when the
    two values cannot be checked or compared.
    """
    try:
        original_missing = pd.isna(val1)
        api_missing = pd.isna(val2)

        if original_missing and api_missing:
            return 'Missing in both sources'
        if original_missing:
            return 'Missing in original source'
        if api_missing:
            return 'Missing in api source'

        return "Matched" if val1 == val2 else "Not Matched"
    except Exception:
        # Deliberately broad: any value pair that cannot be tested is
        # labelled instead of aborting the whole validation run.
        return "Error"


def _column_values(frame, name):
    """Return frame[name], or an all-missing stand-in when the column is absent."""
    if name in frame.columns:
        return frame[name]
    return pd.Series([None] * len(frame), index=frame.index, dtype=object)


for orig_col, api_col in column_map.items():
    qa_col = orig_col.split('.')[-1] + '_QA'  # cleaner QA column name

    originals = _column_values(validated_df, orig_col)
    api_values = _column_values(validated_df, api_col)

    # Walk the two columns in lockstep instead of building a full row object
    # per record for every field; this also yields a well-formed (empty)
    # QA column when the frame has no rows.
    validated_df[qa_col] = [
        _compare_values(original, fetched)
        for original, fetched in zip(originals, api_values)
    ]


In [None]:
# Show a random handful of QA rows: 6, or all rows when the frame has fewer
# (DataFrame.sample raises when asked for more rows than exist)
qa_columns = [col for col in validated_df.columns if col.endswith('_QA')]
validated_df[qa_columns].sample(min(6, len(validated_df)))


### 7 - Enrichment

In [None]:
# QA verdicts for which the API value replaces the original value
_ENRICH_STATUSES = ["Not Matched", "Missing in original source"]

for orig_col, api_col in column_map.items():
    qa_col = orig_col.split('.')[-1] + '_QA'

    # Nothing to take over when the API column or its QA verdict is absent
    if api_col not in validated_df.columns or qa_col not in validated_df.columns:
        continue

    api_values = validated_df[api_col]

    # Take the API value only where QA flagged the field AND the API actually
    # has a value; everywhere else the original value is kept.
    use_api = validated_df[qa_col].isin(_ENRICH_STATUSES) & api_values.notna()

    if orig_col in validated_df.columns:
        validated_df[orig_col] = validated_df[orig_col].mask(use_api, api_values)
    else:
        # The original column does not exist in the frame: create it from the
        # API values selected above, leaving all other rows missing.
        validated_df[orig_col] = api_values.where(use_api)

# Drop the helper columns (API copies and QA verdicts) from the enriched frame
enriched_df = validated_df.drop(columns=[
    col for col in validated_df.columns if col.endswith(('_api', '_QA'))
])


In [None]:
### incorporation age
# Company age in completed calendar years. Dividing a day count by 365
# ignores leap days and makes the age roll over shortly before the actual
# anniversary, so the age is derived from the calendar fields instead.
enriched_df['IncorporationDate'] = pd.to_datetime(enriched_df['IncorporationDate'], errors='coerce')

today = pd.Timestamp.today()
incorporated = enriched_df['IncorporationDate']

# True where this year's anniversary has not been reached yet.
# Rows without a date compare as False here and still end up as NaN below,
# because their year is missing.
anniversary_pending = (incorporated.dt.month > today.month) | (
    (incorporated.dt.month == today.month) & (incorporated.dt.day > today.day)
)

enriched_df['CompanyAgeYears'] = (
    today.year - incorporated.dt.year - anniversary_pending.astype(int)
)

In [None]:
# Flag filings whose next due date lies before the current timestamp.
# A missing or unparseable due date compares as False, i.e. it is reported
# as not overdue.
today = pd.Timestamp.today()

accounts_due = pd.to_datetime(enriched_df['Accounts.NextDueDate'], errors='coerce')
returns_due = pd.to_datetime(enriched_df['ReturnsNextDueDate'], errors='coerce')

enriched_df['AccountsOverdue'] = accounts_due.lt(today)
enriched_df['ReturnsOverdue'] = returns_due.lt(today)


In [None]:
## Add full address
# Join the available address parts with ", ". Missing or empty parts are
# skipped so that no dangling separators (e.g. ", , ") end up in the result.
address_parts = enriched_df[
    ['RegAddress.AddressLine1', 'RegAddress.PostTown', 'RegAddress.PostCode']
].fillna('').astype(str)

enriched_df['FullAddress'] = [
    ", ".join(part for part in parts if part)
    for parts in address_parts.itertuples(index=False, name=None)
]

### 8 - Reporting and visualisation

In [None]:
# Show how many records were 'Not Matched' for each field.

In [None]:
import matplotlib.pyplot as plt

# Count the 'Not Matched' verdicts in every *_QA column
qa_columns = [name for name in validated_df.columns if name.endswith('_QA')]
qa_summary = validated_df[qa_columns].eq('Not Matched').sum()

# Horizontal bar chart: one bar per QA field
ax = qa_summary.plot(kind='barh', color='salmon', figsize=(8, 6), title='Mismatches per Field')
ax.set_xlabel("Count of Mismatches")
ax.set_ylabel("Field")
ax.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
# Count of companies with overdue filings

In [None]:
# Recompute both overdue flags, then count the True values per flag
now = pd.Timestamp.today()
enriched_df['AccountsOverdue'] = pd.to_datetime(
    enriched_df['Accounts.NextDueDate'], errors='coerce'
).lt(now)
enriched_df['ReturnsOverdue'] = pd.to_datetime(
    enriched_df['ReturnsNextDueDate'], errors='coerce'
).lt(now)

# Summing booleans counts the overdue records in each column
overdue_counts = enriched_df[['AccountsOverdue', 'ReturnsOverdue']].sum()
overdue_counts

In [None]:
# Horizontal bar chart of the ten most frequent post towns
top_post_towns = enriched_df['RegAddress.PostTown'].value_counts().head(10)
top_post_towns.plot(kind='barh', color='steelblue')
