In [None]:
import pandas as pd
import numpy as np
import re
import os
from google.colab import drive

 # Step 1: Mount Google Drive at /content/drive (uses the Colab-only google.colab API).
# The input folder configured further down is read from under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Step 2: Define Paths
# metadata_file: CSV describing the expected columns; it is read with pd.read_csv by
# load_column_metadata. Note that this path is outside the mounted Drive folder.
metadata_file = "/content/datageninput.csv"  # Update path to metadata file
# input_folder: directory on the mounted Drive whose .csv files are validated one by one.
input_folder = "/content/drive/MyDrive/indium/App-Thon"  # Folder containing multiple input files

In [None]:

def load_column_metadata(metadata_path):
    """Read the column-metadata CSV and split it into lookup structures.

    Returns a 5-tuple, in this order:
      1. list of every 'Column Name', in file order;
      2. list of the column names whose 'Requirement' equals 'Mandatory';
      3. dict mapping column name -> 'Format' (rows without a format are omitted);
      4. dict mapping column name -> 'Valid Values' (rows without one are omitted);
      5. dict mapping column name -> 'max length' (rows without one are omitted).
    """
    metadata = pd.read_csv(metadata_path)
    by_name = metadata.set_index('Column Name')

    def non_null_lookup(field):
        # Map column name -> value of `field`, skipping rows where it is blank.
        return by_name[field].dropna().to_dict()

    all_columns = metadata['Column Name'].tolist()
    is_mandatory = metadata['Requirement'] == 'Mandatory'
    required_columns = metadata.loc[is_mandatory, 'Column Name'].tolist()

    return (
        all_columns,
        required_columns,
        non_null_lookup('Format'),
        non_null_lookup('Valid Values'),
        non_null_lookup('max length'),
    )

def validate_input_data(input_file, metadata_file):
    """Validate one headerless CSV input file against the column metadata.

    Parameters
    ----------
    input_file : path or file-like
        CSV without a header row. Columns are named positionally from the
        metadata's 'Column Name' list.
    metadata_file : path or file-like
        Metadata CSV, loaded through ``load_column_metadata``.

    Returns
    -------
    list of str
        One human-readable message per failed check; empty when nothing failed.

    Checks performed, in order: missing values in mandatory columns, regex
    format (matched from the start of the value, as ``re.match`` does), membership
    in the comma-separated valid-value list, and maximum text length.
    Missing cells are reported only by the mandatory check; the other three
    checks skip them, so a blank cell in an optional column is not an error.
    """
    column_names, mandatory_cols, format_dict, valid_values_dict, max_length_dict = load_column_metadata(metadata_file)
    # dtype=str keeps every cell as the text that is actually in the file. With type
    # inference, an integer column containing a blank is read as float ("123" -> "123.0")
    # and leading zeros are dropped, which corrupts the format and length checks.
    input_df = pd.read_csv(input_file, header=None, names=column_names, dtype=str)
    errors = []

    def text_and_presence(col):
        # Return (cells as strings, mask of non-missing cells). astype(str) can turn a
        # missing cell into the literal 'nan', so callers must AND their result with the mask.
        column = input_df[col]
        return column.astype(str), column.notna()

    # Check for missing values in mandatory columns
    for col in mandatory_cols:
        if col in input_df.columns:
            missing_count = input_df[col].isna().sum()
            if missing_count > 0:
                errors.append(f"Column '{col}' has {missing_count} missing values but is mandatory.")
        else:
            # Defensive only: names=column_names above creates every metadata column, so a
            # file with too few fields normally surfaces as all-missing values instead.
            errors.append(f"Mandatory column '{col}' is missing from the input file.")

    # Validate formats
    for col, pattern in format_dict.items():
        if col not in input_df.columns:
            continue
        pattern_text = str(pattern)
        try:
            re.compile(pattern_text)
        except re.error as exc:
            # A broken pattern in the metadata is reported instead of aborting the run.
            errors.append(f"Column '{col}' has an unusable format pattern {pattern_text!r} in the metadata: {exc}")
            continue
        text, present = text_and_presence(col)
        matches = text.str.match(pattern_text, na=False)
        invalid_entries = input_df.loc[present & ~matches, col]
        if not invalid_entries.empty:
            errors.append(f"Column '{col}' has invalid format entries: {invalid_entries.unique()[:5]}")

    # Validate valid values
    for col, valid_vals in valid_values_dict.items():
        if col not in input_df.columns:
            continue
        # Values are comma-separated; trim each one so "A, B" accepts both "A" and "B".
        # str() guards against a metadata cell that pandas parsed as a number.
        valid_list = [value.strip() for value in str(valid_vals).split(',')]
        text, present = text_and_presence(col)
        invalid_entries = input_df.loc[present & ~text.isin(valid_list), col]
        if not invalid_entries.empty:
            errors.append(f"Column '{col}' has invalid values: {invalid_entries.unique()[:5]}")

    # Validate max length
    for col, max_len in max_length_dict.items():
        if col not in input_df.columns:
            continue
        text, present = text_and_presence(col)
        # max_len may arrive as a float (e.g. 5.0) when the metadata column has blanks.
        exceeds_limit = text.str.len() > int(max_len)
        too_long = input_df.loc[present & exceeds_limit, col]
        if not too_long.empty:
            errors.append(f"Column '{col}' has entries exceeding max length: {too_long.unique()[:5]}")

    return errors

# Process multiple input files
# report maps each validated file name to its list of error messages (empty list = clean).
report = {}

# os.listdir returns names in arbitrary order; sorting makes the log and the
# consolidated report reproducible between runs.
for file_name in sorted(os.listdir(input_folder)):
    if not file_name.lower().endswith(".csv"):  # Process only CSV files (any letter case)
        continue
    file_path = os.path.join(input_folder, file_name)
    print(f"\nValidating: {file_name}...")
    try:
        errors = validate_input_data(file_path, metadata_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        # One empty or malformed CSV should show up in the report, not abort the batch.
        errors = [f"Validation could not run because a CSV could not be parsed: {exc}"]
    report[file_name] = errors

# Print Consolidated Report
for file, errors in report.items():
    print("\n----------------------------------")
    print(f"Validation Report for: {file}")
    print("----------------------------------")
    if errors:
        for err in errors:
            print("-", err)
    else:
        print("No validation errors found. Data is clean.")



Validating: input_file_33.csv...

Validating: input_file_32.csv...

Validating: input_file_30.csv...

Validating: input_file_31.csv...

Validating: input_file_34.csv...

Validating: input_file_23.csv...

Validating: input_file_27.csv...

Validating: input_file_25.csv...

Validating: input_file_26.csv...

Validating: input_file_24.csv...

Validating: input_file_28.csv...

Validating: input_file_19.csv...

Validating: input_file_20.csv...

Validating: input_file_21.csv...

Validating: input_file_29.csv...

Validating: input_file_22.csv...

Validating: input_file_18.csv...

Validating: input_file_11.csv...

Validating: input_file_15.csv...

Validating: input_file_10.csv...

Validating: input_file_14.csv...

Validating: input_file_13.csv...

Validating: input_file_17.csv...

Validating: input_file_16.csv...

Validating: input_file_12.csv...

----------------------------------
Validation Report for: input_file_33.csv
----------------------------------
- Column 'ClaimID' has 3 missing value