# In [None]:
# ---- Install required packages (run once in Colab) ----
'''
!pip install pytesseract pdf2image python-docx spacy pycountry python-dateutil
!python -m spacy download en_core_web_sm
'''

import pytesseract
from pdf2image import convert_from_path
import spacy
import pycountry
import re
import os
import zipfile
import pandas as pd
from docx import Document
from dateutil import parser
from google.colab import files

# Load spaCy's small English pipeline once at module level; extract_entities()
# runs it over document text to look for PERSON entities.
nlp = spacy.load("en_core_web_sm")

# ---- Helper Functions ----

def ocr_pdf(pdf_path):
    """Return the OCR'd text of a PDF.

    The PDF is converted to images with ``convert_from_path`` and each image
    is passed to Tesseract (English). Every image's text is followed by a
    newline in the returned string.
    """
    page_images = convert_from_path(pdf_path)
    return "".join(
        pytesseract.image_to_string(page, lang="eng") + "\n"
        for page in page_images
    )

def extract_docx(docx_path):
    """Return the text of a DOCX file, one paragraph per line.

    Only ``Document.paragraphs`` is read; the paragraph texts are joined
    with newlines.
    """
    paragraphs = Document(docx_path).paragraphs
    return "\n".join(para.text for para in paragraphs)

def _parse_document_date(raw):
    """Parse one regex-matched date string; return None if it is not a valid date."""
    # With dayfirst=True dateutil reads a year-first string as YYYY-DD-MM, so
    # ISO-style dates are tried month-first and only then day-first.
    year_first = re.match(r'\d{4}[-/]', raw) is not None
    for dayfirst in ((False, True) if year_first else (True,)):
        try:
            return parser.parse(raw, dayfirst=dayfirst)
        except (ValueError, OverflowError):
            # Not a real calendar date under this interpretation.
            continue
    return None


def extract_entities(text):
    """Extract NAME, DOB, COUNTRY, COUNTRY_CODE and CARD_EXPIRY_DATE from text.

    Args:
        text: Document text to scan.

    Returns:
        dict with the five keys above. NAME, DOB and CARD_EXPIRY_DATE are
        None when nothing was found; dates are formatted as YYYY-MM-DD.
        COUNTRY and COUNTRY_CODE are both "Unknown" when no country name
        appears in the text, otherwise the pycountry name and alpha-2 code.
    """
    extracted = {
        "NAME": None,
        "DOB": None,
        "COUNTRY": None,
        "COUNTRY_CODE": None,
        "CARD_EXPIRY_DATE": None
    }

    # Name extraction: the first non-empty PERSON entity wins.
    for ent in nlp(text).ents:
        if ent.label_ == "PERSON":
            candidate = ent.text.strip()
            if candidate:
                extracted["NAME"] = candidate
                break

    # Date extraction (DOB or Expiry): first match of each kind wins.
    for raw in re.findall(r'\b\d{2,4}[-/]\d{1,2}[-/]\d{2,4}\b', text):
        parsed = _parse_document_date(raw)
        if parsed is None:
            continue
        # If year <= 2005, treat as DOB, else as card expiry
        if parsed.year <= 2005:
            if not extracted["DOB"]:
                extracted["DOB"] = parsed.strftime("%Y-%m-%d")
        elif not extracted["CARD_EXPIRY_DATE"]:
            extracted["CARD_EXPIRY_DATE"] = parsed.strftime("%Y-%m-%d")
        if extracted["DOB"] and extracted["CARD_EXPIRY_DATE"]:
            break

    # Country + code. Match whole words only, so "Oman" is not found inside
    # "Romania" or "woman", and try longer names first so "South Sudan" is
    # not reported as "Sudan".
    lowered = text.lower()
    by_length = sorted(
        pycountry.countries, key=lambda c: len(c.name), reverse=True
    )
    for country in by_length:
        needle = country.name.lower()
        if needle not in lowered:
            continue  # cheap substring pre-check before the regex
        if re.search(r'(?<!\w)' + re.escape(needle) + r'(?!\w)', lowered):
            extracted["COUNTRY"] = country.name
            extracted["COUNTRY_CODE"] = country.alpha_2
            break
    if not extracted["COUNTRY"]:
        extracted["COUNTRY"] = "Unknown"
        extracted["COUNTRY_CODE"] = "Unknown"

    return extracted

def compute_risk(entities):
    """Return a 0-100 risk score: 25 points per missing field.

    The fields checked are NAME, DOB, COUNTRY_CODE and CARD_EXPIRY_DATE.
    A field counts as missing when its value is falsy; COUNTRY_CODE also
    counts as missing when it equals "Unknown".
    """
    absent = [
        not entities["NAME"],
        not entities["DOB"],
        (not entities["COUNTRY_CODE"]) or entities["COUNTRY_CODE"] == "Unknown",
        not entities["CARD_EXPIRY_DATE"],
    ]
    # Simple scoring logic, capped at 100
    return min(25 * sum(absent), 100)

def process_file(file_path):
    """Extract entities from one PDF or DOCX file and score it.

    The file type is taken from the lower-cased text after the last dot in
    ``file_path``. Returns None for anything other than "pdf" or "docx".
    Otherwise returns the entity dict from ``extract_entities`` with four
    extra keys: File (base name), Risk_Score, Status ("Flagged" when the
    score is above 50, else "Verified") and Card_Validity.
    """
    suffix = file_path.lower().rsplit('.', 1)[-1]
    if suffix not in ("pdf", "docx"):
        return None  # unsupported for single file

    reader = ocr_pdf if suffix == "pdf" else extract_docx
    record = extract_entities(reader(file_path))

    record["File"] = os.path.basename(file_path)
    record["Risk_Score"] = compute_risk(record)
    if record["Risk_Score"] > 50:
        record["Status"] = "Flagged"
    else:
        record["Status"] = "Verified"
    if record["CARD_EXPIRY_DATE"]:
        record["Card_Validity"] = "Provided"
    else:
        record["Card_Validity"] = "Not Provided"
    return record

def process_zip(zip_path):
    """Process every PDF/DOCX inside a ZIP archive.

    The archive is extracted into a fresh subdirectory of ``unzipped_files``
    and each extracted file is passed to ``process_file``.

    Args:
        zip_path: Path to the ZIP archive.

    Returns:
        List of result dicts, one per supported file in the archive.
    """
    import tempfile  # local import: only this function needs it

    extract_root = "unzipped_files"
    os.makedirs(extract_root, exist_ok=True)
    # A new directory per call: walking a shared directory would re-process
    # files left behind by previously extracted archives and duplicate rows.
    extract_dir = tempfile.mkdtemp(prefix="zip_", dir=extract_root)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    results = []
    for root, dirs, filenames in os.walk(extract_dir):
        # macOS archives carry "__MACOSX/._name.pdf" metadata entries that
        # are not real documents; prune that directory from the walk.
        dirs[:] = [d for d in dirs if d != "__MACOSX"]
        for filename in filenames:
            result = process_file(os.path.join(root, filename))
            if result:
                results.append(result)
    return results

def process_folder(folder_path):
    """Walk a folder tree and process every PDF/DOCX found in it.

    Each file under ``folder_path`` (including subdirectories) is passed to
    ``process_file``; falsy results (unsupported files) are dropped.
    """
    collected = []
    for dirpath, _subdirs, filenames in os.walk(folder_path):
        outcomes = (
            process_file(os.path.join(dirpath, name)) for name in filenames
        )
        collected.extend(entry for entry in outcomes if entry)
    return collected

# ---- Colab File Upload ----
# The keys of the returned mapping are used below as the uploaded file names.
uploaded = files.upload()  # Upload PDF/DOCX or ZIP

all_results = []

for fname in uploaded:
    print(f"Processing: {fname}")
    ext = fname.lower().rsplit('.', 1)[-1]
    if ext == "zip":
        # An archive contributes one row per supported file inside it.
        all_results.extend(process_zip(fname))
        continue
    if ext not in ("pdf", "docx"):
        print(f"Unsupported file type: {fname}")
        continue
    record = process_file(fname)
    if record:
        all_results.append(record)

# ---- Save Results ----
df = pd.DataFrame(all_results)
df.to_csv("risk_results.csv", index=False)
print("✅ Results saved to risk_results.csv")
# Bare expression so the notebook renders the table as the cell output.
df
