In [17]:
import sys
import subprocess
import asyncio
import json
import joblib
import pandas as pd
import numpy as np
from datetime import datetime
from typing import Dict, Any
from openai import OpenAI
from sklearn.base import BaseEstimator, TransformerMixin

# Function to install missing packages

# Define the DateFeatureExtractor
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """ Custom transformer for extracting features from date columns. """
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5, 6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        return X.drop(['submission_date', 'expense_date'], axis=1)

# Define the ExpenseValidationService
class ExpenseValidationService:
    def __init__(self, model_path: str, openai_key: str, assistant_id: str):
        self.models = self._load_models(model_path)
        self.client = OpenAI(api_key=openai_key)
        self.assistant_id = assistant_id
        self.date_extractor = DateFeatureExtractor()
        self.allowed_budgets = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
        self.allowed_categories = {
            "Engineering": ["Travel", "Meals", "Supplies"],
            "IT": ["Travel", "Supplies"],
            "Finance": ["Travel", "Meals"],
            "HR": ["Meals"],
            "Operations": ["Travel", "Meals", "Supplies"],
            "Sales": ["Travel", "Meals"],
            "Marketing": ["Travel", "Meals", "Supplies"]
        }

    def _load_models(self, model_path: str) -> Dict:
        models = {}
        for name in [ 'GradientBoosting', 'XGBoost', 'LogisticRegression']:
            try:
                models[name] = joblib.load(f"{model_path}/{name}_model.joblib")
            except Exception as e:
                print(f"Error loading {name}: {e}")
        return models

    def _preprocess_data(self, df: pd.DataFrame) -> pd.DataFrame:
        return self.date_extractor.transform(df)

    async def validate_tax_compliance(self, bill_data: Dict[str, Any]) -> Dict[str, Any]:
        thread = await self.client.beta.threads.create()
        await self.client.beta.threads.messages.create(thread_id=thread.id, role="user", content=str(bill_data))
        run = await self.client.beta.threads.runs.create(thread_id=thread.id, assistant_id=self.assistant_id)
        messages = await self.client.beta.threads.messages.list(thread_id=thread.id)
        response = messages.data[0].content[0].text.value
        return eval(response)

    def validate_expense(self, expense_data: Dict[str, Any]) -> Dict[str, Any]:
        df = pd.DataFrame([expense_data])
        df_processed = self._preprocess_data(df)
        predictions = {}
        confidences = {}

        for name, model in self.models.items():
            try:
                pred = model.predict(df_processed)[0]
                prob = max(model.predict_proba(df_processed)[0]) * 100
                predictions[name] = "Violation" if pred == 1 else "Normal"
                confidences[name] = round(prob, 2)
            except Exception as e:
                predictions[name] = f"Error: {str(e)}"
                confidences[name] = 0

        compliance_flags = self._check_compliance(expense_data)
        return {
            "ml_predictions": {"predictions": predictions, "confidences": confidences},
            "rule_based_checks": compliance_flags,
            "overall_status": "Violation" if compliance_flags else "Normal"
        }

    def _check_compliance(self, expense: Dict[str, Any]) -> Dict[str, str]:
        flags = {}
        cat = expense.get('category')
        dept = expense.get('department')
        amt = expense.get('amount', 0)
        notes = expense.get('notes', "").strip()

        if cat in self.allowed_budgets and amt > self.allowed_budgets[cat]:
            flags['Over Budget'] = f"Amount {amt} exceeds budget {self.allowed_budgets[cat]} for {cat}"

        if dept in self.allowed_categories and cat not in self.allowed_categories[dept]:
            flags['Unauthorized Category'] = f"Category {cat} not allowed for {dept}"

        if cat in self.allowed_budgets and amt > self.allowed_budgets[cat] and not notes:
            flags['Missing Justification'] = "High amount claimed without justification"

        return flags

    async def validate_expense_and_tax(self, expense_data: Dict[str, Any], bill_data: Dict[str, Any]) -> Dict[str, Any]:
        expense_validation = self.validate_expense(expense_data)
        tax_validation = await self.validate_tax_compliance(bill_data)
        return {
            "expense_validation": expense_validation,
            "tax_validation": tax_validation,
            "overall_status": "Valid" if (expense_validation["overall_status"] == "Normal" 
                                        and tax_validation.get("bill_valid", False)) else "Invalid"
        }

# Define the main function
async def run_validation():
    expense_data = {
        'expense_id': 201,
        'employee_id': 1500,
        'amount': 7500.00,
        'receipt_quality': 0.65,
        'ocr_confidence': 0.90,
        'previous_violations': 1,
        'department': 'Engineering',
        'category': 'Travel',
        'currency': 'INR',
        'vendor_country': 'US',
        'payment_method': 'Credit Card',
        'expense_date': '2024-02-08',
        'submission_date': '2024-02-08',
        'requires_approval': 1,
        'has_receipt': 1,
        'manual_review_required': 0,
        'notes': ''
    }

    bill_data = {
        "invoice_number": "X33",
        "invoice_date": "2024-02-07",
        "due_date": "2024-02-20",
        "seller": {
            "name": "Sleek Bill",
            "address": "XYZ Building, New Delhi, India",
            "gst_number": "27AABQA12S4A1Z5",
            "contact": "+91 9876543210"
        },
        "buyer": {
            "name": "AB Company",
            "address": "ABC Tower, Mumbai, India",
            "gst_number": "27AAAPA1234A1Z5",
            "contact": "+91 9123456789"
        },
        "bill_items": [
            {"description": "Service 1", "hsn_sac": "9983", "quantity": 2, "rate": 5000, "subtotal": 10000, "gst_rate": 18, "gst_amount": 1800, "total": 11800}
        ],
        "subtotal": 10000,
        "gst_total": 1800,
        "grand_total": 11800,
        "payment_terms": "Payment due in 15 days"
    }

    service = ExpenseValidationService("models/", "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", "asst_EpFa1gBPouBslsTcGOsLBEGE")
    result = await service.validate_expense_and_tax(expense_data, bill_data)
    print(json.dumps(result, indent=2))

# Run in Jupyter Notebook
await run_validation()


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Error loading GradientBoosting: node array from the pickle has an incompatible dtype:
- expected: [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]
- got     : {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}


TypeError: object Thread can't be used in 'await' expression

In [11]:
!pip install scikit-learn==1.0.2  
!pip install scikit-learn==1.2.2  
!pip install scikit-learn==1.3.0  


Collecting scikit-learn==1.0.2
  Downloading scikit-learn-1.0.2.tar.gz (6.7 MB)
     ---------------------------------------- 0.0/6.7 MB ? eta -:--:--
     ---------------------------------------- 0.0/6.7 MB ? eta -:--:--
      --------------------------------------- 0.1/6.7 MB 1.1 MB/s eta 0:00:07
     --- ------------------------------------ 0.5/6.7 MB 4.2 MB/s eta 0:00:02
     --- ------------------------------------ 0.6/6.7 MB 3.7 MB/s eta 0:00:02
     ------- -------------------------------- 1.2/6.7 MB 5.6 MB/s eta 0:00:01
     -------- ------------------------------- 1.4/6.7 MB 6.3 MB/s eta 0:00:01
     ----------- ---------------------------- 2.0/6.7 MB 6.4 MB/s eta 0:00:01
     -------------- ------------------------- 2.4/6.7 MB 7.2 MB/s eta 0:00:01
     ---------------- ----------------------- 2.8/6.7 MB 6.9 MB/s eta 0:00:01
     ------------------ --------------------- 3.1/6.7 MB 7.3 MB/s eta 0:00:01
     ------------------ --------------------- 3.1/6.7 MB 7.3 MB/s eta 0:00:0

  error: subprocess-exited-with-error
  
  Preparing metadata (pyproject.toml) did not run successfully.
  exit code: 1
  
  [1788 lines of output]
  Partial import of sklearn during the build process.
  
    `numpy.distutils` is deprecated since NumPy 1.23.0, as a result
    of the deprecation of `distutils` itself. It will be removed for
    Python >= 3.12. For older Python versions it will remain present.
    It is recommended to use `setuptools < 60.0` for those Python versions.
    For more details, see:
      https://numpy.org/devdocs/reference/distutils_status_migration.html
  
  
    from numpy.distutils.command.build_ext import build_ext  # noqa
  INFO: No module named 'numpy.distutils._msvccompiler' in numpy.distutils; trying from distutils
  INFO: C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.42.34433\bin\HostX86\x64\cl.exe /c /nologo /O2 /W3 /GL /DNDEBUG /MD -IC:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.42.34433\includ

Collecting scikit-learn==1.3.0
  Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Downloading scikit_learn-1.3.0-cp311-cp311-win_amd64.whl (9.2 MB)
   ---------------------------------------- 0.0/9.2 MB ? eta -:--:--
   - -------------------------------------- 0.3/9.2 MB 5.9 MB/s eta 0:00:02
   ---- ----------------------------------- 1.0/9.2 MB 10.8 MB/s eta 0:00:01
   ------- -------------------------------- 1.8/9.2 MB 12.7 MB/s eta 0:00:01
   ------------ --------------------------- 2.9/9.2 MB 15.6 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 13.2 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 13.2 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 13.2 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 13.2 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 13.2 MB/s eta 0:00:01
   ------------- -------------------------- 3.1/9.2 MB 13.2 MB/s e

In [14]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = {
    "Engineering": ["Travel", "Meals", "Supplies"],
    "IT": ["Travel", "Supplies"],
    "Finance": ["Travel", "Meals"],
    "HR": ["Meals"],
    "Operations": ["Travel", "Meals", "Supplies"],
    "Sales": ["Travel", "Meals"],
    "Marketing": ["Travel", "Meals", "Supplies"]
}

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.df, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.df = pickle.load(f)
    
    def check_violation(self, expense):
        category = expense.get('category')
        department = expense.get('department')
        amount = expense.get('amount', 0)
        notes = expense.get('notes', "").strip()
        
        violations = []
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
            violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
        if department in ALLOWED_CATEGORIES and category not in ALLOWED_CATEGORIES[department]:
            violations.append(f"Unauthorized Category: {category} is not allowed for department {department}.")
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and notes == "":
            violations.append("Missing Justification: High amount claimed but justification is missing.")
        
        return {
            "expense_id": expense.get('expense_id', "N/A"),
            "employee_id": expense.get('employee_id', "N/A"),
            "category": category,
            "amount": amount,
            "violations": violations if violations else "No Violations"
        }

class SyntheticDataGenerator:
    def __init__(self):
        self.departments = list(ALLOWED_CATEGORIES.keys())
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'department': np.random.choice(self.departments, n_samples),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'notes': np.random.choice(['Justified', ''], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        dept = row['department']
        amt = row['amount']
        note = row['notes'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if dept in ALLOWED_CATEGORIES and cat not in ALLOWED_CATEGORIES[dept]:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and note == "":
            return 1
        return 0

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    return jsonify(expense_model.check_violation(data))

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)



 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
127.0.0.1 - - [09/Feb/2025 01:24:41] "POST /check-expense HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 01:24:49] "POST /check-expense HTTP/1.1" 200 -


In [21]:
import requests

url = "http://127.0.0.1:4000/check-expense"
data = {
    "expense_id": 201,
    "employee_id": 1500,
    "amount": 750000,
    "receipt_quality": 0.65,
    "ocr_confidence": 0.90,
    "previous_violations": 1,
    "department": "Engineering",
    "category": "Travel",
    "currency": "INR",
    "vendor_country": "US",
    "payment_method": "Credit Card",
    "expense_date": "2024-02-01",
    "submission_date": "2024-02-06",
    "requires_approval": 1,
    "has_receipt": 1,
    "manual_review_required": 0,
    "notes": ""
}

response = requests.post(url, json=data)
print(response.json())  # Print API response


ConnectionError: HTTPConnectionPool(host='127.0.0.1', port=4000): Max retries exceeded with url: /check-expense (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001E8D55E1250>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [12]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        
        # Prepare data for training
        features = ['amount', 'receipt_quality', 'ocr_confidence', 'currency', 'vendor_country', 'payment_method']
        X = pd.get_dummies(self.df[features])
        y = self.df['category'].apply(lambda x: 1 if x not in ALLOWED_CATEGORIES else 0)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.model, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.model = pickle.load(f)
    
    def predict_all(self, expense_df):
        features = ['amount', 'receipt_quality', 'ocr_confidence', 'currency', 'vendor_country', 'payment_method']
        expense_df = pd.get_dummies(expense_df[features]).reindex(columns=self.model.feature_names_in_, fill_value=0)
        
        predictions = {}
        probabilities = {}
        for model_name in ["GradientBoosting Model"]:
            pred = self.model.predict(expense_df)
            prob = self.model.predict_proba(expense_df)
            predictions[model_name] = pred.tolist()
            probabilities[model_name] = prob.tolist()
        return predictions, probabilities

def _check_compliance(expense):
    flags = {}
    cat = expense.get('category')
    amt = expense.get('amount', 0)
    description = expense.get('description', "").strip()
    
    if cat in ALLOWED_BUDGETS and amt > ALLOWED_BUDGETS[cat]:
        flags['Over Budget'] = f"Amount {amt} exceeds budget {ALLOWED_BUDGETS[cat]} for {cat}"
    if cat not in ALLOWED_CATEGORIES:
        flags['Unauthorized Category'] = f"Category {cat} is not authorized"
    if cat in ALLOWED_BUDGETS and amt > ALLOWED_BUDGETS[cat] and not description:
        flags['Missing Justification'] = "High amount claimed without justification"
    
    return flags

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    df_new = pd.DataFrame([data])
    predictions, probabilities = expense_model.predict_all(df_new)
    compliance_flags = _check_compliance(data)
    response = {
        "expense_validation": {
            "ml_predictions": {
                "predictions": {
                    model: "Violation" if pred[0] == 1 else "Normal"
                    for model, pred in predictions.items()
                },
                "confidences": {
                    model: round(max(prob[0]) * 100, 2)
                    for model, prob in probabilities.items()
                }
            },
            "rule_based_checks": compliance_flags if compliance_flags else "No Violations",
            "overall_status": "Violation" if compliance_flags or any(p[0] == 1 for p in predictions.values()) else "Normal"
        }
    }
    return jsonify(response)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
[2025-02-09 01:23:36,523] ERROR in app: Exception on /check-expense [POST]
Traceback (most recent call last):
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint])(**view_args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
127.0.0.1 - - [09/Feb/2025 00:32:51] "POST /generate-data HTTP/1.1" 405 -
127.0.0.1 - - [09/Feb/2025 00:32:56] "GET /generate-data HTTP/1.1" 200 -
[2025-02-09 00:33:04,327] ERROR in app: Exception on /check-expense [POST]
Traceback (most recent call last):
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1799, in disp

In [29]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = {
    "Engineering": ["Travel", "Meals", "Supplies"],
    "IT": ["Travel", "Supplies"],
    "Finance": ["Travel", "Meals"],
    "HR": ["Meals"],
    "Operations": ["Travel", "Meals", "Supplies"],
    "Sales": ["Travel", "Meals"],
    "Marketing": ["Travel", "Meals", "Supplies"]
}

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.df, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.df = pickle.load(f)
    
    def check_violation(self, expense):
        category = expense.get('category')
        department = expense.get('department')
        amount = expense.get('amount', 0)
        notes = expense.get('notes', "").strip()
        
        violations = []
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
            violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
        if department in ALLOWED_CATEGORIES and category not in ALLOWED_CATEGORIES[department]:
            violations.append(f"Unauthorized Category: {category} is not allowed for department {department}.")
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and notes == "":
            violations.append("Missing Justification: High amount claimed but justification is missing.")
        
        return {
            "expense_id": expense.get('expense_id', "N/A"),
            "employee_id": expense.get('employee_id', "N/A"),
            "category": category,
            "amount": amount,
            "violations": violations if violations else "No Violations"
        }

class SyntheticDataGenerator:
    def __init__(self):
        self.departments = list(ALLOWED_CATEGORIES.keys())
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'department': np.random.choice(self.departments, n_samples),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', ''], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        dept = row['department']
        amt = row['amount']
        note = row['notes'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if dept in ALLOWED_CATEGORIES and cat not in ALLOWED_CATEGORIES[dept]:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and note == "":
            return 1
        return 0

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    return jsonify(expense_model.check_violation(data))

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)



 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit


In [11]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', ''], n_samples, p=[0.7, 0.3]),
        }
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        amt = row['amount']
        desc = row['description'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if cat not in ALLOWED_CATEGORIES:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and desc == "":
            return 1
        return 0

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        
        # Prepare data for training
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        X = pd.get_dummies(self.df[features])
        y = self.df['category'].apply(lambda x: 1 if x not in ALLOWED_CATEGORIES else 0)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        with open(MODEL_FILE, 'wb') as f:
          pickle.dump(self.model, f)
        
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.model = pickle.load(f)

    # Ensure the loaded model is a valid scikit-learn model
        if not hasattr(self.model, "predict"):
               raise ValueError("Loaded object is not a trained model. Retrain using /generate-data.")

    # Load synthetic data used during training
        if os.path.exists(DATA_FILE):
                self.df = pd.read_csv(DATA_FILE)
        else:
             raise FileNotFoundError("Training data file not found. Please run /generate-data first.")

    
    def predict_all(self, expense_df):
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        expense_df = pd.get_dummies(expense_df[features])
        
        # Ensure all required features are present in the input data
        model_features = self.df[features].columns.tolist()
        missing_cols = set(model_features) - set(expense_df.columns)
        for col in missing_cols:
            expense_df[col] = 0
        
        expense_df = expense_df.reindex(columns=model_features, fill_value=0)
        
        predictions = {}
        probabilities = {}
        for model_name in ["GradientBoosting Model"]:
            pred = self.model.predict(expense_df)
            prob = self.model.predict_proba(expense_df)
            predictions[model_name] = pred.tolist()
            probabilities[model_name] = prob.tolist()
        return predictions, probabilities

def _check_compliance(expense):
    flags = {}
    cat = expense.get('category')
    amt = expense.get('amount', 0)
    description = expense.get('description', "").strip()
    
    if cat in ALLOWED_BUDGETS and amt > ALLOWED_BUDGETS[cat]:
        flags['Over Budget'] = f"Amount {amt} exceeds budget {ALLOWED_BUDGETS[cat]} for {cat}"
    if cat not in ALLOWED_CATEGORIES:
        flags['Unauthorized Category'] = f"Category {cat} is not authorized"
    if cat in ALLOWED_BUDGETS and amt > ALLOWED_BUDGETS[cat] and not description:
        flags['Missing Justification'] = "High amount claimed without justification"
    
    return flags

# @app.route('/generate-data', methods=['GET'])
# def generate_data():
#     generator = SyntheticDataGenerator()
#     df = generator.generate_data(n_samples=1000)
#     df.to_csv(DATA_FILE, index=False)
#     return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    df_new = pd.DataFrame([data])
    predictions, probabilities = expense_model.predict_all(df_new)
    compliance_flags = _check_compliance(data)
    response = {
        "expense_validation": {
            "ml_predictions": {
                "predictions": {
                    model: "Violation" if pred[0] == 1 else "Normal"
                    for model, pred in predictions.items()
                },
                "confidences": {
                    model: round(max(prob[0]) * 100, 2)
                    for model, prob in probabilities.items()
                }
            },
            "rule_based_checks": compliance_flags if compliance_flags else "No Violations",
            "overall_status": "Violation" if compliance_flags or any(p[0] == 1 for p in predictions.values()) else "Normal"
        }
    }
    return jsonify(response)

if __name__ == '__main__':
    expense_model = ExpenseModel()
    app.run(host="0.0.0.0", port=4000, debug=False)


ValueError: Loaded object is not a trained model. Retrain using /generate-data.

In [15]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.df, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.df = pickle.load(f)
    
    def check_violation(self, expense):
        category = expense.get('category')
        amount = expense.get('amount', 0)
        description = expense.get('description', "").strip()
        
        violations = []
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
            violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
        if category not in ALLOWED_CATEGORIES:
            violations.append(f"Unauthorized Category: {category} is not allowed.")
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and description.lower() == "no description":
            violations.append("Missing Justification: High amount claimed but justification is missing.")
        
        return {
            "expense_id": expense.get('expense_id', "N/A"),
            "employee_id": expense.get('employee_id', "N/A"),
            "category": category,
            "amount": amount,
            "violations": violations if violations else "No Violations"
        }

class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', 'no description'], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        amt = row['amount']
        description = row['description'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if cat not in ALLOWED_CATEGORIES:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and description.lower() == "no description":
            return 1
        return 0

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    return jsonify(expense_model.check_violation(data))

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
127.0.0.1 - - [09/Feb/2025 01:27:22] "POST /check-expense HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 01:27:31] "POST /check-expense HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 01:27:47] "POST /check-expense HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 01:28:14] "POST /check-expense HTTP/1.1" 200 -


In [16]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        
        # Prepare data for training
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        X = pd.get_dummies(self.df[features])
        y = self.df['category'].apply(lambda x: 1 if x not in ALLOWED_CATEGORIES else 0)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.model, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.model = pickle.load(f)
    
    def predict_all(self, expense_df):
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        expense_df = pd.get_dummies(expense_df[features])
        
        # Ensure all required features are present in the input data
        model_features = getattr(self.model, "feature_names_in_", None)
        if model_features is None:
            model_features = self.df[features].columns.tolist()
        
        missing_cols = set(model_features) - set(expense_df.columns)
        for col in missing_cols:
            expense_df[col] = 0
        
        expense_df = expense_df.reindex(columns=model_features, fill_value=0)
        
        predictions = {}
        probabilities = {}
        for model_name in ["GradientBoosting Model"]:
            pred = self.model.predict(expense_df)
            prob = self.model.predict_proba(expense_df)
            predictions[model_name] = pred.tolist()
            probabilities[model_name] = prob.tolist()
        return predictions, probabilities
    
    def check_violation(self, expense):
        category = expense.get('category')
        amount = expense.get('amount', 0)
        description = expense.get('description', "").strip()
        
        violations = []
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
            violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
        if category not in ALLOWED_CATEGORIES:
            violations.append(f"Unauthorized Category: {category} is not allowed.")
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and description.lower() == "no description":
            violations.append("Missing Justification: High amount claimed but justification is missing.")
        
        return {
            "expense_id": expense.get('expense_id', "N/A"),
            "employee_id": expense.get('employee_id', "N/A"),
            "category": category,
            "amount": amount,
            "violations": violations if violations else "No Violations"
        }

class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', 'no description'], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        amt = row['amount']
        description = row['description'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if cat not in ALLOWED_CATEGORIES:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and description.lower() == "no description":
            return 1
        return 0

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    df_new = pd.DataFrame([data])
    predictions, probabilities = expense_model.predict_all(df_new)
    compliance_flags = expense_model.check_violation(data)
    response = {
        "expense_validation": {
            "ml_predictions": predictions,
            "probabilities": probabilities,
            "rule_based_checks": compliance_flags
        }
    }
    return jsonify(response)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
127.0.0.1 - - [09/Feb/2025 01:30:07] "GET /generate-data HTTP/1.1" 200 -
[2025-02-09 01:30:09,927] ERROR in app: Exception on /check-expense [POST]
Traceback (most recent call last):
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint

In [17]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            try:
                self.load_model()
                self.df = pd.read_csv(DATA_FILE)  # Ensure self.df exists after loading
            except Exception as e:
                print(f"Error loading model: {e}. Retraining model...")
                self.train_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)  # Save dataset for later use
        self.feature_columns = list(pd.get_dummies(self.df[['amount', 'currency', 'vendor_country', 'payment_method']]).columns)  # Store features
        
        # Prepare data for training
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        X = pd.get_dummies(self.df[features]).reindex(columns=self.feature_columns, fill_value=0)
        y = self.df['category'].apply(lambda x: 1 if x not in ALLOWED_CATEGORIES else 0)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.model, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.model = pickle.load(f)
        self.df = pd.read_csv(DATA_FILE)  # Ensure self.df exists after loading
    
    def predict_all(self, expense_df):
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        expense_df = pd.get_dummies(expense_df[features])
        
        if not hasattr(self, "feature_columns"):
            self.feature_columns = list(pd.get_dummies(self.df[['amount', 'currency', 'vendor_country', 'payment_method']]).columns)
        
        model_features = self.feature_columns
        missing_cols = set(model_features) - set(expense_df.columns)
        for col in missing_cols:
            expense_df[col] = 0
        
        expense_df = expense_df.reindex(columns=model_features, fill_value=0)
        
        predictions = {}
        probabilities = {}
        for model_name in ["GradientBoosting Model"]:
            pred = self.model.predict(expense_df)
            prob = self.model.predict_proba(expense_df)
            predictions[model_name] = pred.tolist()
            probabilities[model_name] = prob.tolist()
        return predictions, probabilities
    
    def check_violation(self, expense):
        category = expense.get('category')
        amount = expense.get('amount', 0)
        description = expense.get('description', "").strip()
        
        violations = []
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
            violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
        if category not in ALLOWED_CATEGORIES:
            violations.append(f"Unauthorized Category: {category} is not allowed.")
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and description.lower() == "no description":
            violations.append("Missing Justification: High amount claimed but justification is missing.")
        
        return {
            "expense_id": expense.get('expense_id', "N/A"),
            "employee_id": expense.get('employee_id', "N/A"),
            "category": category,
            "amount": amount,
            "violations": violations if violations else "No Violations"
        }

class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', 'no description'], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    df_new = pd.DataFrame([data])
    predictions, probabilities = expense_model.predict_all(df_new)
    compliance_flags = expense_model.check_violation(data)
    response = {
        "expense_validation": {
            "ml_predictions": predictions,
            "probabilities": probabilities,
            "rule_based_checks": compliance_flags
        }
    }
    return jsonify(response)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)

 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
127.0.0.1 - - [09/Feb/2025 01:32:30] "GET /generate-data HTTP/1.1" 200 -
[2025-02-09 01:32:34,681] ERROR in app: Exception on /check-expense [POST]
Traceback (most recent call last):
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 2529, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1825, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1823, in full_dispatch_request
    rv = self.dispatch_request()
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\nitin\anaconda3\Lib\site-packages\flask\app.py", line 1799, in dispatch_request
    return self.ensure_sync(self.view_functions[rule.endpoint

In [20]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            try:
                self.load_model()
                self.df = pd.read_csv(DATA_FILE)  # Ensure self.df exists after loading
            except Exception as e:
                print(f"Error loading model: {e}. Retraining model...")
                self.train_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)  # Save dataset for later use
        self.feature_columns = list(pd.get_dummies(self.df[['amount', 'currency', 'vendor_country', 'payment_method']]).columns)  # Store features
        
        # Prepare data for training
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        X = pd.get_dummies(self.df[features]).reindex(columns=self.feature_columns, fill_value=0)
        y = self.df['category'].apply(lambda x: 1 if x not in ALLOWED_CATEGORIES else 0)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        self.model = GradientBoostingClassifier(n_estimators=100, random_state=42)
        self.model.fit(X_train, y_train)
        
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.model, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.model = pickle.load(f)
        self.df = pd.read_csv(DATA_FILE)  # Ensure self.df exists after loading
    
    def predict_all(self, expense_df):
        features = ['amount', 'currency', 'vendor_country', 'payment_method']
        expense_df = pd.get_dummies(expense_df[features])
        
        if not hasattr(self, "feature_columns"):
            self.feature_columns = list(pd.get_dummies(self.df[['amount', 'currency', 'vendor_country', 'payment_method']]).columns)
        
        model_features = self.feature_columns
        missing_cols = set(model_features) - set(expense_df.columns)
        for col in missing_cols:
            expense_df[col] = 0
        
        expense_df = expense_df.reindex(columns=model_features, fill_value=0)
        
        predictions = {}
        probabilities = {}
        for model_name in ["GradientBoosting Model"]:
            pred = self.model.predict(expense_df)
            prob = self.model.predict_proba(expense_df)
            predictions[model_name] = pred.tolist()
            probabilities[model_name] = prob.tolist()
        return predictions, probabilities
    
    def check_violation(self, expense):
        category = expense.get('category')
        amount = expense.get('amount', 0)
        description = expense.get('description', "").strip()
        
        violations = []
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
            violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
        if category not in ALLOWED_CATEGORIES:
            violations.append(f"Unauthorized Category: {category} is not allowed.")
        if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and description.lower() == "no description":
            violations.append("Missing Justification: High amount claimed but justification is missing.")
        
        return {
            "expense_id": expense.get('expense_id', "N/A"),
            "employee_id": expense.get('employee_id', "N/A"),
            "category": category,
            "amount": amount,
            "violations": violations if violations else "No Violations"
        }

class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', 'no description'], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    df_new = pd.DataFrame([data])
    predictions, probabilities = expense_model.predict_all(df_new)
    compliance_flags = expense_model.check_violation(data)
    response = {
        "expense_validation": {
            "ml_predictions": predictions,
            "probabilities": probabilities,
            "rule_based_checks": compliance_flags
        }
    }
    return jsonify(response)

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 37)

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 37)

In [4]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle

app = Flask(__name__)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.df, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.df = pickle.load(f)
    def check_violation(self, expense):
           category = expense.get("category")
           amount = expense.get("amount", 0)
          description = expense.get("description", "").strip()
    
          violations = []
            if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category]:
                  violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
            if category not in ALLOWED_CATEGORIES:
                   violations.append(f"Unauthorized Category: {category} is not allowed.")
            if category in ALLOWED_BUDGETS and amount > ALLOWED_BUDGETS[category] and description.lower() == "no description":
                 violations.append("Missing Justification: High amount claimed but justification is missing.")

    # Create the validation result
           violation_data = {
                    "expense_id": expense.get("expense_id", "N/A"),
                   "employee_id": expense.get("employee_id", "N/A"),
                    "category": category,
                     "amount": amount,
                    "violations": violations if violations else "No Violations",
                        }

    # Append the validation data to the original expense dictionary
                   expense.update(violation_data)  # This ensures it is added to the same dictionary

              return violation_data  # You can still return it for debugging or logging purposes


class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', 'no description'], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        amt = row['amount']
        description = row['description'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if cat not in ALLOWED_CATEGORIES:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and description.lower() == "no description":
            return 1
        return 0

expense_model = ExpenseModel()

@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    data = request.json
    return jsonify(expense_model.check_violation(data))

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)


IndentationError: unindent does not match any outer indentation level (<tokenize>, line 37)

In [None]:
from flask import Flask, request, jsonify, send_file
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import pickle
import openai
from threading import Thread
from loguru import logger
import time



app = Flask(__name__)


API_KEY = "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"
ASSISTANT_ID = "asst_EpFa1gBPouBslsTcGOsLBEGE"
client = openai.OpenAI(api_key=API_KEY)

# Define allowed budgets and categories
ALLOWED_BUDGETS = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
ALLOWED_CATEGORIES = ["Travel", "Meals", "Supplies", "Office Supplies", "Technology", "Training", "Entertainment"]

MODEL_FILE = "expense_model.pkl"
DATA_FILE = "synthetic_expense_data.csv"

class ExpenseModel:
    def __init__(self):
        if os.path.exists(MODEL_FILE):
            self.load_model()
        else:
            self.train_model()
    
    def train_model(self):
        generator = SyntheticDataGenerator()
        self.df = generator.generate_data(n_samples=1000)
        self.df.to_csv(DATA_FILE, index=False)
        with open(MODEL_FILE, 'wb') as f:
            pickle.dump(self.df, f)
    
    def load_model(self):
        with open(MODEL_FILE, 'rb') as f:
            self.df = pickle.load(f)
    def check_violation(self, expense):
           category = expense.get("category")
           amount = expense.get("amount", 0)
           description = expense.get("description", "").strip()
    
           violations = []
           if category in ALLOWED_BUDGETS and float(amount) > float(ALLOWED_BUDGETS[category]):
                  violations.append(f"Over Budget: {amount} INR exceeds allowed budget of {ALLOWED_BUDGETS[category]} INR.")
           if category not in ALLOWED_CATEGORIES:
                   violations.append(f"Unauthorized Category: {category} is not allowed.")
           if category in ALLOWED_BUDGETS and float(amount) > float(ALLOWED_BUDGETS[category]) and description.lower() == "no description":
                 violations.append("Missing Justification: High amount claimed but justification is missing.")

    # Create the validation result
           violation_data = {
                    "expense_id": expense.get("expense_id", "N/A"),
                   "employee_id": expense.get("employee_id", "N/A"),
                    "category": category,
                     "amount": amount,
                    "violations": violations if violations else "No Violations",
                        }

    # Append the validation data to the original expense dictionary
           expense.update(violation_data)  # This ensures it is added to the same dictionary

           return expense  # You can still return it for debugging or logging purposes


class SyntheticDataGenerator:
    def __init__(self):
        self.categories = list(ALLOWED_BUDGETS.keys())
        self.currencies = ['INR']
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
    
    def generate_data(self, n_samples=1000, random_state=42):
        np.random.seed(random_state)
        data = {
            'expense_id': np.arange(1, n_samples + 1),
            'employee_id': np.random.randint(1000, 2000, n_samples),
            'amount': np.random.uniform(100, 10000, n_samples).round(2),
            'category': np.random.choice(self.categories, n_samples),
            'currency': np.random.choice(self.currencies, n_samples),
            'vendor_country': np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1]),
            'payment_method': np.random.choice(self.payment_methods, n_samples),
            'expense_date': [(datetime.today() - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'submission_date': [(datetime.today() - timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d') for _ in range(n_samples)],
            'description': np.random.choice(['Justified', 'no description'], n_samples, p=[0.7, 0.3]),
        }
        
        df = pd.DataFrame(data)
        df['is_violation'] = df.apply(self.rule_based_violation_check, axis=1)
        return df
    
    def rule_based_violation_check(self, row):
        cat = row['category']
        amt = row['amount']
        description = row['description'].strip()
        if amt > ALLOWED_BUDGETS.get(cat, 0) * 1.5:
            return 1
        if cat not in ALLOWED_CATEGORIES:
            return 1
        if amt > ALLOWED_BUDGETS.get(cat, 0) and description.lower() == "no description":
            return 1
        return 0

expense_model = ExpenseModel()



def validate_invoice(json_data):
    try:
        # Input validation
        if not json_data:
            logger.error("❌ No invoice data provided")
            return None, "No invoice data provided"

        # Create a new thread with the OpenAI Assistant
        try:
            thread = client.beta.threads.create()
            logger.info(f"✅ Created thread: {thread.id}")
        except Exception as e:
            logger.error(f"❌ Failed to create thread: {str(e)}")
            return None, "Failed to create OpenAI thread"

        # Send the invoice data to the assistant
        try:
            message = client.beta.threads.messages.create(
                thread_id=thread.id,
                role="user",
                content=str(json_data)  # Convert to string to ensure safe transmission
            )
            logger.info("✅ Sent message to thread")
        except Exception as e:
            logger.error(f"❌ Failed to send message: {str(e)}")
            return None, "Failed to send message to OpenAI"

        # Run the assistant on this thread
        try:
            run = client.beta.threads.runs.create(
                thread_id=thread.id,
                assistant_id=ASSISTANT_ID
            )
            logger.info(f"✅ Started run: {run.id}")
        except Exception as e:
            logger.error(f"❌ Failed to start run: {str(e)}")
            return None, "Failed to start OpenAI assistant"

        # Wait for the assistant to process the request with timeout
        max_retries = 30  # 30 seconds timeout
        retry_count = 0
        
        while retry_count < max_retries:
            try:
                run_status = client.beta.threads.runs.retrieve(
                    thread_id=thread.id,
                    run_id=run.id
                )
                if run_status.status == "completed":
                    break
                elif run_status.status == "failed":
                    logger.error("❌ Assistant run failed")
                    return None, "OpenAI assistant run failed"
                elif run_status.status == "expired":
                    logger.error("❌ Assistant run expired")
                    return None, "OpenAI assistant run expired"
                
                retry_count += 1
                time.sleep(1)  # Wait 1 second before checking again
                
            except Exception as e:
                logger.error(f"❌ Failed to check run status: {str(e)}")
                return None, "Failed to check OpenAI assistant status"

        if retry_count >= max_retries:
            logger.error("❌ Assistant run timed out")
            return None, "OpenAI assistant timed out"

        # Retrieve the final message
        try:
            messages = client.beta.threads.messages.list(thread_id=thread.id)
            if not messages.data:
                logger.error("❌ No response from assistant")
                return None, "No response from OpenAI assistant"
                
            reply = messages.data[0].content[0].text.value
            logger.info("✅ Received validation response")
            
            # Update the original data with validation result
            json_data["validation_result"] = reply
            
            return json_data, None  # Return (data, error)
            
        except Exception as e:
            logger.error(f"❌ Failed to retrieve assistant response: {str(e)}")
            return None, "Failed to get OpenAI assistant response"

    except Exception as e:
        logger.error(f"❌ Unexpected error in validate_invoice: {str(e)}")
        return None, f"Unexpected error: {str(e)}"


@app.route('/generate-data', methods=['GET'])
def generate_data():
    return send_file(DATA_FILE, as_attachment=True)

@app.route('/check-expense', methods=['POST'])
def check_expense():
    try:
        data = request.json
        if not data:
            return jsonify({"error": "No data provided"}), 400

        # Check violations
        model_data = expense_model.check_violation(data)
        if not model_data:
            return jsonify({"error": "Violation check failed"}), 400

        # Validate invoice
        validated_data, error = validate_invoice(model_data)
        if error:
            return jsonify({"error": error}), 400

        if not validated_data:
            return jsonify({"error": "Validation failed"}), 400

        # Return successful response
        return jsonify(validated_data), 200

    except Exception as e:
        logger.error(f"❌ Error in check_expense: {str(e)}")
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    app.run(host="0.0.0.0", port=4000, debug=False)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:4000
 * Running on http://10.120.97.67:4000
Press CTRL+C to quit
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads "HTTP/1.1 200 OK"
[32m2025-02-09 07:58:27.443[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_invoice[0m:[36m124[0m - [1m✅ Created thread: thread_uf7RO5NrvSqG9dTMs6Ce70L1[0m
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_uf7RO5NrvSqG9dTMs6Ce70L1/messages "HTTP/1.1 200 OK"
[32m2025-02-09 07:58:27.852[0m | [1mINFO    [0m | [36m__main__[0m:[36mvalidate_invoice[0m:[36m136[0m - [1m✅ Sent message to thread[0m
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/threads/thread_uf7RO5NrvSqG9dTMs6Ce70L1/runs "HTTP/1.1 200 OK"
[32m2025-02-09 07:58:28.950[0m | [1mINFO    [0m | [36m__main__

In [46]:
!pip install loguru

Collecting loguru
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting win32-setctime>=1.0.0 (from loguru)
  Downloading win32_setctime-1.2.0-py3-none-any.whl.metadata (2.4 kB)
Downloading loguru-0.7.3-py3-none-any.whl (61 kB)
   ---------------------------------------- 0.0/61.6 kB ? eta -:--:--
   ---------------------------------------- 61.6/61.6 kB 1.7 MB/s eta 0:00:00
Downloading win32_setctime-1.2.0-py3-none-any.whl (4.1 kB)
Installing collected packages: win32-setctime, loguru
Successfully installed loguru-0.7.3 win32-setctime-1.2.0


In [None]:
from flask import Flask, request, jsonify
import openai
import os
from threading import Thread

# Flask app initialization
app = Flask(__name__)

# OpenAI API Setup (Hardcoded API Key and Assistant ID)
API_KEY = "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"
ASSISTANT_ID = "asst_EpFa1gBPouBslsTcGOsLBEGE"

client = openai.OpenAI(api_key=API_KEY)

@app.route('/validate_invoice', methods=['POST'])
def validate_invoice():
    try:
        # Get JSON data from request
        invoice_data = request.json

        if not invoice_data:
            return jsonify({"error": "No invoice data provided"}), 400

        # Create a new thread with the OpenAI Assistant
        thread = client.beta.threads.create()
        
        # Send the invoice data to the assistant
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=f"{invoice_data}"  # Sending raw JSON invoice data
        )

        # Run the assistant on this thread
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=ASSISTANT_ID
        )

        # Wait for the assistant to process the request
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id
            )
            if run_status.status == "completed":
                break

        # Retrieve the final message from the assistant
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        reply = messages.data[0].content[0].text.value  # Extract response

        return jsonify({"validation_result": reply})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Function to run Flask without conflicts in Jupyter Notebook
def run_flask():
    app.run(debug=True, host='0.0.0.0', port=5000, use_reloader=False)

# Run Flask in a separate thread (so Jupyter doesn't crash)
flask_thread = Thread(target=run_flask)
flask_thread.start()