In [1]:
# %% Cell 1: Synthetic Data Generation

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

class SyntheticDataGenerator:
    def __init__(self):
        # Define Indian context parameters:
        self.departments = ['Engineering', 'IT', 'Finance', 'HR', 'Operations', 'Sales', 'Marketing']
        self.categories = ['Travel', 'Meals', 'Supplies']
        self.currencies = ['INR']  # Only INR in this case
        self.payment_methods = ['Credit Card', 'Cash', 'Net Banking', 'UPI']
        # Rule-based parameters:
        self.allowed_budgets = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
        self.allowed_categories = {
            "Engineering": ["Travel", "Meals", "Supplies"],
            "IT": ["Travel", "Supplies"],
            "Finance": ["Travel", "Meals"],
            "HR": ["Meals"],
            "Operations": ["Travel", "Meals", "Supplies"],
            "Sales": ["Travel", "Meals"],
            "Marketing": ["Travel", "Meals", "Supplies"]
        }
    
    def generate_data(self, n_samples=200, random_state=42):
        np.random.seed(random_state)
        data = {}
        # Basic IDs
        data['expense_id'] = np.arange(1, n_samples + 1)
        data['employee_id'] = np.random.randint(1000, 2000, n_samples)
        
        # Numeric features (amount between 100 and 10000 INR, with some outliers)
        amounts = np.random.uniform(100, 10000, n_samples).round(2)
        # Introduce some outliers randomly:
        outlier_indices = np.random.choice(n_samples, size=int(0.05*n_samples), replace=False)
        amounts[outlier_indices] *= np.random.uniform(5, 20, size=len(outlier_indices))
        data['amount'] = amounts
        
        # Other numeric features
        data['receipt_quality'] = np.random.uniform(0, 1, n_samples).round(2)
        data['ocr_confidence'] = np.random.uniform(0, 1, n_samples).round(2)
        data['previous_violations'] = np.random.randint(0, 5, n_samples)
        
        # Categorical features
        data['department'] = np.random.choice(self.departments, n_samples)
        data['category'] = np.random.choice(self.categories, n_samples)
        data['currency'] = np.random.choice(self.currencies, n_samples)
        data['vendor_country'] = np.random.choice(['IN', 'US', 'UK'], n_samples, p=[0.8, 0.1, 0.1])
        data['payment_method'] = np.random.choice(self.payment_methods, n_samples)
        
        # Date features
        base_date = datetime.today()
        expense_dates = [(base_date - timedelta(days=np.random.randint(1, 30))).strftime('%Y-%m-%d')
                         for _ in range(n_samples)]
        data['expense_date'] = expense_dates
        data['submission_date'] = [(datetime.strptime(ed, '%Y-%m-%d') + timedelta(days=np.random.randint(0, 10))).strftime('%Y-%m-%d')
                                   for ed in expense_dates]
        
        # Binary features
        data['requires_approval'] = np.random.choice([0, 1], n_samples)
        data['has_receipt'] = np.random.choice([0, 1], n_samples)
        data['manual_review_required'] = np.random.choice([0, 1], n_samples)
        
        # Extra non-feature fields (for reporting but not used in ML features)
        data['vendor_id'] = np.random.randint(2000, 3000, n_samples)
        data['vendor_name'] = np.random.choice(['VendorA', 'VendorB', 'VendorC'], n_samples)
        data['notes'] = np.random.choice(['Justified', ''], n_samples, p=[0.7, 0.3])
        data['approval_status'] = np.random.choice(['Approved', 'Rejected'], n_samples)
        data['approval_date'] = [base_date.strftime('%Y-%m-%d')] * n_samples

        # Target generation (is_violation) using rule-based logic:
        is_violation = []
        for i in range(n_samples):
            violation_flag = 0
            cat = data['category'][i]
            dept = data['department'][i]
            amt = data['amount'][i]
            note = data['notes'][i]
            
            # Overbudget check: if amount exceeds allowed budget (we allow a 1.5x buffer)
            if amt > self.allowed_budgets[cat] * 1.5:
                violation_flag = 1
            # Unauthorized category check: if the expense category is not allowed for the department
            elif dept in self.allowed_categories and cat not in self.allowed_categories[dept]:
                violation_flag = 1
            # Missing justification: if amount is high (over budget) and notes are empty
            elif amt > self.allowed_budgets[cat] and note.strip() == "":
                violation_flag = 1
            is_violation.append(violation_flag)
        data['is_violation'] = is_violation
        
        df = pd.DataFrame(data)
        return df

# Generate the synthetic data and (optionally) save to CSV
generator = SyntheticDataGenerator()
df_synthetic = generator.generate_data(n_samples=10000)
print("Synthetic Data Sample:")
df_synthetic.info()
df_synthetic.to_csv('synthetic_expense_openai_data.csv', index=False)

Synthetic Data Sample:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   expense_id              10000 non-null  int32  
 1   employee_id             10000 non-null  int32  
 2   amount                  10000 non-null  float64
 3   receipt_quality         10000 non-null  float64
 4   ocr_confidence          10000 non-null  float64
 5   previous_violations     10000 non-null  int32  
 6   department              10000 non-null  object 
 7   category                10000 non-null  object 
 8   currency                10000 non-null  object 
 9   vendor_country          10000 non-null  object 
 10  payment_method          10000 non-null  object 
 11  expense_date            10000 non-null  object 
 12  submission_date         10000 non-null  object 
 13  requires_approval       10000 non-null  int32  
 14  has_receipt     

In [2]:
df_synthetic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   expense_id              10000 non-null  int32  
 1   employee_id             10000 non-null  int32  
 2   amount                  10000 non-null  float64
 3   receipt_quality         10000 non-null  float64
 4   ocr_confidence          10000 non-null  float64
 5   previous_violations     10000 non-null  int32  
 6   department              10000 non-null  object 
 7   category                10000 non-null  object 
 8   currency                10000 non-null  object 
 9   vendor_country          10000 non-null  object 
 10  payment_method          10000 non-null  object 
 11  expense_date            10000 non-null  object 
 12  submission_date         10000 non-null  object 
 13  requires_approval       10000 non-null  int32  
 14  has_receipt             10000 non-null 

In [3]:
# %% Cell 2: Model Training

import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin
import joblib

# Custom Transformer for Date Features (as before)
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X = X.copy()
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5,6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        return X.drop(['submission_date', 'expense_date'], axis=1)

# Expense Compliance Model class: trains several models
class ExpenseComplianceModel:
    def __init__(self):
        # We'll use only the columns that our new expense sample uses:
        self.numeric_features = ['amount', 'receipt_quality', 'ocr_confidence', 'previous_violations']
        self.categorical_features = ['department', 'category', 'currency', 'vendor_country', 'payment_method']
        self.date_features = ['expense_date', 'submission_date']
        self.binary_features = ['requires_approval', 'has_receipt', 'manual_review_required']
        self.models = {}
        self.preprocessor = None
        self.feature_importances_ = {}
    
    def prepare_data(self, df):
        df = df.copy()
        # Only keep the required columns (plus target 'is_violation' if exists)
        required = (self.numeric_features + self.categorical_features +
                    self.date_features + self.binary_features)
        if 'is_violation' in df.columns:
            required.append('is_violation')
        df = df[ [col for col in required if col in df.columns] ]
        # Create the preprocessor pipeline
        self.preprocessor = ColumnTransformer(transformers=[
            ('num', StandardScaler(), self.numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_features),
            ('date', DateFeatureExtractor(), self.date_features),
            ('bin', 'passthrough', self.binary_features)
        ])
        return df
    
    def train_models(self, df, target_column='is_violation'):
        df = self.prepare_data(df)
        X = df.drop(columns=[target_column])
        y = df[target_column]
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=0.2,
                                                            random_state=42,
                                                            stratify=y)
        # Define pipelines for different models
        rf_pipe = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10,
                                                  class_weight='balanced', random_state=42))
        ])
        gb_pipe = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                                      max_depth=5, random_state=42))
        ])
        # XGBoost pipeline
        xgb_pipe = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', xgb.XGBClassifier(n_estimators=100, learning_rate=0.1,
                                             max_depth=5, use_label_encoder=False,
                                             eval_metric='logloss', random_state=42))
        ])
        # Logistic Regression pipeline
        lr_pipe = Pipeline([
            ('preprocessor', self.preprocessor),
            ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42))
        ])
        
        self.models['RandomForest'] = rf_pipe
        self.models['GradientBoosting'] = gb_pipe
        self.models['XGBoost'] = xgb_pipe
        self.models['LogisticRegression'] = lr_pipe
        
        # Train each model and print results:
        for name, model in self.models.items():
            print(f"\nTraining {name} ...")
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            print(f"\n{name} Classification Report:")
            print(classification_report(y_test, y_pred))
            print("Confusion Matrix:")
            print(confusion_matrix(y_test, y_pred))
            
            # Save feature importances if available (only for tree-based models)
            classifier = model.named_steps['classifier']
            if hasattr(classifier, 'feature_importances_'):
                # Get one-hot encoded names
                cat_names = []
                if self.categorical_features:
                    cat_names = model.named_steps['preprocessor'].named_transformers_['cat'] \
                                .get_feature_names_out(self.categorical_features).tolist()
                feature_names = (self.numeric_features +
                                 cat_names +
                                 ['submission_delay', 'day_of_week', 'is_weekend', 'month', 'quarter', 'is_month_end'] +
                                 self.binary_features)
                self.feature_importances_[name] = pd.DataFrame({
                    'feature': feature_names,
                    'importance': classifier.feature_importances_
                }).sort_values('importance', ascending=False)
        return self.models
    
    def save_models(self, directory='models/'):
        import os
        if not os.path.exists(directory):
            os.makedirs(directory)
        for name, model in self.models.items():
            filepath = f"{directory}{name}_model.joblib"
            joblib.dump(model, filepath)
            print(f"Saved {name} model to {filepath}")
    
    def load_models(self, directory='models/'):
        import os
        for name in ['RandomForest', 'GradientBoosting', 'XGBoost', 'LogisticRegression']:
            filepath = f"{directory}{name}_model.joblib"
            self.models[name] = joblib.load(filepath)
            print(f"Loaded {name} model from {filepath}")
        return self
    
    def predict_all(self, df):
        # Expects df with the required columns
        df_prepared = self.prepare_data(df)
        columns_to_drop = []  # Already filtered in prepare_data
        predictions = {}
        probabilities = {}
        for name, model in self.models.items():
            preds = model.predict(df_prepared)
            probs = model.predict_proba(df_prepared)
            predictions[name] = preds
            probabilities[name] = probs
        return predictions, probabilities

# Instantiate and train models on the synthetic data (using only the desired columns)
ecm = ExpenseComplianceModel()
models = ecm.train_models(df_synthetic)
# Optionally save the models:
ecm.save_models()



Training RandomForest ...

RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1012
           1       0.99      0.95      0.97       988

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

Confusion Matrix:
[[1002   10]
 [  50  938]]

Training GradientBoosting ...

GradientBoosting Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1012
           1       1.00      0.95      0.97       988

    accuracy                           0.97      2000
   macro avg       0.98      0.97      0.97      2000
weighted avg       0.98      0.97      0.97      2000

Confusion Matrix:
[[1012    0]
 [  52  936]]

Training XGBoost ...


Parameters: { "use_label_encoder" } are not used.




XGBoost Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1012
           1       1.00      0.95      0.97       988

    accuracy                           0.97      2000
   macro avg       0.97      0.97      0.97      2000
weighted avg       0.97      0.97      0.97      2000

Confusion Matrix:
[[1008    4]
 [  52  936]]

Training LogisticRegression ...

LogisticRegression Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.85      0.85      1012
           1       0.85      0.85      0.85       988

    accuracy                           0.85      2000
   macro avg       0.85      0.85      0.85      2000
weighted avg       0.85      0.85      0.85      2000

Confusion Matrix:
[[860 152]
 [145 843]]
Saved RandomForest model to models/RandomForest_model.joblib
Saved GradientBoosting model to models/GradientBoosting_model.joblib
Saved XGBoost model to

In [4]:
# %% Cell 3: New Expense Prediction and Compliance Audit

from datetime import datetime, timedelta

# Define rule-based compliance check function
def compliance_check(expense, allowed_budgets, allowed_categories, training_stats=None):
    """
    Check a single expense (dict) for:
      - Over-budget claims (if amount > allowed budget for the expense category)
      - Unauthorized category (if category is not allowed for the department)
      - Missing justification (if amount > allowed budget and notes is empty)
      - Outlier: if amount is beyond a threshold (if training_stats provided)
    Returns a dictionary of flags.
    """
    flags = {}
    cat = expense.get('category', None)
    dept = expense.get('department', None)
    amt = expense.get('amount', 0)
    notes = expense.get('notes', "").strip()  # may be empty
    # Rule 1: Over-budget
    if cat in allowed_budgets and amt > allowed_budgets[cat]:
        flags['Over Budget'] = f"Claimed amount {amt} INR exceeds allowed budget {allowed_budgets[cat]} INR for {cat}."
    # Rule 2: Unauthorized Category
    if dept in allowed_categories and cat not in allowed_categories[dept]:
        flags['Unauthorized Category'] = f"Category {cat} is not allowed for department {dept}."
    # Rule 3: Missing Justification
    if cat in allowed_budgets and amt > allowed_budgets[cat] and notes == "":
        flags['Missing Justification'] = "High amount claimed but justification (notes) is missing."
    # Rule 4: Outlier detection (if training_stats provided: median and std for amount)
    if training_stats is not None:
        median_amt = training_stats.get('median', 0)
        std_amt = training_stats.get('std', 0)
        threshold = median_amt + 3 * std_amt
        if amt > threshold:
            flags['Outlier'] = f"Claimed amount {amt} INR is unusually high compared to median {median_amt} INR."
    return flags

# Suppose we use the same allowed budgets and categories as in our synthetic generator:
allowed_budgets = {"Travel": 10000, "Meals": 3000, "Supplies": 5000}
allowed_categories = {
    "Engineering": ["Travel", "Meals", "Supplies"],
    "IT": ["Travel", "Supplies"],
    "Finance": ["Travel", "Meals"],
    "HR": ["Meals"],
    "Operations": ["Travel", "Meals", "Supplies"],
    "Sales": ["Travel", "Meals"],
    "Marketing": ["Travel", "Meals", "Supplies"]
}

# (Optionally) compute training stats for the 'amount' feature to detect outliers.
training_stats = {
    'median': df_synthetic['amount'].median(),
    'std': df_synthetic['amount'].std()
}

# Define the new expense row (using only the specified columns)
new_expense = {
    'expense_id': 201,
    'employee_id': 1500,
    'amount': 7500000000.00,  # Extremely high amount to trigger over-budget/outlier flag
    'receipt_quality': 0.65,
    'ocr_confidence': 0.90,
    'previous_violations': 1,
    'department': 'Engineering',
    'category': 'Travel',
    'currency': 'INR',
    'vendor_country': 'US',  # Vendor country may be suspicious
    'payment_method': 'Credit Card',
    'expense_date': (datetime.today() - timedelta(days=5)).strftime('%Y-%m-%d'),
    'submission_date': datetime.today().strftime('%Y-%m-%d'),
    'requires_approval': 1,
    'has_receipt': 1,
    'manual_review_required': 0,
    # Extra non-feature field (used only in rule-based check)
    'notes': ''  # missing justification
}

# For prediction via ML, we need to have a DataFrame with the features used in training.
# Note: We ignore extra fields like vendor_id, vendor_name, project_code, etc.
import pandas as pd
df_new = pd.DataFrame([new_expense])

# Load the trained models if not already in memory (here we assume they are in ecm)
# If running in a fresh kernel, uncomment the next two lines:
# ecm = ExpenseComplianceModel()
# ecm.load_models()

# Get predictions from all models:
predictions, probabilities = ecm.predict_all(df_new)
print("Predictions on new expense (per model):")
for model_name in predictions:
    pred = predictions[model_name][0]
    prob = max(probabilities[model_name][0]) * 100
    label = "Violation" if pred == 1 else "Normal"
    print(f"  {model_name}: {label} (Confidence: {prob:.2f}%)")

# Run rule-based compliance check:
compliance_flags = compliance_check(new_expense, allowed_budgets, allowed_categories, training_stats)

# Prepare a compliance report:
report = "\nPolicy Compliance Report\n"
report += f"* Expense ID: {new_expense.get('expense_id', 'N/A')}\n"
report += f"* Employee ID: {new_expense.get('employee_id', 'N/A')} | Department: {new_expense.get('department', 'N/A')}\n"
report += f"* Expense Category & Amount: {new_expense.get('category', 'N/A')} - {new_expense.get('amount', 'N/A')} INR\n"
if compliance_flags:
    report += "* Detected Violations:\n"
    for key, message in compliance_flags.items():
        report += f"   - {key}: {message}\n"
    report += "* Suggested Actions:\n"
    report += "   - Please review the expense policy, provide necessary justification and/or seek manager review.\n"
else:
    report += "* No Violations Detected.\n"
report += "\nCompliance check complete."

print(report)

Predictions on new expense (per model):
  RandomForest: Violation (Confidence: 64.17%)
  GradientBoosting: Violation (Confidence: 99.67%)
  XGBoost: Violation (Confidence: 99.44%)
  LogisticRegression: Violation (Confidence: 100.00%)

Policy Compliance Report
* Expense ID: 201
* Employee ID: 1500 | Department: Engineering
* Expense Category & Amount: Travel - 7500000000.0 INR
* Detected Violations:
   - Over Budget: Claimed amount 7500000000.0 INR exceeds allowed budget 10000 INR for Travel.
   - Missing Justification: High amount claimed but justification (notes) is missing.
   - Outlier: Claimed amount 7500000000.0 INR is unusually high compared to median 5292.245 INR.
* Suggested Actions:
   - Please review the expense policy, provide necessary justification and/or seek manager review.

Compliance check complete.


In [7]:
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from typing import Optional
import joblib
import pandas as pd
from datetime import datetime
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Custom Transformer for Date Features
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        X['submission_date'] = pd.to_datetime(X['submission_date'])
        X['expense_date'] = pd.to_datetime(X['expense_date'])
        X['submission_delay'] = (X['submission_date'] - X['expense_date']).dt.days
        X['day_of_week'] = X['expense_date'].dt.dayofweek
        X['is_weekend'] = X['day_of_week'].isin([5,6]).astype(int)
        X['month'] = X['expense_date'].dt.month
        X['quarter'] = X['expense_date'].dt.quarter
        X['is_month_end'] = X['expense_date'].dt.is_month_end.astype(int)
        return X.drop(['submission_date', 'expense_date'], axis=1)

# Input model for expense data
class ExpenseInput(BaseModel):
    amount: float
    receipt_quality: float
    ocr_confidence: float
    previous_violations: int
    department: str
    category: str
    currency: str
    vendor_country: str
    payment_method: str
    expense_date: str
    submission_date: str
    requires_approval: int
    has_receipt: int
    manual_review_required: int

app = FastAPI()

# Load the model at startup
model = None

@app.on_event("startup")
async def load_model():
    global model
    try:
        # Load the Random Forest model (or whichever model you prefer)
        model = joblib.load('models/RandomForest_model.joblib')
    except Exception as e:
        print(f"Error loading model: {e}")
        model = None

@app.post("/predict")
async def predict_expense(expense: ExpenseInput):
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")
    
    try:
        # Convert input to DataFrame
        df = pd.DataFrame([expense.dict()])
        
        # Get prediction and probability
        prediction = model.predict(df)[0]
        probabilities = model.predict_proba(df)[0]
        
        # Prepare response
        result = {
            "prediction": "Violation" if prediction == 1 else "Normal",
            "confidence": float(max(probabilities) * 100),
            "violation_probability": float(probabilities[1] * 100),
            "normal_probability": float(probabilities[0] * 100)
        }
        
        return result
    
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Prediction error: {str(e)}")

# Create a separate file named 'run.py' with this content:


        on_event is deprecated, use lifespan event handlers instead.

        Read more about it in the
        [FastAPI docs for Lifespan Events](https://fastapi.tiangolo.com/advanced/events/).
        
  @app.on_event("startup")


In [6]:
def start_server():
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)

if __name__ == "__main__":
    start_server()

RuntimeError: asyncio.run() cannot be called from a running event loop

In [15]:
!pip install openai

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: C:\Users\Bhavi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [16]:
from flask import Flask, request, jsonify
import openai

# Flask app initialization
app = Flask(__name__)

# OpenAI API Key and Assistant ID
OPENAI_KEY = "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"
ASSISTANT_ID = "asst_EpFa1gBPouBslsTcGOsLBEGE"

# OpenAI API Setup
openai.api_key = OPENAI_KEY

@app.route('/validate_invoice', methods=['POST'])
def validate_invoice():
    try:
        # Get JSON data from request
        invoice_data = request.json

        # OpenAI API call
        response = openai.ChatCompletion.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are an expert in invoice validation."},
                {"role": "user", "content": f"Validate the following invoice and return if it's valid:\n{invoice_data}"}
            ]
        )

        # Extract and return response
        reply = response['choices'][0]['message']['content']
        return jsonify({"validation_result": reply})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True)


ImportError: cannot import name '_cv_request' from 'flask.globals' (C:\Users\Bhavi\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\flask\globals.py)

In [12]:
!pip install flask openai

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: C:\Users\Bhavi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [13]:
!pip install --upgrade flask werkzeug

Defaulting to user installation because normal site-packages is not writeable
Collecting flask
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Using cached flask-3.1.0-py3-none-any.whl (102 kB)
Installing collected packages: flask
  Attempting uninstall: flask
    Found existing installation: Flask 2.1.3
    Uninstalling Flask-2.1.3:
      Successfully uninstalled Flask-2.1.3
Successfully installed flask-3.1.0



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: C:\Users\Bhavi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [8]:
from flask import Flask, request, jsonify
import os
from openai import OpenAI  # Ensure OpenAI is correctly imported

# Flask app initialization
app = Flask(__name__)

# OpenAI API Setup
client = OpenAI(api_key="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")

@app.route('/validate_invoice', methods=['POST'])
def validate_invoice():
    try:
        # Get JSON data from request
        invoice_data = request.json

        if not invoice_data:
            return jsonify({"error": "No invoice data provided"}), 400

        # OpenAI API call
        response = client.chat.completions.create(
            model="gpt-4-turbo-preview",
            messages=[
                {"role": "system", "content": "You are an expert in invoice validation."},
                {"role": "user", "content": f"Validate the following invoice and return if it's valid:\n{invoice_data}"}
            ]
        )

        # Extract and return response
        reply = response.choices[0].message.content
        return jsonify({"validation_result": reply})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port=5000)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.120.111.159:5000
Press CTRL+C to quit
 * Restarting with stat


SystemExit: 1

In [6]:
!export OPENAI_KEY = "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"

'export' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
from flask import Flask, request, jsonify
import openai
import os
from threading import Thread

# Flask app initialization
app = Flask(__name__)

# OpenAI API Setup (Hardcoded API Key and Assistant ID)
API_KEY = "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"
ASSISTANT_ID = "asst_EpFa1gBPouBslsTcGOsLBEGE"

client = openai.OpenAI(api_key=API_KEY)

@app.route('/validate_invoice', methods=['POST'])
def validate_invoice():
    try:
        # Get JSON data from request
        invoice_data = request.json

        if not invoice_data:
            return jsonify({"error": "No invoice data provided"}), 400

        # Create a new thread with the OpenAI Assistant
        thread = client.beta.threads.create()
        
        # Send the invoice data to the assistant
        client.beta.threads.messages.create(
            thread_id=thread.id,
            role="user",
            content=f"{invoice_data}"  # Sending raw JSON invoice data
        )

        # Run the assistant on this thread
        run = client.beta.threads.runs.create(
            thread_id=thread.id,
            assistant_id=ASSISTANT_ID
        )

        # Wait for the assistant to process the request
        while True:
            run_status = client.beta.threads.runs.retrieve(
                thread_id=thread.id,
                run_id=run.id
            )
            if run_status.status == "completed":
                break

        # Retrieve the final message from the assistant
        messages = client.beta.threads.messages.list(thread_id=thread.id)
        reply = messages.data[0].content[0].text.value  # Extract response

        return jsonify({"validation_result": reply})

    except Exception as e:
        return jsonify({"error": str(e)}), 500

# Function to run Flask without conflicts in Jupyter Notebook
def run_flask():
    app.run(debug=True, host='0.0.0.0', port=5000, use_reloader=False)

# Run Flask in a separate thread (so Jupyter doesn't crash)
flask_thread = Thread(target=run_flask)
flask_thread.start()


 * Serving Flask app '__main__'


 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://10.120.111.159:5000
Press CTRL+C to quit
127.0.0.1 - - [08/Feb/2025 23:57:20] "POST /validate_invoice HTTP/1.1" 200 -
127.0.0.1 - - [08/Feb/2025 23:59:44] "POST /validate_invoice HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 00:01:13] "POST /validate_invoice HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 00:04:25] "POST /validate_invoice HTTP/1.1" 200 -
127.0.0.1 - - [09/Feb/2025 00:04:42] "POST /validate_invoice HTTP/1.1" 200 -


In [4]:
!pip install openai

Collecting openai
  Using cached openai-1.61.1-py3-none-any.whl.metadata (27 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Using cached anyio-4.8.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Using cached httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp311-cp311-win_amd64.whl.metadata (5.3 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting sniffio (from openai)
  Using cached sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting idna>=2.8 (from anyio<5,>=3.5.0->openai)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting certifi (from httpx<1,>=0.23.0->openai)
  Using cached certifi-2025.1.31-p

In [2]:
!pip install flask

Collecting flask
  Using cached flask-3.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting Werkzeug>=3.1 (from flask)
  Using cached werkzeug-3.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting Jinja2>=3.1.2 (from flask)
  Using cached jinja2-3.1.5-py3-none-any.whl.metadata (2.6 kB)
Collecting itsdangerous>=2.2 (from flask)
  Using cached itsdangerous-2.2.0-py3-none-any.whl.metadata (1.9 kB)
Collecting click>=8.1.3 (from flask)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting blinker>=1.9 (from flask)
  Using cached blinker-1.9.0-py3-none-any.whl.metadata (1.6 kB)
Collecting MarkupSafe>=2.0 (from Jinja2>=3.1.2->flask)
  Downloading MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl.metadata (4.1 kB)
Using cached flask-3.1.0-py3-none-any.whl (102 kB)
Using cached blinker-1.9.0-py3-none-any.whl (8.5 kB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached itsdangerous-2.2.0-py3-none-any.whl (16 kB)
Using cached jinja2-3.1.5-py3-none-any.whl (134 kB)
Using cached 

In [22]:
!pip install --upgrade openai

Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Using cached openai-1.61.1-py3-none-any.whl.metadata (27 kB)
Using cached openai-1.61.1-py3-none-any.whl (463 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.52.0
    Uninstalling openai-1.52.0:
      Successfully uninstalled openai-1.52.0
Successfully installed openai-1.61.1



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: C:\Users\Bhavi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [19]:
!pip install "flask<2.3"

Defaulting to user installation because normal site-packages is not writeable
Collecting flask<2.3
  Downloading Flask-2.2.5-py3-none-any.whl.metadata (3.9 kB)
Downloading Flask-2.2.5-py3-none-any.whl (101 kB)
Installing collected packages: flask
  Attempting uninstall: flask
    Found existing installation: Flask 3.1.0
    Uninstalling Flask-3.1.0:
      Successfully uninstalled Flask-3.1.0
Successfully installed flask-2.2.5



[notice] A new release of pip is available: 24.3.1 -> 25.0
[notice] To update, run: C:\Users\Bhavi\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
