# Data Wrangling Project



In [None]:
import pandas as pd

# Load the four source CSV files into DataFrames.
# The paths are listed in the same order as the names they are unpacked into.
csv_paths = (
    "synthetic_invoices.csv",
    "synthetic_vials.csv",
    "synthetic_dispense_log.csv",
    "synthetic_claims.csv",
)
try:
    invoices_df, vials_df, dispense_log_df, claims_df = [
        pd.read_csv(csv_path) for csv_path in csv_paths
    ]
except FileNotFoundError as e:
    # Give the reader a hint, then re-raise so later cells do not run
    # against data that was never loaded.
    print(f"Error loading data: {e}. Please ensure all CSV files are in the correct directory.")
    raise


## Data Collection



In [None]:
# Clean 'Purchase Price' in vials_df: strip currency formatting and convert
# the column to float.
# The pattern is a raw string so the regex engine receives it unchanged; the
# previous '\$' literal was an invalid escape sequence, which newer Python
# versions report with a warning. The character class also removes thousands
# separators, so a value such as "$1,250.00" converts instead of raising.
vials_df['Purchase Price'] = (
    vials_df['Purchase Price']
    .replace({r'[$,]': ''}, regex=True)
    .astype(float)
)

# Merge datasets into one wide table, starting from the vials:
#   vials -> invoices      on "Invoice Number"
#         -> dispense log  on "Vial Number"
#         -> claims        on "Dispense ID"
# Every join is a left join, so each vial row is kept; columns coming from a
# table with no matching row are left as NaN.
vial_invoice_df = pd.merge(vials_df, invoices_df, on="Invoice Number", how="left")
full_df = pd.merge(vial_invoice_df, dispense_log_df, on="Vial Number", how="left")
full_df = pd.merge(full_df, claims_df, on="Dispense ID", how="left")


## Data Organization



In [None]:
# Analyze and report
print("--- Ophthalmology Drug Tracking Analysis ---")

# 1. Identify Unscanned Vials
# The flag is compared with True instead of being used directly as a mask:
# rows where 'Unscanned' is missing compare as False and are simply excluded.
unscanned_report_columns = [
    'Vial Number',
    'Lot Number',
    'Purchase Price',
    'Dispense ID',
    'Patient ID',
    'Date Of Dispense',
    'Username',
]
is_unscanned = full_df['Unscanned'].eq(True)
unscanned_vials = full_df[is_unscanned]
print(f"\n## Unscanned Vials Report ({len(unscanned_vials)} found)")
if unscanned_vials.empty:
    print("No unscanned vials found.")
else:
    print("The following vials were dispensed but not scanned, leading to potential revenue loss:")
    print(unscanned_vials[unscanned_report_columns])

# 2. Identify Denied Claims
# Rows whose 'Denied' flag is missing compare as False against True, so only
# rows explicitly marked as denied are listed.
denied_report_columns = [
    'Claim ID',
    'Dispense ID',
    'Vial Number',
    'Patient ID',
    'Claim Date',
    'Amount',
]
is_denied = full_df['Denied'].eq(True)
denied_claims = full_df[is_denied]
print(f"\n## Denied Claims Report ({len(denied_claims)} found)")
if denied_claims.empty:
    print("No denied claims found.")
else:
    print("The following claims were denied by insurance:")
    print(denied_claims[denied_report_columns])

# 3. Profitability Analysis
# A row counts as reimbursed when its claim is explicitly not denied and it
# has a claim amount. Rows with a missing 'Denied' flag are excluded.
is_reimbursed = (full_df['Denied'] == False) & (full_df['Amount'].notna())

# Take an explicit copy before adding a column: assigning 'Profit' onto a
# filtered slice of full_df triggers pandas' SettingWithCopyWarning.
reimbursed_claims = full_df[is_reimbursed].copy()
reimbursed_claims['Profit'] = reimbursed_claims['Amount'] - reimbursed_claims['Purchase Price']

# NOTE(review): cost is summed over reimbursed rows only, so vials tied to
# denied claims or never claimed are not part of "Cost of Goods Sold" here.
# Confirm that this is the intended definition.
# NOTE(review): this section runs on full_df before the duplicate check that
# appears later in the notebook; duplicate rows, if any, are counted twice.
total_revenue = reimbursed_claims['Amount'].sum()
total_cost = reimbursed_claims['Purchase Price'].sum()
total_profit = reimbursed_claims['Profit'].sum()

print("\n## Profitability Analysis")
print(f"Total Revenue from Reimbursed Claims: ${total_revenue:,.2f}")
print(f"Total Cost of Goods Sold: ${total_cost:,.2f}")
print(f"Total Profit: ${total_profit:,.2f}")

if not reimbursed_claims.empty:
    profit_report_columns = ['Vial Number', 'Purchase Price', 'Amount', 'Profit']
    print("\n### Top 5 Most Profitable Vials:")
    print(reimbursed_claims.nlargest(5, 'Profit')[profit_report_columns])
    print("\n### Top 5 Least Profitable Vials:")
    print(reimbursed_claims.nsmallest(5, 'Profit')[profit_report_columns])


## Data Cleaning



In [None]:
# Check for duplicates: count rows that are exact repeats of an earlier row.
duplicates = full_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

# Remove duplicates if any (the first occurrence of each row is kept).
if duplicates > 0:
    full_df = full_df.drop_duplicates()
    print("Duplicates removed.")
else:
    print("No duplicates found.")

# Check for missing values: report only the columns that contain any.
missing_values = full_df.isnull().sum()
print("Missing values in each column:")
print(missing_values[missing_values > 0])

# Handle missing values by forward filling, as an example strategy.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument
# is deprecated in pandas 2.x.
# NOTE(review): forward fill copies each gap from the row above. In this
# merged table the row above may describe a different vial or claim, and gaps
# in the first rows stay unfilled. Confirm this is acceptable before relying
# on the filled values.
full_df = full_df.ffill()
print("Missing values handled.")


## Conclusion

