# Data Wrangling Project

In [2]:
import pandas as pd

In [3]:
# Load datasets
try:
    invoices_df = pd.read_csv("synthetic_invoices.csv")
    vials_df = pd.read_csv("synthetic_vials.csv")
    dispense_log_df = pd.read_csv("synthetic_dispense_log.csv")
    claims_df = pd.read_csv("synthetic_claims.csv")
except FileNotFoundError as e:
    print(f"Error loading data: {e}. Please ensure all CSV files are in the correct directory.")
    raise

## Data Collection

In [5]:
# Clean 'Purchase Price' in vials_df
vials_df['Purchase Price'] = vials_df['Purchase Price'].replace({'\$': ''}, regex=True).astype(float)

  vials_df['Purchase Price'] = vials_df['Purchase Price'].replace({'\$': ''}, regex=True).astype(float)


In [6]:
# Merge datasets
vial_invoice_df = pd.merge(vials_df, invoices_df, on="Invoice Number", how="left")
full_df = pd.merge(vial_invoice_df, dispense_log_df, on="Vial Number", how="left")
full_df = pd.merge(full_df, claims_df, on="Dispense ID", how="left")

## Data Cleaning

In [8]:
# Check for duplicates
duplicates = full_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")

Number of duplicate rows: 0


In [9]:
# Remove duplicates if any
if duplicates > 0:
    full_df = full_df.drop_duplicates()
    print("Duplicates removed.")
else:
    print("No duplicates found.")

No duplicates found.


In [10]:
# Check for missing values
missing_values = full_df.isnull().sum()
print("Missing values in each column:")
print(missing_values[missing_values > 0])

Missing values in each column:
Dispense ID         4754
Patient ID          4754
Date Of Dispense    4754
Username            4754
Unscanned           4754
Claim ID            4754
Claim Date          4754
Amount              4754
Denied              4754
dtype: int64


In [11]:
# Handle missing values
full_df.fillna(method='bfill', inplace=True)
print("Missing values handled.")

Missing values handled.


  full_df.fillna(method='bfill', inplace=True)


## Data Organization

In [13]:
full_df.head()

Unnamed: 0,Vial Number,Lot Number,Expiration Date,Purchase Price,Invoice Number,Invoice Date,Total Amount,Provider,Location,Dispense ID,Patient ID,Date Of Dispense,Username,Unscanned,Claim ID,Claim Date,Amount,Denied
0,VN-0000001,LOT-5169460,2026-04-30,50.0,INV-000402,2025-07-28,902.06,Eye Care BigTown,BigTown,DISP-000293,676660.0,2025-04-15,asha.lou,False,CLM-000293,2025-07-28,153.83,False
1,VN-0000002,LOT-5228790,2026-04-30,50.0,INV-000637,2025-06-15,2237.03,EyeCare Smallville,Smallville,DISP-000293,676660.0,2025-04-15,asha.lou,False,CLM-000293,2025-07-28,153.83,False
2,VN-0000003,LOT-5134343,2026-04-30,50.0,INV-000757,2025-07-06,2480.64,EyeCare Smallville,BigTown,DISP-004397,962805.0,2025-04-10,jen.kirby,False,CLM-004397,2025-06-19,116.12,False
3,VN-0000004,LOT-4417439,2026-04-30,50.0,INV-000284,2025-04-26,2514.06,EyeCare Smallville,BigTown,DISP-004397,962805.0,2025-04-10,jen.kirby,False,CLM-004397,2025-06-19,116.12,False
4,VN-0000005,LOT-8583121,2026-04-30,50.0,INV-000303,2025-02-12,4599.87,EyeCare Smallville,Smallville,DISP-006754,464413.0,2025-04-12,kasey.convertem,False,CLM-006754,2025-06-11,124.73,False


In [14]:
# Analyze and report
print("--- Ophthalmology Drug Tracking Analysis ---")

# 1. Identify Unscanned Vials
unscanned_vials = full_df[full_df['Unscanned'] == True]
print(f"\n## Unscanned Vials Report ({len(unscanned_vials)} found)")
if not unscanned_vials.empty:
    print("The following vials were dispensed but not scanned, leading to potential revenue loss:")
    print(unscanned_vials[['Vial Number', 'Lot Number', 'Purchase Price', 'Dispense ID', 'Patient ID', 'Date Of Dispense', 'Username']])
else:
    print("No unscanned vials found.")

--- Ophthalmology Drug Tracking Analysis ---

## Unscanned Vials Report (900 found)
The following vials were dispensed but not scanned, leading to potential revenue loss:
      Vial Number   Lot Number  Purchase Price  Dispense ID  Patient ID  \
6      VN-0000005  LOT-8583121            50.0  DISP-010328    567386.0   
34     VN-0000025  LOT-5251569            50.0  DISP-003474    930415.0   
49     VN-0000037  LOT-8468932            50.0  DISP-002492    435402.0   
50     VN-0000038  LOT-1985938            50.0  DISP-002492    435402.0   
71     VN-0000050  LOT-8239582            50.0  DISP-007851    722927.0   
...           ...          ...             ...          ...         ...   
15642  VN-0011909  LOT-9133802            50.0  DISP-005509    882115.0   
15643  VN-0011910  LOT-7320978            50.0  DISP-005509    882115.0   
15702  VN-0011958  LOT-7102116            50.0  DISP-006094    820014.0   
15703  VN-0011959  LOT-3824453            50.0  DISP-006094    820014.0   
1573

In [15]:
# 2. Identify Denied Claims
denied_claims = full_df[full_df['Denied'] == True]
print(f"\n## Denied Claims Report ({len(denied_claims)} found)")
if not denied_claims.empty:
    print("The following claims were denied by insurance:")
    print(denied_claims[['Claim ID', 'Dispense ID', 'Vial Number', 'Patient ID', 'Claim Date', 'Amount']])
else:
    print("No denied claims found.")


## Denied Claims Report (590 found)
The following claims were denied by insurance:
         Claim ID  Dispense ID Vial Number  Patient ID  Claim Date  Amount
6      CLM-010328  DISP-010328  VN-0000005    567386.0  2025-09-09   92.40
49     CLM-002492  DISP-002492  VN-0000037    435402.0  2025-03-26  102.98
50     CLM-002492  DISP-002492  VN-0000038    435402.0  2025-03-26  102.98
71     CLM-007851  DISP-007851  VN-0000050    722927.0  2025-02-22  119.08
76     CLM-000043  DISP-000043  VN-0000055    702136.0  2025-05-07   65.80
...           ...          ...         ...         ...         ...     ...
15583  CLM-005342  DISP-005342  VN-0011867    409574.0  2025-04-21  155.99
15642  CLM-005509  DISP-005509  VN-0011909    882115.0  2025-03-19  174.90
15643  CLM-005509  DISP-005509  VN-0011910    882115.0  2025-03-19  174.90
15702  CLM-006094  DISP-006094  VN-0011958    820014.0  2025-07-11  180.63
15703  CLM-006094  DISP-006094  VN-0011959    820014.0  2025-07-11  180.63

[590 rows x 6 c

In [16]:
# 3. Profitability Analysis
reimbursed_claims = full_df[(full_df['Denied'] == False) & (full_df['Amount'].notna())]
reimbursed_claims['Profit'] = reimbursed_claims['Amount'] - reimbursed_claims['Purchase Price']

total_revenue = reimbursed_claims['Amount'].sum()
total_cost = reimbursed_claims['Purchase Price'].sum()
total_profit = reimbursed_claims['Profit'].sum()

print("\n## Profitability Analysis")
print(f"Total Revenue from Reimbursed Claims: ${total_revenue:,.2f}")
print(f"Total Cost of Goods Sold: ${total_cost:,.2f}")
print(f"Total Profit: ${total_profit:,.2f}")

if not reimbursed_claims.empty:
    print("\n### Top 5 Most Profitable Vials:")
    print(reimbursed_claims.nlargest(5, 'Profit')[['Vial Number', 'Purchase Price', 'Amount', 'Profit']])
    print("\n### Top 5 Least Profitable Vials:")
    print(reimbursed_claims.nsmallest(5, 'Profit')[['Vial Number', 'Purchase Price', 'Amount', 'Profit']])


## Profitability Analysis
Total Revenue from Reimbursed Claims: $1,978,121.40
Total Cost of Goods Sold: $758,150.00
Total Profit: $1,219,971.40

### Top 5 Most Profitable Vials:
      Vial Number  Purchase Price  Amount  Profit
15     VN-0000010            50.0  199.98  149.98
2695   VN-0002064            50.0  199.96  149.96
13870  VN-0010556            50.0  199.94  149.94
635    VN-0000473            50.0  199.91  149.91
13145  VN-0010004            50.0  199.91  149.91

### Top 5 Least Profitable Vials:
      Vial Number  Purchase Price  Amount  Profit
14142  VN-0010762            50.0   60.01   10.01
6070   VN-0004648            50.0   60.03   10.03
6071   VN-0004649            50.0   60.03   10.03
7035   VN-0005378            50.0   60.03   10.03
14811  VN-0011287            50.0   60.03   10.03


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reimbursed_claims['Profit'] = reimbursed_claims['Amount'] - reimbursed_claims['Purchase Price']
