In [4]:
import pandas as pd
import os
# Source workbook, resolved relative to the notebook's working directory.
file_path = "cleaned_sales_dataset.xlsx"

# Read the workbook only when it is present; otherwise stop with an error
# that names the path that was looked up.
if os.path.exists(file_path):
    df = pd.read_excel(file_path)
else:
    raise FileNotFoundError(f"File not found at: {file_path}")
# Structural overview of the loaded sheet: size, headers, sample rows and
# inferred dtypes. Each print() emits a heading line followed by its value.
print("ðŸ”¹ DataFrame shape (rows, columns):", df.shape, sep="\n")

print("\nðŸ”¹ Column names:", df.columns.tolist(), sep="\n")
print("\nðŸ”¹ First 10 rows of DataFrame:", df.head(10), sep="\n")
# to_string() renders every column on one line instead of the wrapped repr.
print(
    "\nðŸ”¹ First 20 rows (full width):",
    df.head(20).to_string(index=False),
    sep="\n",
)
print("\nðŸ”¹ Data types:", df.dtypes, sep="\n")
# Keep the rows that have a FIRM value but no FIRM_PRODUCT value; these are
# the rows this notebook treats as invoice-level records.
invoice_header_mask = df["FIRM"].notna() & df["FIRM_PRODUCT"].isna()
invoice_rows = df.loc[invoice_header_mask]

print(
    "\nðŸ”¹ Invoice-level rows preview:",
    invoice_rows.head(10).to_string(index=False),
    sep="\n",
)


ðŸ”¹ DataFrame shape (rows, columns):
(1753, 15)

ðŸ”¹ Column names:
['Date', 'FIRM', 'FIRM_PRODUCT', 'Voucher Type', 'Voucher No.', 'GSTIN/UIN', 'Quantity', 'Alt. Units', 'Rate', 'Value', 'Gross Total', 'SALE @18% LOCAL', 'CGST@9%', 'SGST@9%', 'Round Off']

ðŸ”¹ First 10 rows of DataFrame:
        Date                         FIRM  \
0 2025-12-01     SHREE BALAJI ENTERPRISES   
1        NaT     SHREE BALAJI ENTERPRISES   
2 2025-12-01  WORLDWIDE SECURITY SOLUTION   
3        NaT  WORLDWIDE SECURITY SOLUTION   
4        NaT  WORLDWIDE SECURITY SOLUTION   
5 2025-12-01          AVIATION TECHNOLOGY   
6        NaT          AVIATION TECHNOLOGY   
7 2025-12-01          QUADRA SECURITY LLP   
8        NaT          QUADRA SECURITY LLP   
9        NaT          QUADRA SECURITY LLP   

                                        FIRM_PRODUCT Voucher Type  \
0                                                NaN        Sales   
1                          HIK VDP ANALOG DS-KIS204T          NaN   
2    

In [5]:
# Work on an independent (deep) copy so the column edits in later cells do
# not modify `invoice_rows`.
invoice_rows_clean = invoice_rows.copy(deep=True)


In [6]:
# Columns that are removed from the invoice-level table.
cols_to_drop = ["FIRM_PRODUCT", "Alt. Units", "Rate"]

# errors="ignore": a listed column that is absent is skipped instead of
# raising, so the cell does not crash when a column is missing.
invoice_rows_clean = invoice_rows_clean.drop(
    cols_to_drop, axis="columns", errors="ignore"
)


In [7]:
# Show the first ten rows of the cleaned table, then the columns that
# remain after the drop step.
print(
    "\nðŸ”¹ Cleaned Invoice-level DataFrame preview:",
    invoice_rows_clean.head(10).to_string(index=False),
    sep="\n",
)

print(
    "\nðŸ”¹ Columns after drop:",
    invoice_rows_clean.columns.tolist(),
    sep="\n",
)



ðŸ”¹ Cleaned Invoice-level DataFrame preview:
      Date                          FIRM Voucher Type    Voucher No.       GSTIN/UIN  Quantity     Value  Gross Total  SALE @18% LOCAL  CGST@9%  SGST@9%  Round Off
2025-12-01      SHREE BALAJI ENTERPRISES        Sales SAS/25-26/7782 06BXMPL2769M1ZR       1.0   3855.93       4550.0          3855.93   347.03   347.03       0.01
2025-12-01   WORLDWIDE SECURITY SOLUTION        Sales SAS/25-26/7783 06DQJPK7817P1Z2      12.0  28177.92      33250.0         28177.92  2536.01  2536.01       0.06
2025-12-01           AVIATION TECHNOLOGY        Sales SAS/25-26/7784 06ABCFA7434K1ZF       1.0    847.46       1000.0           847.46    76.27    76.27        NaN
2025-12-01           QUADRA SECURITY LLP        Sales SAS/25-26/7785 06AABFQ3487G1Z6     160.0 266232.10     314154.0        266232.10 23960.91 23960.91       0.08
2025-12-01            KWALITY TECHNOLOGY        Sales SAS/25-26/7786 06AJZPC7774M1Z9      13.0  11059.34      13050.0         11059.3

In [8]:
# Heading (its trailing "\n" leaves one blank line) followed by the dtype of
# every column in the cleaned table.
print(
    "ðŸ”¹ Data types of invoice_rows_clean:\n",
    invoice_rows_clean.dtypes,
    sep="\n",
)


ðŸ”¹ Data types of invoice_rows_clean:

Date               datetime64[ns]
FIRM                       object
Voucher Type               object
Voucher No.                object
GSTIN/UIN                  object
Quantity                  float64
Value                     float64
Gross Total               float64
SALE @18% LOCAL           float64
CGST@9%                   float64
SGST@9%                   float64
Round Off                 float64
dtype: object


In [10]:
# Reduce the timestamp column to plain calendar dates.
# The values go through pd.to_datetime() first so this cell can be re-run:
# after one execution the column holds `datetime.date` objects (object dtype),
# and calling `.dt` on it directly would raise AttributeError. On the first
# run the column is already datetime64, so the result is unchanged.
invoice_rows_clean["Date"] = pd.to_datetime(invoice_rows_clean["Date"]).dt.date


In [11]:
# Total tax per row = CGST + SGST. A missing component is treated as 0, so
# the sum itself is never NaN.
invoice_rows_clean["total_tax_amount"] = (
    invoice_rows_clean["CGST@9%"]
    .fillna(0)
    .add(invoice_rows_clean["SGST@9%"].fillna(0))
)


In [12]:
# NOTE(review): this cell only prints the heading; no DataFrame preview is
# emitted after it.
preview_heading = "\nðŸ”¹ Preview after changes:\n"
print(preview_heading)


ðŸ”¹ Preview after changes:



In [14]:
# Columns selected for the final invoice table, in output order.
required_columns = [
    "Date", "FIRM", "Voucher No.",
    "Quantity", "Value",
    "GSTIN/UIN",
    "total_tax_amount", "Gross Total",
]


In [15]:
# Build the export table from `required_columns`, keeping their listed order.
# A required column that is absent is still skipped so the cell keeps running,
# but it is now reported instead of silently vanishing from the saved file.
missing_required = [
    col for col in required_columns if col not in invoice_rows_clean.columns
]
if missing_required:
    print(
        "WARNING: required columns missing from invoice_rows_clean "
        f"(they will not be exported): {missing_required}"
    )

# .copy() detaches the result from `invoice_rows_clean`.
final_invoice_df = invoice_rows_clean[
    [col for col in required_columns if col in invoice_rows_clean.columns]
].copy()


In [16]:
# Preview the first ten rows of the export table, then list the columns that
# will be written. The "\n" at the end of each heading leaves a blank line.
print(
    "\nðŸ”¹ Final invoice dataset preview:\n",
    final_invoice_df.head(10).to_string(index=False),
    sep="\n",
)

print(
    "\nðŸ”¹ Columns being saved:\n",
    final_invoice_df.columns.tolist(),
    sep="\n",
)



ðŸ”¹ Final invoice dataset preview:

      Date                          FIRM    Voucher No.  Quantity     Value       GSTIN/UIN  total_tax_amount  Gross Total
2025-12-01      SHREE BALAJI ENTERPRISES SAS/25-26/7782       1.0   3855.93 06BXMPL2769M1ZR            694.06       4550.0
2025-12-01   WORLDWIDE SECURITY SOLUTION SAS/25-26/7783      12.0  28177.92 06DQJPK7817P1Z2           5072.02      33250.0
2025-12-01           AVIATION TECHNOLOGY SAS/25-26/7784       1.0    847.46 06ABCFA7434K1ZF            152.54       1000.0
2025-12-01           QUADRA SECURITY LLP SAS/25-26/7785     160.0 266232.10 06AABFQ3487G1Z6          47921.82     314154.0
2025-12-01            KWALITY TECHNOLOGY SAS/25-26/7786      13.0  11059.34 06AJZPC7774M1Z9           1990.68      13050.0
2025-12-01     RP MULTI SOLUTION PVT LTD SAS/25-26/7787       1.0    889.83 06AALCR2823A1ZR            160.16       1050.0
2025-12-01           MANISH CCTV GURGAON SAS/25-26/7788      10.0   1237.29             NaN          

In [17]:
# Destination workbook, written relative to the working directory.
output_path = "Table_second.xlsx"

# index=False: the DataFrame index is not written as a column.
final_invoice_df.to_excel(excel_writer=output_path, index=False)

print(f"\nâœ… Final invoice Excel file saved at:\n{output_path}")



âœ… Final invoice Excel file saved at:
Table_second.xlsx


In [21]:
# Count of missing values per column in the exported table (displayed as the
# cell's result).
final_invoice_df.isna().sum()

Date                 0
FIRM                 0
Voucher No.          0
Quantity             2
Value                2
GSTIN/UIN           55
total_tax_amount     0
Gross Total          2
dtype: int64