# IPC CODES CLEANING 

### STEP 1: Installing the required Python libraries 

In [43]:
!pip install pandas



### STEP 2: Importing all the necessary libraries and modules

In [67]:
import pandas as pd
import numpy as np
import re

### STEP 3: Loading and previewing the working file to be cleaned

In [68]:
# Location of the raw export. Keeping it in one variable guarantees that the
# "not found" message below always names the file that was actually requested.
DATA_PATH = r"C:\Users\ASUS\Desktop\Research Work\main working file\Sonal_GIN.csv"

try:
    # Load the CSV file
    df = pd.read_csv(DATA_PATH)

    # Display the first 5 rows to get an overview
    print("----------- Initial Data Preview -----------")
    print(df.head())

    # Get a summary of the DataFrame (df.info() prints directly)
    print("\n----------- DataFrame Info -----------")
    df.info()

except FileNotFoundError:
    # Report the exact path that was tried; the file is read from an absolute
    # location, not from the notebook's directory.
    print(f"Error: '{DATA_PATH}' not found. Check that the path is correct and the file exists.")
except Exception as e:
    # Any other failure (e.g. parsing problems) is reported instead of raised;
    # note that `df` stays undefined in that case.
    print(f"An error occurred: {e}")

----------- Initial Data Preview -----------
  Record Number Record Type Publication/Issue Date Filing/Application Date  \
0    IN548672A1      patent       27-08-2024 00:00              12-07-2016   
1    IN551134A1      patent       26-09-2024 00:00              10-02-2006   
2    IN467364A1      patent       08-11-2023 00:00              01-04-2016   
3    IN496363A1      patent       09-01-2024 00:00              05-03-2015   
4    IN248627A1      patent       29-07-2011 00:00              23-12-2003   

  Publication Country Priority Date (Record)  Application No.  \
0                  IN                    NaN  IN201621023874A   
1                  IN                    NaN     IN368DE2006A   
2                  IN                    NaN  IN201621011694A   
3                  IN                    NaN     IN627DE2015A   
4                  IN                    NaN    IN1600DE2003A   

  Application No. Original Priority Country Code  Priority Year  ...  \
0           IN201621023

### STEP 4: Main Cleaning logic of the IPC Codes

In [69]:
# IPC Cleaning function with all the logic.

def clean_ipc_code(code):
    """
    Clean and format a raw IPC code string.

    Rules applied, in order:
    1. Missing values (NaN/None/NA) and the text "None" (any case, surrounding
       whitespace ignored) return None; a blank/whitespace-only string returns "".
    2. Surrounding whitespace is stripped, the text is upper-cased, and every
       '/' and ' ' character is removed (not only leading ones).
    3. The text is split on existing ';' separators and leading commas are
       stripped from every resulting chunk.
    4. Inside each chunk a ';' is inserted before the 3rd, 5th, 7th, ... letter,
       which separates codes that were concatenated without a delimiter
       (this relies on each code containing exactly two letters).

    Parameters
    ----------
    code : Any
        Raw cell value; non-string values are converted with str().

    Returns
    -------
    str or None
        The cleaned, ';'-separated code string, "" for blank input, or None
        for missing input.
    """
    # Rule: handle truly missing values (NaN)
    if pd.isna(code):
        return None

    text = str(code).strip()

    # Rule: handle the literal text "None" and blank strings
    if text.upper() == 'NONE':
        return None
    if not text:
        return ""

    # Remove slashes and spaces *before* looking at commas, so that a comma
    # hidden behind a space or slash (e.g. ", ,A01B") is still seen as leading.
    normalized = text.upper().replace('/', '').replace(' ', '')

    def format_ipc_chunk(chunk):
        """Insert ';' before every odd-numbered letter after the first one."""
        pieces = []
        letters_seen = 0
        for char in chunk:
            if char.isalpha():
                letters_seen += 1
                # Letters 3, 5, 7, ... start a new code within the chunk.
                if letters_seen > 1 and letters_seen % 2 != 0:
                    pieces.append(';')
            pieces.append(char)
        return "".join(pieces)

    # Strip leading commas from every chunk (the start of the string and the
    # text after each ';'), however many there are, then re-join.
    return ";".join(
        format_ipc_chunk(chunk.lstrip(','))
        for chunk in normalized.split(';')
    )

### STEP 5: Select the Columns to be cleaned and insert the new cleaned columns

In [70]:
# Work on a copy so the originally loaded DataFrame stays untouched
df_cleaned = df.copy()

# One entry per IPC column: (source column, new cleaned column,
# message on success, message when the source column is absent).
ipc_cleaning_plan = [
    (
        'IPC Revised (ICR)',
        'Cleaned_ICR',
        "\nSuccessfully cleaned 'IPC Revised (ICR)' column.",
        "\nWarning: 'IPC Revised (ICR)' column not found.",
    ),
    (
        'IPC All Versions (IC)',
        'Cleaned_IC',
        "Successfully cleaned 'IPC All Versions (IC)' column.",
        "Warning: 'IPC All Versions (IC)' column not found.",
    ),
]

for source_col, cleaned_col, success_msg, missing_msg in ipc_cleaning_plan:
    if source_col not in df_cleaned.columns:
        print(missing_msg)
        continue
    # Run the cleaning function on every value and store the result alongside
    df_cleaned[cleaned_col] = df_cleaned[source_col].apply(clean_ipc_code)
    print(success_msg)


Successfully cleaned 'IPC Revised (ICR)' column.
Successfully cleaned 'IPC All Versions (IC)' column.


### STEP 6: Comparing the new columns with the old columns 

In [1]:
# Pair each original IPC column with its cleaned counterpart; a pair is shown
# only when the original column is present in the frame.
comparison_pairs = (
    ('IPC Revised (ICR)', 'Cleaned_ICR'),
    ('IPC All Versions (IC)', 'Cleaned_IC'),
)
columns_to_show = [
    column
    for original_col, cleaned_col in comparison_pairs
    if original_col in df_cleaned.columns
    for column in (original_col, cleaned_col)
]

if not columns_to_show:
    print("\nNo IPC columns were found to display.")
else:
    print("\n----------- Verification: Original vs. Cleaned -----------")
    print(df_cleaned[columns_to_show].head())

NameError: name 'df_cleaned' is not defined

### STEP 7: Save the cleaned file

In [72]:
# Write df_cleaned to a new CSV file. The file name is a relative path, so it
# is created in the current working directory; index=False leaves the row
# index out of the file.
try:
    df_cleaned.to_csv('Sonal_GIN_final_clean_code_file.csv', index=False)
    print("\nSuccessfully saved the cleaned data to 'Sonal_GIN_final_clean_code_file.csv'")
except Exception as e:
    # Any failure is reported rather than re-raised, so this cell never errors.
    print(f"\nAn error occurred while saving the file: {e}")


Successfully saved the cleaned data to 'Sonal_GIN_final_clean_code_file.csv'
