In [7]:
import re

import numpy as np
import pandas as pd

In [21]:
# --- 1. Load your data files ---
# Input locations are named once here so the error message below always
# refers to the files that were actually requested.
# NOTE: MAIN_DATA_PATH is an absolute, machine-specific path; adjust it when
# running this notebook on another computer.
MAIN_DATA_PATH = r"C:\Users\ASUS\Desktop\Research Work\Sonal_GIN_final_clean_code_file.csv"
WIPO_CODES_PATH = 'cleaned_WIPO_IPC_green_codes.csv'

try:
    # Load your main data file
    main_df = pd.read_csv(MAIN_DATA_PATH)

    # Load the WIPO green codes file
    wipo_df = pd.read_csv(WIPO_CODES_PATH)
except FileNotFoundError as e:
    print(f"Error loading files: {e}")
    print(f"Please make sure '{MAIN_DATA_PATH}' exists and '{WIPO_CODES_PATH}' is in the notebook's working directory.")
    # Re-raise so the notebook stops here: continuing would either hit a
    # NameError in the next cell or silently reuse frames left over from an
    # earlier run of this kernel.
    raise
else:
    print("✅ Files loaded successfully!")
    print("Main data has {} rows.".format(len(main_df)))
    print("WIPO green codes file has {} codes.".format(len(wipo_df)))

✅ Files loaded successfully!
Main data has 37297 rows.
WIPO green codes file has 1214 codes.


In [22]:
# --- 2. Prepare the list of Green Codes for matching ---

# Use the 'clean_code' column from the WIPO file.
# Missing values are dropped, codes are coerced to strings and stripped of
# surrounding whitespace, and blank or duplicate entries are removed.
# A blank entry must never reach the pattern: an empty alternative matches
# every string, which would label every row 'Green'.
green_codes_list = (
    wipo_df['clean_code']
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda codes: codes != '']
    .drop_duplicates()
    .tolist()
)

if not green_codes_list:
    raise ValueError("No usable green codes found in the 'clean_code' column of the WIPO file.")

# Build one regular expression that matches any of the codes:
#   * re.escape() makes every code match literally, even if it contains a
#     regex metacharacter;
#   * the leading \b only lets a code match at the start of a token, so a
#     code cannot match in the middle of an unrelated longer code. Prefix
#     matches (a short green code covering longer codes that start with it)
#     still succeed, as before.
#   * the group is non-capturing, so Series.str.contains() does not warn
#     about match groups.
# Assumes every code starts with a letter or digit (true for IPC symbols);
# verify against the WIPO file if other code formats are ever added.
green_code_pattern = r'\b(?:' + '|'.join(re.escape(code) for code in green_codes_list) + ')'
print(f"\nCreated a search pattern with {len(green_codes_list)} green codes.")




Created a search pattern with 1214 green codes.


In [23]:
# --- 3. Classify the 'Cleaned_ICR' column ---

if 'Cleaned_ICR' in main_df.columns:
    print("Classifying 'Cleaned_ICR' column...")
    # Remember which rows are missing BEFORE any string conversion.
    icr_is_missing = main_df['Cleaned_ICR'].isna()

    # Match against a temporary string view. The column in main_df is left
    # untouched so missing values are not rewritten as the literal text 'nan'
    # (which would otherwise end up in the saved CSV).
    icr_text = main_df['Cleaned_ICR'].astype(str)

    # True where the cell contains at least one green code; rows that were
    # missing are forced to False regardless of their string representation.
    matches_icr = icr_text.str.contains(green_code_pattern, regex=True) & ~icr_is_missing

    # Create the new classification column based on the results
    main_df['ICR_Code_Type'] = np.where(matches_icr, 'Green', 'Non-Green')
    print("-> 'Cleaned_ICR' column classified.")
else:
    print("Warning: 'Cleaned_ICR' column not found in the main file.")


Classifying 'Cleaned_ICR' column...
-> 'Cleaned_ICR' column classified.


In [24]:
# --- 4. Classify the 'Cleaned_IC' column ---

if 'Cleaned_IC' in main_df.columns:
    print("Classifying 'Cleaned_IC' column...")
    # Remember which rows are missing BEFORE any string conversion.
    ic_is_missing = main_df['Cleaned_IC'].isna()

    # Match against a temporary string view. The column in main_df is left
    # untouched so missing values are not rewritten as the literal text 'nan'
    # (which would otherwise end up in the saved CSV).
    ic_text = main_df['Cleaned_IC'].astype(str)

    # True where the cell contains at least one green code; rows that were
    # missing are forced to False regardless of their string representation.
    matches_ic = ic_text.str.contains(green_code_pattern, regex=True) & ~ic_is_missing

    # Create the second classification column
    main_df['IC_Code_Type'] = np.where(matches_ic, 'Green', 'Non-Green')
    print("-> 'Cleaned_IC' column classified.")
else:
    print("Warning: 'Cleaned_IC' column not found in the main file.")

Classifying 'Cleaned_IC' column...
-> 'Cleaned_IC' column classified.


In [25]:
# --- 5. Save and display the final result ---

# Write the classified frame to disk; the row index is left out of the file.
output_filename = 'Sonal_GIN_classified_green_codes_final_final.csv'
main_df.to_csv(path_or_buf=output_filename, index=False)

print(f"\n✅ Processing complete! The final file has been saved as '{output_filename}'")

# Preview: show only the source and classification columns, skipping any
# that are absent from main_df (the classification cells create them only
# when their source column exists).
print("\nHere is a preview of your final data with the new classification columns:")
display_cols = ['Cleaned_ICR', 'ICR_Code_Type', 'Cleaned_IC', 'IC_Code_Type']
existing_display_cols = list(filter(lambda name: name in main_df.columns, display_cols))
print(main_df.loc[:, existing_display_cols].head(10))


✅ Processing complete! The final file has been saved as 'Sonal_GIN_classified_green_codes_final_final.csv'

Here is a preview of your final data with the new classification columns:
                                     Cleaned_ICR ICR_Code_Type  \
0                               A01G2500;A01G900         Green   
1                                        A01H100         Green   
2                                       C07K1400     Non-Green   
3                                        A01H100         Green   
4                                        B10N335     Non-Green   
5  B60R2138;B29C64106;B60W3000;E05B7712;B62D3704     Non-Green   
6                                            nan     Non-Green   
7                                        B60N200     Non-Green   
8                                       B60K1700     Non-Green   
9       B60N230;B60N202;B64D1106;B60N232;B60N242     Non-Green   

                                      Cleaned_IC IC_Code_Type  
0                         