In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import string

# --- File Identification ---
# Use the correct file name of the uploaded CSV file
file_name = r"D:\AI-Powered Ticket Creation & Categorization\Team 1 Dataset\Customer_Complaints_Updated.csv"

# --- 1. Load Data ---
# Load the dataset
df = pd.read_csv(file_name)

1. Lowercasing
2. Remove Unnecessary Characters
3. Remove Punctuation
4. Remove Stopwords
5. Tokenization
6. Lemmatization / Stemming
7. Remove Duplicates
8. Handling Spelling Mistakes
9. Remove Very Short or Irrelevant Text
10. Encode the Text for ML

In [2]:

# --- 2. Text Preprocessing Function (Minimal Cleaning - NLTK-Free) ---
def minimal_clean_text(text):
    """Performs essential cleaning (lowercase, remove punctuation/numbers)"""
    if pd.isna(text):
        return ""
    
    # Lowercase
    text = text.lower()
    
    # Remove punctuation and numbers
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'\d+', '', text)
    
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# Apply cleaning
df['clean_complaint'] = df['Complaint'].apply(minimal_clean_text)



In [3]:

# --- 3. Encoding Categorical Variables ---

# Category Encoding (Target for Classification)
le_cat = LabelEncoder()
df['Category_encoded'] = le_cat.fit_transform(df['Category'])
category_mapping = dict(zip(le_cat.classes_, le_cat.transform(le_cat.classes_)))

# Urgency Encoding (Target for Priority Classification)
le_urg = LabelEncoder()
df['Urgency_encoded'] = le_urg.fit_transform(df['Urgency'])
urgency_mapping = dict(zip(le_urg.classes_, le_urg.transform(le_urg.classes_)))



In [4]:

# --- 4. Final Inspection and Saving ---
output_file = "Customer_Complaints_Updated_Cleaned.csv"
df.to_csv(output_file, index=False)

print("Milestone 1: Data Preparation and Annotation Complete.")
print(f"Processed data saved to {output_file}")
print("\nFirst 5 rows of the final processed data:")
print(df[['Complaint', 'clean_complaint', 'Category', 'Category_encoded', 'Urgency', 'Urgency_encoded']].head())


Milestone 1: Data Preparation and Annotation Complete.
Processed data saved to Customer_Complaints_Updated_Cleaned.csv

First 5 rows of the final processed data:
                                           Complaint  \
0   I can’t add a new address — button does nothing.   
1  Flash sale prices aren’t reflected during chec...   
2  System declined my payment without any reason....   
3  The shipping company marked it as delivered to...   
4   Site logs me out randomly. which is unacceptable   

                                     clean_complaint            Category  \
0    i can’t add a new address — button does nothing  Account & Shipping   
1  flash sale prices aren’t reflected during chec...          Promotions   
2  system declined my payment without any reason ...            Payments   
3  the shipping company marked it as delivered to...      Order Tracking   
4    site logs me out randomly which is unacceptable    Technical Issues   

   Category_encoded    Urgency  Urgency_enco

In [5]:
print("\nCategory Encoding Map:")
print(category_mapping)
print("\nUrgency Encoding Map:")
print(urgency_mapping)


Category Encoding Map:
{'Account & Shipping': 0, 'Order Issues': 1, 'Order Tracking': 2, 'Payments': 3, 'Product Issues': 4, 'Promotions': 5, 'Refunds': 6, 'Technical Issues': 7}

Urgency Encoding Map:
{'High': 0, 'Immediate': 1, 'Low': 2, 'Medium': 3}
