In [1]:
import ast
import re
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

# Load data: read the raw ticket CSV into the working frame
raw_csv_path = '../data/raw/tickets.csv'
df = pd.read_csv(raw_csv_path)

# Basic inspection: dimensions and column names, then a preview of the
# first rows as the cell's displayed value
frame_shape = df.shape
column_names = df.columns.tolist()
print(f"Dataset shape: {frame_shape}")
print(f"Columns: {column_names}")
df.head()

Dataset shape: (1000, 2)
Columns: ['text', 'categories']


Unnamed: 0,text,categories
0,"My payment was processed twice, and I only ord...",['Billing Issue']
1,I forgot my password and can't get back into m...,['Account Access']
2,The product I received doesn't match the descr...,['Product Inquiry']
3,I tried to update my shipping address but it k...,['Shipping Concern']
4,I requested a refund two weeks ago and still h...,['Refund Request']


In [2]:
# Convert string representations of lists to actual lists.
# Only string values are parsed: ast.literal_eval raises ValueError when it
# is handed an already-parsed list, so passing non-strings through unchanged
# keeps this cell safe to re-run on the same kernel.
df['categories'] = df['categories'].apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
# "\n" (not "n\") so the heading is preceded by a blank line, matching the
# other headings printed in this notebook.
print("\nCategories after conversion:")
print(df['categories'].head())

n\Categories after conversion:
0       [Billing Issue]
1      [Account Access]
2     [Product Inquiry]
3    [Shipping Concern]
4      [Refund Request]
Name: categories, dtype: object


In [3]:
# Sanity-check the parsed labels using the first row: it should be a real
# Python list whose items are the category names.
first_categories = df['categories'].iloc[0]
print("Type after conversion:", type(first_categories))
print("First category:", first_categories)
print("First item in first category:", first_categories[0])

Type after conversion: <class 'list'>
First category: ['Billing Issue']
First item in first category: Billing Issue


In [4]:
def clean_text(text):
    """Return `text` lowercased, with each run of whitespace collapsed to a
    single space and leading/trailing spaces removed."""
    lowered = text.lower()
    # any whitespace run (spaces, tabs, newlines, ...) becomes one space
    collapsed = re.sub(r'\s+',' ', lowered)
    return collapsed.strip()

# Normalize every ticket's text into a new column; the raw 'text' column is left as-is
df['text_clean'] = df['text'].map(clean_text)

In [5]:
# Check category distribution
# Flatten the per-ticket label lists into one long list, so a ticket with
# several labels contributes once to each of them.
all_categories = []
for ticket_labels in df['categories']:
    all_categories.extend(ticket_labels)

category_counts = pd.Series(all_categories).value_counts()
print("Category distribution:")
print(category_counts)

Category distribution:
Billing Issue        288
Technical Problem    280
Shipping Concern     256
Account Access       228
Refund Request       180
Product Inquiry      140
Service Complaint     50
Name: count, dtype: int64


In [8]:
# Check multi-label statistics
# Store how many labels each ticket carries, then tabulate how often each
# label count occurs (ordered by the count itself, not by frequency).
df['num_categories'] = df['categories'].map(len)
labels_per_ticket = df['num_categories'].value_counts().sort_index()
print("\nNumber of categories per ticket:")
print(labels_per_ticket)


Number of categories per ticket:
num_categories
1    582
2    414
3      4
Name: count, dtype: int64


In [10]:
# Convert to binary format for multi-label classification
# Fit the binarizer on the label lists and encode them in one step; the
# fitted `mlb` keeps the class order in `mlb.classes_`.
mlb = MultiLabelBinarizer()
y_binary = mlb.fit_transform(df['categories'])

print(
    f"\nLabel classes: {mlb.classes_}",
    f"Binary label shape: {y_binary.shape}",
    sep="\n",
)


Label classes: ['Account Access' 'Billing Issue' 'Product Inquiry' 'Refund Request'
 'Service Complaint' 'Shipping Concern' 'Technical Problem']
Binary label shape: (1000, 7)


In [11]:
# Split the data
# Features are the cleaned ticket texts; targets are the binarized labels.
X = df['text_clean'].values
y = y_binary

# First hold out 40% of the rows, then halve that hold-out into validation
# and test sets. Both calls use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size = 0.4, random_state = 42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size = 0.5, random_state = 42
)

split_summary = ", ".join(
    f"{split_name}: {len(split_texts)}"
    for split_name, split_texts in (("Train", X_train), ("Val", X_val), ("Test", X_test))
)
print(split_summary)


Train: 600, Val: 200, Test: 200


In [12]:
# Save the clean data splits
# Create the output directory first so the cell also works on a fresh
# checkout where ../data/processed does not exist yet.
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

# The text splits are arrays of Python strings. Saved as object arrays they
# would be pickled inside the .npy file, and np.load rejects pickled data
# unless allow_pickle=True is passed. Casting to a fixed-width unicode dtype
# stores them as plain array data instead.
text_splits = {'X_train': X_train, 'X_val': X_val, 'X_test': X_test}
for split_name, split_texts in text_splits.items():
    np.save(processed_dir / f'{split_name}.npy', np.asarray(split_texts).astype(str))

# The label matrices are saved unchanged.
label_splits = {'y_train': y_train, 'y_val': y_val, 'y_test': y_test}
for split_name, split_labels in label_splits.items():
    np.save(processed_dir / f'{split_name}.npy', split_labels)

In [13]:
# Save the label encoder
# Persist the fitted MultiLabelBinarizer so later stages can reuse the same
# class order when decoding predictions. joblib is imported in the
# notebook's import cell rather than here, so all dependencies are visible
# in one place.
# NOTE: the .pkl file is a pickle; only load it from a trusted location.
joblib.dump(mlb, '../data/processed/label_encoder.pkl')

['../data/processed/label_encoder.pkl']