In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re

In [51]:
print("Loading data...")

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

train_df['source'] = 'train'
test_df['source'] = 'test'

combined_df = pd.concat([train_df, test_df], ignore_index=True)

Loading data...


In [52]:
def clean_text(text):
    if pd.isna(text):
        return ''
    
    text = str(text).lower()
    
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    
    text = ' '.join(text.split())
    
    return text.strip()

print("Cleaning data...")

cleaned_df = combined_df.copy()
cleaned_df.drop_duplicates(inplace=True)

cleaned_df['category'] = cleaned_df['category'].apply(clean_text)
cleaned_df['sub_category'] = cleaned_df['sub_category'].apply(clean_text)
cleaned_df['crimeaditionalinfo'] = cleaned_df['crimeaditionalinfo'].apply(clean_text)

cleaned_df = cleaned_df.fillna('')

Cleaning data...


In [53]:
print("Removing rare classes...")

min_instances=2

category_counts = cleaned_df['category'].value_counts()
valid_categories = category_counts[category_counts >= min_instances].index
filtered_df = cleaned_df[cleaned_df['category'].isin(valid_categories)].copy()

print("Number of removed categories:", len(category_counts) - len(valid_categories))

Removing rare classes...
Number of removed categories: 1


In [54]:
print("Splitting data...")

test_size=0.2 

processed_train_df, processed_test_df = train_test_split(
    filtered_df,
    test_size=test_size,
    random_state=42,
    stratify=filtered_df['category']
)

Splitting data...


In [55]:
print("Saving processed data...")
processed_train_df.to_csv(f"../data/processed_train.csv", index=False)
processed_test_df.to_csv(f"../data/processed_test.csv", index=False)

Saving processed data...


In [56]:
print("\nData Processing Summary:")
print(f"Original number of samples: {len(combined_df)}")
print(f"Number of samples after cleaning: {len(cleaned_df)}")
print(f"Number of samples after filtering rare classes: {len(filtered_df)}")
print(f"Number of unique categories: {len(filtered_df['category'].unique())}")
print(f"Training set size: {len(processed_train_df)}")
print(f"Test set size: {len(processed_test_df)}")


Data Processing Summary:
Original number of samples: 124915
Number of samples after cleaning: 114958
Number of samples after filtering rare classes: 114957
Number of unique categories: 15
Training set size: 91965
Test set size: 22992
