In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from imblearn.over_sampling import SMOTE

In [5]:
print("Loading data...")

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

combined_df = pd.concat([train_df, test_df], ignore_index=True)

Loading data...


In [6]:
print("Removing rare classes...")

min_instances=2

category_counts = combined_df['category'].value_counts()
sub_category_counts = combined_df['sub_category'].value_counts()

valid_categories = category_counts[category_counts >= min_instances].index
filtered_df = combined_df[combined_df['category'].isin(valid_categories)].copy()

valid_sub_categories = sub_category_counts[sub_category_counts >= min_instances].index
filtered_df = filtered_df[filtered_df['sub_category'].isin(valid_sub_categories)].copy()

print("Number of removed categories:", len(category_counts) - len(valid_categories))
print("Number of removed sub categories:", len(sub_category_counts) - len(valid_sub_categories))

Removing rare classes...
Number of removed categories: 1
Number of removed sub categories: 3


In [7]:
def clean_text(text):
    if pd.isna(text):
        return ''
    
    text = str(text).lower()
    
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    
    text = ' '.join(text.split())
    
    return text.strip()

print("Cleaning data...")

cleaned_df = filtered_df.copy()
cleaned_df.drop_duplicates(inplace=True)

cleaned_df['category'] = cleaned_df['category'].apply(clean_text)
cleaned_df['sub_category'] = cleaned_df['sub_category'].apply(clean_text)
cleaned_df['crimeaditionalinfo'] = cleaned_df['crimeaditionalinfo'].apply(clean_text)

cleaned_df.dropna(inplace=True)

print("Instances removed due to duplicates:", len(combined_df) - len(cleaned_df))

Cleaning data...
Instances removed due to duplicates: 16072
