In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix

In [2]:
print("Loading data...")

train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

combined_df = pd.concat([train_df, test_df], ignore_index=True)

Loading data...


In [3]:
print("Removing rare classes...")

min_instances=5

category_counts = combined_df['category'].value_counts()
sub_category_counts = combined_df['sub_category'].value_counts()

valid_categories = category_counts[category_counts >= min_instances].index
filtered_df = combined_df[combined_df['category'].isin(valid_categories)].copy()

valid_sub_categories = sub_category_counts[sub_category_counts >= min_instances].index
filtered_df = filtered_df[filtered_df['sub_category'].isin(valid_sub_categories)].copy()

print("Number of removed categories:", len(category_counts) - len(valid_categories))
print("Number of removed sub categories:", len(sub_category_counts) - len(valid_sub_categories))
print("Instances removed:", len(combined_df) - len(filtered_df))

Removing rare classes...
Number of removed categories: 2
Number of removed sub categories: 4
Instances removed: 8832


In [4]:
def clean_text(text):
    if pd.isna(text):
        return ''
    
    text = str(text).lower()
    
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d+', ' ', text)
    
    text = ' '.join(text.split())
    
    return text.strip()

print("Cleaning data...")

cleaned_df = filtered_df.copy()
cleaned_df.drop_duplicates(inplace=True)

cleaned_df['category'] = cleaned_df['category'].apply(clean_text)
cleaned_df['sub_category'] = cleaned_df['sub_category'].apply(clean_text)
cleaned_df['crimeaditionalinfo'] = cleaned_df['crimeaditionalinfo'].apply(clean_text)

cleaned_df.dropna(inplace=True)

print("Instances removed due to duplicates:", len(filtered_df) - len(cleaned_df))

Cleaning data...
Instances removed due to duplicates: 7242


In [5]:
print("Final Data Distribution")

print(cleaned_df.describe(), end="\n\n")
print("Total instances removed: ", len(combined_df) - len(cleaned_df))
print("Final number of instances: ", len(cleaned_df), f"({cleaned_df.shape[0]/combined_df.shape[0] * 100:.2f}%)")

Final Data Distribution
                      category        sub_category crimeaditionalinfo
count                   108841              108841             108841
unique                      10                  34             100076
top     online financial fraud  upi related frauds                   
freq                     69302               31055               1052

Total instances removed:  16074
Final number of instances:  108841 (87.13%)


In [6]:
cleaned_df['category_sub_category'] = cleaned_df['category'].astype(str) + '_' + cleaned_df['sub_category'].astype(str)

train, test_df = train_test_split(
    cleaned_df,
    test_size=0.2,
    random_state=42,
    stratify=cleaned_df['category_sub_category']
)

X = train['crimeaditionalinfo']
y = train['category_sub_category']

tfidf = TfidfVectorizer(max_features=1000)
X = tfidf.fit_transform(X)

In [7]:
smote = SMOTE(sampling_strategy='auto', random_state=42, k_neighbors=5)
X, y = smote.fit_resample(X, y)

In [8]:
X_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
resampled_labels = pd.DataFrame(y, columns=['category_sub_category'])
resampled_labels[['category', 'sub_category']] = resampled_labels['category_sub_category'].str.split('_', expand=True)
final_df = pd.concat([resampled_labels[['category', 'sub_category']], X_df], axis=1)
final_df['crimeaditionalinfo'] = X_df.apply(lambda row: ' '.join([word for word, val in zip(tfidf.get_feature_names_out(), row) if val > 0]), axis=1)

final_df = final_df[['category', 'sub_category', 'crimeaditionalinfo']]
print(final_df.head())

                                category                    sub_category  \
0  online and social media related crime       cheating by impersonation   
1                 online financial fraud              upi related frauds   
2                 online financial fraud              upi related frauds   
3  online and social media related crime  profile hacking identity theft   
4                 online financial fraud              upi related frauds   

                                  crimeaditionalinfo  
0  after and as attached call calls for he is mes...  
1  address after and be bola call came click clic...  
2  account an done from happened is my of rupees ...  
3  about all and back blackmailing can changed cr...  
4  account action also am amount and bank called ...  


In [9]:
final_df.to_csv('../data/resampled_data.csv', index=False)

In [10]:
final_df.isnull().sum()

category              0
sub_category          0
crimeaditionalinfo    0
dtype: int64

In [14]:
X_train = final_df['crimeaditionalinfo']
X_test = test_df['crimeaditionalinfo']

y_train = final_df[['category', 'sub_category']]
y_test = test_df[['category', 'sub_category']]

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Text feature extraction
    ('clf', MultiOutputClassifier(RandomForestClassifier(random_state=42)))  # Multi-output classifier for both targets
])

In [15]:
pipeline.fit(X_train, y_train)

In [16]:
y_pred = pipeline.predict(X_test)

In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Separate the predictions into individual arrays for each target variable
y_pred_category, y_pred_sub_category = y_pred[:, 0], y_pred[:, 1]
y_test_category, y_test_sub_category = y_test['category'], y_test['sub_category']

# Calculate metrics for each target separately
metrics = {
    "Category": {
        "Accuracy": accuracy_score(y_test_category, y_pred_category),
        "Precision": precision_score(y_test_category, y_pred_category, average="macro"),
        "Recall": recall_score(y_test_category, y_pred_category, average="macro"),
        "F1 Score": f1_score(y_test_category, y_pred_category, average="macro"),
    },
    "Sub-category": {
        "Accuracy": accuracy_score(y_test_sub_category, y_pred_sub_category),
        "Precision": precision_score(y_test_sub_category, y_pred_sub_category, average="macro"),
        "Recall": recall_score(y_test_sub_category, y_pred_sub_category, average="macro"),
        "F1 Score": f1_score(y_test_sub_category, y_pred_sub_category, average="macro"),
    }
}

print("Category Metrics:")
for metric, value in metrics["Category"].items():
    print(f"{metric}: {value}")

print("\nSub-category Metrics:")
for metric, value in metrics["Sub-category"].items():
    print(f"{metric}: {value}")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Category Metrics:
Accuracy: 0.759704166475263
Precision: 0.42691248791642933
Recall: 0.27867497085999443
F1 Score: 0.28214050716694783

Sub-category Metrics:
Accuracy: 0.5045707198309523
Precision: 0.23696466156115167
Recall: 0.16696301428184704
F1 Score: 0.17707901162234507


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
import numpy as np

y_test_array = y_test.to_numpy()
y_pred_array = np.array(y_pred)

joint_accuracy = np.mean(np.all(y_test_array == y_pred_array, axis=1))

print(f"Joint Accuracy: {joint_accuracy}")

Joint Accuracy: 0.44462308787725663
