## Tech Support Case Cleaning and Anonymization Pipeline

## 📦 1. Imports & Setup


In [None]:
import os
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import langid
from langdetect import detect, DetectorFactory
import spacy
from langdetect import detect
import langid
from spacy.language import Language
import en_core_web_sm
from spacy_langdetect import LanguageDetector
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import openpyxl
from nltk.sentiment import SentimentIntensityAnalyzer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from scripts.utils import (
    load_or_convert_to_csv, save_processed, log_time, remove_exact_duplicates,
    is_english_langdetect, is_english_langid, detect_language_spacy,
    anonymize_text, replace_name_patterns, read_processed
)

## 📂 2. Load Dataset

In [None]:
# Load the raw cases dataset through the project helper and bind it to cases_df.
# NOTE(review): the helper name suggests it converts the source to CSV when needed — confirm in scripts.utils.
cases_df = load_or_convert_to_csv("cases")  # per the original note: auto-loads from data/raw/

In [None]:
# Sanity check: report the frame's dimensions and show its first five rows
print(f"Initial shape: {cases_df.shape}")
display(cases_df.head())

## 🧹 3. Initial Cleaning

In [None]:
# Drop rows whose case_description is missing or whitespace-only, then report how many were removed.
start = time.time()
initial_count = len(cases_df)
# astype(str) keeps the .str accessor usable even when the column was not parsed
# as text (e.g. an all-empty column read as float NaN); missing values are still
# excluded by the notna() mask.
# .copy() makes the result an independent frame, so later column assignments do
# not write into a slice of the raw data (avoids SettingWithCopyWarning).
cases_df = cases_df[
    cases_df["case_description"].notna()
    & (cases_df["case_description"].astype(str).str.strip() != "")
].copy()
log_time(start, "Removed empty or null descriptions")
print(f"✅ Removed {initial_count - len(cases_df)} cases")

In [None]:
# Shorten the text column's name: case_description -> description
cases_df = cases_df.rename(columns={"case_description": "description"})

In [None]:
# Remove duplicate cases via the project helper, keyed on the case_number and
# description columns (description is the renamed case_description column).
# The call unpacks two results: the frame to keep and, presumably, the removed rows.
# NOTE(review): whether `subset` is matched jointly (as in DataFrame.drop_duplicates)
# and what `save_prefix` writes are decided inside the helper — confirm in scripts.utils.
cases_df, removed_cases = remove_exact_duplicates(
    cases_df,
    subset=["case_number", "description"],
    save_prefix="cases"
)

### Drop duplicate case_numbers if found

## 🔒 4. Anonymization

### 🧪 Apply Custom Name Replacement and spaCy Anonymization

In [None]:
# Build the final anonymized text: run anonymize_text over every description first,
# then replace_name_patterns over the result, and report the total wall-clock time.
start = time.time()
for transform in (anonymize_text, replace_name_patterns):
    cases_df["description"] = cases_df["description"].apply(transform)
elapsed = time.time() - start
print(f"✅ Applied name pattern replacement and spacy anonymization (Time: {elapsed:.2f} seconds)")

## 🌍 5. Language Detection Comparison

In [None]:
# Language detection: label every description with three independent detectors.
# langdetect is non-deterministic on short or ambiguous text unless its seed is
# pinned, so fix the seed before the first detection to keep the comparison
# below reproducible across runs.
# NOTE(review): assumes is_english_langdetect wraps langdetect — confirm in scripts.utils.
DetectorFactory.seed = 0
cases_df["langdetect_is_en"] = cases_df["description"].apply(is_english_langdetect)
cases_df["langid_is_en"] = cases_df["description"].apply(is_english_langid)
cases_df["spacy_lang"] = cases_df["description"].apply(detect_language_spacy)

# Compare disagreements: rows where the langdetect and langid flags differ.
# Kept as the final expression so the notebook renders it as the cell output.
cases_df.loc[
    cases_df["langdetect_is_en"] != cases_df["langid_is_en"],
    ["description", "langdetect_is_en", "langid_is_en"],
]

In [None]:
# Keep only English cases: a row survives when spaCy labels it "en" OR langid flags it as English.
# The langdetect flag is not part of this filter.
cases_df = cases_df.loc[cases_df["spacy_lang"].eq("en") | cases_df["langid_is_en"].eq(True)]


In [None]:
# Print the DataFrame's column labels to check which columns are present at this point
print(cases_df.columns)

In [None]:
# Discard the temporary language-detection columns now that filtering is done
cases_df = cases_df.drop(
    columns=[
        "langdetect_is_en",
        "langid_is_en",
        "spacy_lang",
    ]
)

In [None]:
# Persist the resulting cases frame under the name "cases.csv" via the project helper.
# NOTE(review): the output directory and write options are decided inside save_processed — confirm in scripts.utils.
save_processed(cases_df, "cases.csv")