## Tech Support Case Cleaning and Anonymization Pipeline

## 📦 1. Imports & Setup


In [None]:
import os
import pandas as pd
import numpy as np
import re
import time
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import langid
from langdetect import detect, DetectorFactory
import spacy
from langdetect import detect
import langid
from spacy.language import Language
import en_core_web_sm
from spacy_langdetect import LanguageDetector
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
import openpyxl
from nltk.sentiment import SentimentIntensityAnalyzer
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from scripts.utils import (
    load_or_convert_to_csv, save_processed, log_time, remove_exact_duplicates,
    is_english_langdetect, is_english_langid, detect_language_spacy,
    anonymize_text, replace_name_patterns, read_processed
)

## 📂 2. Load Dataset

In [None]:
# Load the raw comments dataset through the project helper and bind it to
# comments_df, the frame every later cell reads and reassigns.
# NOTE(review): the helper lives in scripts.utils and is not visible here;
# presumably it returns a pandas DataFrame (later cells call .info()/.shape).
comments_df = load_or_convert_to_csv("comments")

In [None]:
# Print a summary of the freshly loaded frame (before any cleaning) so the
# column names and per-column null counts can be checked.
comments_df.info()

In [None]:
# Quick look at the raw frame: report its dimensions, then render the
# first five rows.
print(f"Initial shape: {comments_df.shape}")
preview_rows = comments_df.head(5)
display(preview_rows)

## 🧹 3. Initial Cleaning

In [None]:
# Drop rows whose message_body is missing or contains only whitespace,
# then report how long the step took and how many rows were removed.
start = time.time()
initial_count = len(comments_df)

message_bodies = comments_df['message_body']
has_text = message_bodies.notna() & (message_bodies.str.strip() != "")
comments_df = comments_df[has_text]

log_time(start, "Removed empty or null comments")
print(f"✅ Removed {initial_count - len(comments_df)} comments")

In [None]:
# De-duplicate on the (case_number, message_body) pair via the project helper.
# NOTE(review): the helper is defined in scripts.utils; presumably it returns
# the de-duplicated frame plus the removed rows, and uses save_prefix when
# persisting them — confirm against its implementation.
dedup_keys = ["case_number", "message_body"]
comments_df, removed_comments = remove_exact_duplicates(
    comments_df, subset=dedup_keys, save_prefix="comments"
)

## 🔒 4. Anonymization

In [None]:
# Run the two anonymization passes over message_body, in order, timing each.
# Both helpers come from scripts.utils; their internals are not visible here.
anonymization_passes = (
    # Step 1: entity-based anonymization
    (anonymize_text, "✅ Entity-based anonymization applied"),
    # Step 2: rule-based name pattern anonymization
    (replace_name_patterns, "✅ Name patterns replaced"),
)

for anonymizer, done_message in anonymization_passes:
    start = time.time()
    comments_df["message_body"] = comments_df["message_body"].apply(anonymizer)
    log_time(start, done_message)

## 🌍 5. Language Detection Comparison

In [None]:
# 📌 Run each of the three language detectors over message_body and store the
# result in its own column so the detectors can be compared side by side.
language_detectors = {
    "is_english_langdetect": is_english_langdetect,
    "is_english_langid": is_english_langid,
    "lang_spacy": detect_language_spacy,
}

for result_column, detector in language_detectors.items():
    comments_df[result_column] = comments_df["message_body"].apply(detector)


In [None]:
# ✅ Inspect disagreement between the detectors.
# Widen the column display so message bodies are readable in the output.
pd.set_option("display.max_colwidth", 150)

# Rows where spaCy reports English while langid reports NOT English.
# The explicit `== False` comparison is kept on purpose: it selects only
# values equal to False rather than negating the whole column.
spacy_says_en = comments_df["lang_spacy"] == "en"
langid_says_not_en = comments_df["is_english_langid"] == False
inspect_columns = ["message_body", "lang_spacy", "is_english_langid"]

display(comments_df.loc[spacy_says_en & langid_says_not_en, inspect_columns].head(30))

In [None]:
# Build each comparison mask once; the original recomputed them for the
# preview and again for each statistic.
# The explicit `== True` / `== False` comparisons are kept on purpose: they
# select only values equal to that boolean instead of negating the column.
spacy_en = comments_df["lang_spacy"] == "en"
spacy_not_en = comments_df["lang_spacy"] != "en"
langid_en = comments_df["is_english_langid"] == True
langid_not_en = comments_df["is_english_langid"] == False

# Messages where langid says English but spaCy does NOT
display(
    comments_df[spacy_not_en & langid_en][
        ["message_body", "lang_spacy", "is_english_langid"]
    ].head(30)
)

# 🧮 Stats: agreement/disagreement, as fractions of all remaining rows.
total_rows = len(comments_df)
disagree_count = int((spacy_en & langid_not_en).sum())
agree_count = int((spacy_en & langid_en).sum())

# Report NaN rather than raising ZeroDivisionError if earlier cleaning steps
# left the frame empty.
disagree_rate = disagree_count / total_rows if total_rows else float("nan")
agree_rate = agree_count / total_rows if total_rows else float("nan")

print("🧮 Disagreement rate (spaCy says EN, langid says not):", disagree_rate)

print("🧮 Agreement rate (both say EN):", agree_rate)

In [None]:
# ✅ Final filtering: a row is kept when at least one of spaCy or langid
# classifies it as English (the langdetect column is not consulted here).
spacy_says_en = comments_df["lang_spacy"] == "en"
langid_says_en = comments_df["is_english_langid"] == True
keep_row = spacy_says_en | langid_says_en
comments_df = comments_df[keep_row]

In [None]:
# Drop the language-detection helper columns now that filtering is done.
# The result is rebound instead of using inplace=True: at this point
# comments_df is the output of a boolean-mask selection, and mutating such a
# frame in place can emit SettingWithCopyWarning on pandas versions without
# copy-on-write. Rebinding yields the same columns without the warning.
comments_df = comments_df.drop(
    columns=["is_english_langdetect", "is_english_langid", "lang_spacy"]
)

In [None]:
# Print a summary of the cleaned frame (after filtering and dropping the
# helper columns) to confirm the remaining columns and row count before saving.
comments_df.info()

In [None]:
# Persist the cleaned, anonymized, English-only comments via the project helper.
# NOTE(review): the output directory and write options are decided inside
# save_processed (scripts.utils), which is not visible here.
save_processed(comments_df, "comments.csv")