In [None]:
import pandas as pd

In [None]:
# Load crm.csv (path relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('crm.csv')

In [None]:
# Keep only the rows whose 'System' column equals 'PGW'.
# .copy() makes df an independent frame, so the column assignments in the
# following cells never operate on a slice of the loaded data.
df = df[df['System'] == 'PGW'].copy()

In [None]:
# Lowercase the text of every 'Subject' value
df['Subject'] = df['Subject'].str.lower()

# Header line followed by the first ten subjects
print("Subject column after converting to lowercase:", df['Subject'].head(10), sep="\n")


In [None]:
# Delete the literal text 'samsung' (plain substring match, not a regex) from 'Subject'
df['Subject'] = df['Subject'].str.replace(pat='samsung', repl='', regex=False)

# Header line followed by the first ten subjects
print("Subject column after removing 'samsung':", df['Subject'].head(10), sep="\n")


In [None]:
# Replace substrings 'sae', 'saegw', 'saes', 'sae-gw', 'saegws' with 'pgw'
substrings_to_replace = ['sae', 'saegw', 'saes', 'sae-gw', 'saegws']

# Replace the longest variants first. Every entry starts with 'sae', so replacing
# 'sae' first would turn 'saegw' into 'pgwgw', 'sae-gw' into 'pgw-gw', etc., and
# the longer entries would never match anything.
for substring in sorted(substrings_to_replace, key=len, reverse=True):
    df['Subject'] = df['Subject'].str.replace(substring, 'pgw', regex=False)

print("Subject column after replacing substrings:")
print(df['Subject'].head(10))


In [None]:
# Collapse every 'pgwgw' in the 'Subject' column to 'pgw'
df['Subject'] = df['Subject'].str.replace('pgwgw', 'pgw', regex=False)

# The message describes what this cell does (it previously repeated the 'samsung' text)
print("Subject column after collapsing 'pgwgw' to 'pgw':")
print(df['Subject'].head(10))

In [None]:
# Write the current frame to PGW_crm.csv without the row index.
# NOTE(review): the cells below keep changing df['Subject'] after this write and
# none of them writes the file again, so those later changes are not in PGW_crm.csv.
df.to_csv('PGW_crm.csv', index=False)

In [None]:
# Normalize any word containing 'pgw' to exactly 'pgw' in the 'Subject' column.
# Missing subjects are turned into '' before the string cast: astype(str) on its
# own would convert NaN into the literal text 'nan'.
df['Subject'] = (
    df['Subject']
    .fillna('')
    .astype(str)
    # Replace any word that contains 'pgw' with 'pgw'
    .str.replace(r"\b\w*pgw\w*\b", 'pgw', regex=True)
    # Collapse whitespace runs to one space and trim both ends
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

print("Subject column after normalizing 'pgw' words (sample 20):")
print(df['Subject'].head(20).to_string(index=False))

# Optional: show count of rows that now contain 'pgw' as a whole word
pgw_count = df['Subject'].str.contains(r"\bpgw\b").sum()
print(f"\nRows containing 'pgw': {pgw_count}")


In [None]:
# Show the first rows of df as the cell's output.
df.head()

In [None]:
import string

# Cast 'Subject' to str so the string methods below apply to every value
df['Subject'] = df['Subject'].astype(str)

# Translation table that deletes each character listed in string.punctuation
translator = str.maketrans(dict.fromkeys(string.punctuation))

# Delete punctuation, then squeeze whitespace runs to one space and trim both ends
df['Subject'] = (
    df['Subject']
    .str.translate(translator)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

# Header line followed by the first twenty subjects, printed without the index
print("Subject column after removing punctuation (sample 20):", df['Subject'].head(20).to_string(index=False), sep="\n")


In [None]:
# Expand contractions in the 'Subject' column using the `contractions` package.
# The package must already be installed; this cell only imports it.
# NOTE(review): the previous cell already stripped punctuation, apostrophes
# included, so apostrophe-based contractions may no longer be recognised here —
# consider expanding contractions before removing punctuation.
import contractions


# Cast 'Subject' to str before handing each value to contractions.fix
df['Subject'] = df['Subject'].astype(str)

# Expand each subject, then squeeze whitespace runs to one space and trim both ends
df['Subject'] = (
    df['Subject']
    .map(contractions.fix)
    .str.replace(r"\s+", ' ', regex=True)
    .str.strip()
)

# Header line followed by the first twenty subjects, printed without the index
print("Sample of 'Subject' after expanding contractions:", df['Subject'].head(20).to_string(index=False), sep="\n")


In [None]:
# Spell-check Subject column using pyspellchecker
# Requires pyspellchecker to be installed (this cell does not install it),
# then finds misspelled words per row.
import re
from spellchecker import SpellChecker

# Checker instance that find_misspellings (defined next) calls via spell.unknown()
spell = SpellChecker()

def find_misspellings(s):
    """Return the sorted list of words in ``s`` that the spell checker does not know.

    Non-string or blank input gives an empty list. Words are runs of ASCII
    letters and apostrophes; they are lowercased and single-character tokens
    are dropped before being passed to ``spell.unknown``.
    """
    if isinstance(s, str) and s.strip():
        tokens = [tok.lower() for tok in re.findall(r"[A-Za-z']+", s) if len(tok) > 1]
        if tokens:
            return sorted(spell.unknown(tokens))
    return []

# Tag every row with the list of tokens the spell checker does not know
df['subject_misspellings'] = df['Subject'].apply(find_misspellings)

# Count how often each unknown token occurs across all rows
from collections import Counter
all_miss = [w for lst in df['subject_misspellings'] for w in lst]
miss_counts = Counter(all_miss)

# Rows whose misspelling list is non-empty (computed once, reused below)
has_misspelling = df['subject_misspellings'].str.len() > 0

print('Total rows checked:', len(df))
print('Rows with >=1 misspelling:', has_misspelling.sum())
print('\nTop misspelled tokens (up to 50):')
for w, c in miss_counts.most_common(50):
    print(f"{w}: {c}")

# Print the first twenty affected rows with their index, text and unknown tokens
print('\nSample rows with misspellings (first 20):')
rows = df[has_misspelling]
for idx, row in rows.head(20).iterrows():
    print(
        '---',
        f'Index: {idx}',
        f"Subject: {row['Subject']}",
        f"Misspellings: {row['subject_misspellings']}",
        sep='\n',
    )

# The new column stays on df for further inspection
print('\nAdded column: subject_misspellings')


In [None]:
# Interactive spelling correction for `df['Subject']`
# Prompts you to accept a suggested correction, skip, or enter a custom replacement.
# Runs in a notebook cell (expects interactive input()).
import re
from spellchecker import SpellChecker
# New checker instance for this cell; rebinds the `spell` name created in the spell-check cell.
spell = SpellChecker()

# Tokeniser used to pull candidate words out of a subject:
# each run of ASCII letters and apostrophes is one token
word_re = re.compile(r"[A-Za-z']+")

def words_from_text(s):
    """Return every ``word_re`` match in ``s`` as a list, in order of appearance."""
    return word_re.findall(s)

# Ensure Subject is string
df['Subject'] = df['Subject'].astype(str)


def _replace_token(text, token, replacement):
    """Replace each whole-word, case-insensitive occurrence of ``token`` in ``text``.

    ``replacement`` is inserted literally: it is supplied through a callable, so
    backslashes or group references typed by the user are not expanded by
    ``re.sub``. A single pass is used so that a replacement which itself
    contains ``token`` is not substituted a second time.
    """
    pattern = r"\b" + re.escape(token) + r"\b"
    return re.sub(pattern, lambda _match: replacement, text, flags=re.IGNORECASE)


# Iterate rows and prompt for corrections
print('Interactive spelling correction starting. For each misspelled token you will be prompted.')
print("Instructions: press Enter to accept suggested correction, type 's' to skip, type 'r' to provide replacement.")

rows_with_issues = []  # index labels of the rows whose Subject was actually updated
for idx, row in df.iterrows():
    subj = row['Subject']
    words = words_from_text(subj)
    # Keep one spelling per distinct lowercase token (first one seen). The
    # replacement below is case-insensitive, so 'Foo' and 'foo' are handled by
    # a single prompt instead of two.
    candidates = {}
    for w in words:
        if len(w) > 1:
            candidates.setdefault(w.lower(), w)
    if not candidates:
        continue
    miss = spell.unknown(list(candidates))
    if not miss:
        continue

    # We have misspellings in this row
    changed = False
    subj_new = subj
    print('\n' + '='*60)
    print(f'Row index: {idx}')
    print('Original Subject:')
    print(subj)

    for w_l, w in sorted(candidates.items()):
        if w_l not in miss:
            continue
        # Fall back to the token itself when the checker has no correction
        suggestion = spell.correction(w_l) or w
        suggestions = spell.candidates(w_l)
        print('\nMisspelled token: "' + w + '"')
        print('Suggested correction:', suggestion)
        if suggestions:
            print('Other candidates:', ', '.join(sorted(suggestions)))
        print('Context:')
        # show the original subject with the token marked
        context = re.sub(r"\b" + re.escape(w) + r"\b", f">>{w}<<", subj)
        print(context)

        # Lowercase the answer so 'S' / 'R' are recognised as skip / replace
        resp = input("Action ([Enter]=accept, s=skip, r=replace): ").strip().lower()
        if resp == 's':
            print('Skipped')
            continue
        elif resp == 'r':
            repl = input('Enter replacement: ').strip()
            if not repl:
                print('Empty replacement, skipped')
                continue
            replacement = repl
        else:
            # default accept suggestion
            replacement = suggestion

        # Replace whole-word occurrences of the token, ignoring case, in one pass
        subj_new = _replace_token(subj_new, w_l, replacement)
        changed = True
        print(f'Applied replacement: {replacement}')

    if changed:
        print('Updated Subject:')
        print(subj_new)
        apply_resp = input('Apply these changes to this row? (y/N): ').strip().lower()
        if apply_resp == 'y':
            df.at[idx, 'Subject'] = subj_new
            rows_with_issues.append(idx)
            print('Change applied.')
        else:
            print('Change discarded.')

print('\nInteractive correction finished.')
print(f'Rows updated: {len(rows_with_issues)}')
# df lives only in kernel memory; saving the notebook file does not store it
print('Write df to disk (e.g. df.to_csv(...)) to persist these changes; saving the notebook does not store df.')


In [None]:
# Reload the frame from PGW_crm.csv and preview the first rows.
# NOTE(review): that file is written before the 'pgw' word normalisation, punctuation
# removal, contraction expansion and interactive spelling fixes, so rebinding df here
# discards those in-memory changes — confirm this is intended.
df = pd.read_csv('PGW_crm.csv')
df.head()

In [None]:
# Remove English stopwords from `Subject` using NLTK
# Requires NLTK to be installed (nothing is installed here); downloads the
# stopwords corpus and strips stopwords from each Subject.
import re
import nltk
from nltk.corpus import stopwords


# Ensure stopwords are downloaded (quiet=True suppresses the download log)
nltk.download('stopwords', quiet=True)
# English stopword list as a set, used for membership tests in remove_stopwords_subject
stop = set(stopwords.words('english'))

# Tokeniser for stopword removal: each run of word characters is one token.
# NOTE: this rebinds the global `word_re` that words_from_text (interactive
# correction cell) reads at call time.
word_re = re.compile(r"\w+")

def remove_stopwords_subject(s):
    """Drop stopwords from ``s`` and join the remaining tokens with single spaces.

    Non-string or blank input is returned unchanged. Tokens are the ``word_re``
    matches; a token is kept when its lowercase form is not in ``stop``.
    Anything between tokens (punctuation, extra whitespace) is not preserved.
    """
    if not (isinstance(s, str) and s.strip()):
        return s
    return ' '.join(tok for tok in word_re.findall(s) if tok.lower() not in stop)

# Ensure Subject is string. Empty subjects come back from read_csv as NaN, and
# astype(str) alone would turn them into the literal text 'nan', which would be
# counted as non-empty and kept as a real subject. Map them to '' first.
df['Subject'] = df['Subject'].fillna('').astype(str)

# Count the rows that actually carry text, then strip stopwords from every row
original_nonempty = (df['Subject'].str.strip() != '').sum()
df['Subject'] = df['Subject'].apply(remove_stopwords_subject)

print(f"Applied stopword removal to {original_nonempty} non-empty Subject rows.")
print('Sample after stopword removal:')
print(df['Subject'].head(20).to_string(index=False))
