### Text Cleaning Function

This notebook contains a helpful function to strip duplicated sentences and normalise unstructured text data as a preparation step for loading into a LLM.

I will be populating a dataframe of customer product ratings at random to use for testing the cleaning functions.

In [1]:
import pandas as pd
import re
import random
from datetime import datetime, timedelta
import math
from collections import Counter

In [8]:
# Helper function to generate a random 5-digit ID
def generate_id():
    return str(random.randint(10000, 99999))

# Sample sentences to use in transcripts
sentences = [
    "The product exceeded my expectations!",
    "Delivery was prompt and hassle-free, how do I order more?",
    "I found the quality to be outstanding.",
    "Customer service was very helpful.",
    "I would definitely recommend this to others.",
    "The packaging was neat and secure.",
    "It works exactly as described.",
    "Value for money is excellent.",
    "Setup was quick and easy.",
    "I'm very satisfied with my purchase.",
    "This was by far the worst customer experience I have had!",
    "I have tried to return this product on numerous occasions but was declined!",
    "Why would anyone suggest this product for use?"
]

# Function to create a transcript with repeated sentences
def create_transcript():
    selected = random.sample(sentences, k=random.randint(3, 5))
    repeated = [random.choice(selected) for _ in range(random.randint(1, 3))]
    all_sentences = selected + repeated
    random.shuffle(all_sentences)
    transcript = " ".join(all_sentences)
    return " ".join(transcript.split()[:100])  # Ensure max 100 words

# Generate the DataFrame
num_rows = 2000
start_date = datetime.today()

data = {
    "date": [(start_date - timedelta(days=i)).strftime("%Y-%m-%d") for i in range(num_rows)],
    "id": [generate_id() for _ in range(num_rows)],
    "transcript": [create_transcript() for _ in range(num_rows)]
}

df = pd.DataFrame(data)
print(df)

            date     id                                         transcript
0     2025-08-24  91846  Setup was quick and easy. Setup was quick and ...
1     2025-08-23  29186  This was by far the worst customer experience ...
2     2025-08-22  37302  The product exceeded my expectations! I found ...
3     2025-08-21  94733  I found the quality to be outstanding. I found...
4     2025-08-20  20467  I found the quality to be outstanding. I have ...
...          ...    ...                                                ...
1995  2020-03-08  35717  Value for money is excellent. I'm very satisfi...
1996  2020-03-07  87191  I found the quality to be outstanding. I found...
1997  2020-03-06  16364  It works exactly as described. It works exactl...
1998  2020-03-05  64005  It works exactly as described. I found the qua...
1999  2020-03-04  60862  Setup was quick and easy. It works exactly as ...

[2000 rows x 3 columns]


In [9]:
# Taking a meaningful sample:

# Parameters
Z = 1.96  # 95% confidence
p = 0.5   # conservative estimate
E = 0.05  # ±5% margin of error
N = len(df)  # total output length

# Standard Sample Size Equation - assumes a VERY LARGE or finite population:
n = (Z**2 * p * (1 - p)) / (E**2)

# Adjusted for finite population by reducing n and maintaining significance:
n_adj = math.ceil(n / (1 + ((n - 1) / N)))

print(f"Recommended sample size: {n_adj}")
print(f"Recommended sample size: {n}")

Recommended sample size: 323
Recommended sample size: 384.1599999999999


In [10]:
# Take a simple random sample:
random_sample = df.sample(n=n_adj, random_state=42)

# Save or inspect the sample:
print(random_sample.head())

# confirm sample size:
print(len(random_sample))

            date     id                                         transcript
1860  2020-07-21  50197  It works exactly as described. Setup was quick...
353   2024-09-05  71216  This was by far the worst customer experience ...
1333  2021-12-30  54326  Why would anyone suggest this product for use?...
905   2023-03-03  53918  Why would anyone suggest this product for use?...
1289  2022-02-12  77853  I found the quality to be outstanding. The pro...
323


In [11]:
# check for dupes:
random_sample['UID'] =  df['date'].astype(str) + '_' + df['id'].astype(str)

# print(random_sample.head())

# Count how many times each UID appears
uid_counts = random_sample['UID'].value_counts()

# Filter to show only duplicates (i.e., count > 1)
duplicates = uid_counts[uid_counts > 1]

# Display the result
print(duplicates)

Series([], Name: count, dtype: int64)


In [12]:
# Lambda:

# create new column for the cleaned original data (cleaned_summary):
random_sample['cleaned_text'] = random_sample['transcript'].apply(

    lambda x: '.\n'.join([s for s in pd.unique([s.strip().lower() for s in re.split(r'[.!?]\s+', x)]) if s])

)

random_sample.head()

  lambda x: '.\n'.join([s for s in pd.unique([s.strip().lower() for s in re.split(r'[.!?]\s+', x)]) if s])


Unnamed: 0,date,id,transcript,UID,cleaned_text
1860,2020-07-21,50197,It works exactly as described. Setup was quick...,2020-07-21_50197,it works exactly as described.\nsetup was quic...
353,2024-09-05,71216,This was by far the worst customer experience ...,2024-09-05_71216,this was by far the worst customer experience ...
1333,2021-12-30,54326,Why would anyone suggest this product for use?...,2021-12-30_54326,why would anyone suggest this product for use....
905,2023-03-03,53918,Why would anyone suggest this product for use?...,2023-03-03_53918,why would anyone suggest this product for use....
1289,2022-02-12,77853,I found the quality to be outstanding. The pro...,2022-02-12_77853,i found the quality to be outstanding.\nthe pr...


In [6]:
# Long form function which will provide the output of what was extracted

def extract_sentences_and_update_df(df, column_name):

    # Regex to split sentences based on punctuation followed by whitespace

    sentence_endings = re.compile(r"""

        (?<=        # look behind: match if preceded by.

        [.!?]       # Sentence ending punctuation.

        )\s+        # Followed by one or more whitespace characters.

        """, re.X)

 

    all_original_sentences = []

    all_cleaned_sentences = []

 

    # Collect all sentences across the column

    for paragraph in df[column_name].dropna():

        sentences = sentence_endings.split(paragraph.strip())

        for sentence in sentences:

            if sentence.strip():

                cleaned = re.sub(r'\W+', '', sentence.lower())

                all_original_sentences.append(sentence.strip())

                all_cleaned_sentences.append(cleaned)

 

    # Identify duplicates

    sentence_counts = Counter(all_cleaned_sentences)

    duplicate_cleaned = {s for s, count in sentence_counts.items() if count > 1}

 

    # Map cleaned to original sentence

    cleaned_to_original = {}

    for orig, clean in zip(all_original_sentences, all_cleaned_sentences):

        if clean in duplicate_cleaned and clean not in cleaned_to_original:

            cleaned_to_original[clean] = orig

 

    # Create new columns for each row

    duplicate_sentences_column = []

    texts_without_duplicates_column = []

 

    for paragraph in df[column_name]:

        if pd.isna(paragraph):

            duplicate_sentences_column.append("")

            texts_without_duplicates_column.append("")

            continue

 

        sentences = sentence_endings.split(paragraph.strip())

        filtered_sentences = []

        duplicates_in_row = []

        seen_cleaned = set()

 

        for sentence in sentences:

            cleaned = re.sub(r'\W+', '', sentence.lower())

            if cleaned in duplicate_cleaned:

                if cleaned not in seen_cleaned:

                    filtered_sentences.append(sentence.strip())  # keep first occurrence

                    seen_cleaned.add(cleaned)

                else:

                    duplicates_in_row.append(sentence.strip())  # subsequent duplicates

            else:

                filtered_sentences.append(sentence.strip())

        # append to new line - \n should not be present in the output to .csv        

        duplicate_sentences_column.append('\n'.join(duplicates_in_row))

        texts_without_duplicates_column.append('\n'.join(filtered_sentences))

 

    # Add new columns to the DataFrame

    df['duplicate_sentences'] = duplicate_sentences_column

    df['text_without_duplicates'] = texts_without_duplicates_column

 

    return df

 

# Apply the function

text_summary_df = extract_sentences_and_update_df(random_sample, 'transcript')

text_summary_df

Unnamed: 0,date,id,transcript,UID,duplicate_sentences,text_without_duplicates
1860,2020-07-21,85914,Value for money is excellent. Delivery was pro...,2020-07-21_85914,Value for money is excellent.\nDelivery was pr...,Value for money is excellent.\nDelivery was pr...
353,2024-09-05,33304,Value for money is excellent. The product exce...,2024-09-05_33304,The product exceeded my expectations!\nThe pro...,Value for money is excellent.\nThe product exc...
1333,2021-12-30,88122,I found the quality to be outstanding. Deliver...,2021-12-30_88122,I found the quality to be outstanding.\nDelive...,I found the quality to be outstanding.\nDelive...
905,2023-03-03,55857,I have tried to return this product on numerou...,2023-03-03_55857,Why would anyone suggest this product for use?,I have tried to return this product on numerou...
1289,2022-02-12,31244,Value for money is excellent. Value for money ...,2022-02-12_31244,Value for money is excellent.\nWhy would anyon...,Value for money is excellent.\nWhy would anyon...
...,...,...,...,...,...,...
1810,2020-09-09,78893,The packaging was neat and secure. Customer se...,2020-09-09_78893,The packaging was neat and secure.,The packaging was neat and secure.\nCustomer s...
1573,2021-05-04,56872,Value for money is excellent. Value for money ...,2021-05-04_56872,Value for money is excellent.\nSetup was quick...,Value for money is excellent.\nI have tried to...
44,2025-07-11,35104,The product exceeded my expectations! The prod...,2025-07-11_35104,The product exceeded my expectations!\nI would...,The product exceeded my expectations!\nWhy wou...
1686,2021-01-11,23928,Why would anyone suggest this product for use?...,2021-01-11_23928,Why would anyone suggest this product for use?...,Why would anyone suggest this product for use?...
