In [3]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import time

In [4]:
# Load the raw train and test CSV files, decoding them as ISO-8859-1.
# NOTE(review): no `header=` / `names=` argument is passed, so pandas uses the
# first row of each file as the column labels (the drop() calls further down
# reference labels such as '0' and '1467810369'). Confirm that consuming the
# first data row as a header is intended.
train_dataset = pd.read_csv(
    "resources/train.csv",
    encoding="ISO-8859-1",
)
test_dataset = pd.read_csv(
    "resources/test.csv",
    encoding="ISO-8859-1",
)


In [5]:
# Drop five columns by label; downstream cells read the text from the first
# remaining column via `.iloc[:, 0]`.
# NOTE(review): the frame is reassigned to itself, so re-running this cell
# raises KeyError because the labels are already gone.
train_dataset = train_dataset.drop(
    columns=['0', '1467810369', 'Mon Apr 06 22:19:45 PDT 2009', 'NO_QUERY', '_TheSpecialOne_'],
)

In [6]:
# Drop five columns by label; downstream cells read the text from the first
# remaining column via `.iloc[:, 0]`.
# NOTE(review): the frame is reassigned to itself, so re-running this cell
# raises KeyError because the labels are already gone.
test_dataset = test_dataset.drop(
    columns=['4', '3', 'Mon May 11 03:17:40 UTC 2009', 'kindle2', 'tpryan'],
)

In [7]:
# Preview the first rows of the test frame after the column drop.
test_dataset.head()

Unnamed: 0,"@stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right."
0,Reading my kindle2... Love it... Lee childs i...
1,"Ok, first assesment of the #kindle2 ...it fuck..."
2,@kenburbary You'll love your Kindle2. I've had...
3,@mikefish Fair enough. But i have the Kindle2...
4,@richardebaker no. it is too big. I'm quite ha...


In [8]:
def clean_text(text):
    """Keep only ASCII letters and single spaces in ``text``.

    Args:
        text: The raw text. ``None`` and float NaN (what pandas yields for an
            empty CSV cell) are treated as an empty string; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: ``text`` with every character other than ``A-Z``, ``a-z`` and
        whitespace removed, each whitespace run collapsed to one space, and
        leading/trailing whitespace stripped.
    """
    if not isinstance(text, str):
        # `text != text` is true only for NaN; without this guard re.sub()
        # raises TypeError on missing values.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove special chars and numbers
    text = re.sub(r'\s+', ' ', text)  # Replace whitespace runs with a single space
    return text.strip()  # Remove leading/trailing whitespace

In [9]:
# Run clean_text over every value in the first column of each frame.
train_clean_texts = list(map(clean_text, train_dataset.iloc[:, 0]))
test_clean_texts = list(map(clean_text, test_dataset.iloc[:, 0]))


## Tokenized Dataset

In [10]:
# Word- and sentence-level tokenization of the cleaned train texts.
# NOTE(review): clean_text() has already removed every non-letter character,
# including sentence-ending punctuation, so sent_tokenize has no punctuation
# left to split on here — confirm that this is the intended input.
train_tokenize_word = list(map(word_tokenize, train_clean_texts))
train_tokenize_sent = list(map(sent_tokenize, train_clean_texts))

In [11]:
# Word- and sentence-level tokenization of the cleaned test texts.
# NOTE(review): clean_text() has already removed every non-letter character,
# including sentence-ending punctuation, so sent_tokenize has no punctuation
# left to split on here — confirm that this is the intended input.
test_tokenize_word = list(map(word_tokenize, test_clean_texts))
test_tokenize_sent = list(map(sent_tokenize, test_clean_texts))

In [12]:
# Display the first five word-token lists and sentence lists of the train set.
train_tokenize_word[:5],train_tokenize_sent[:5]

([['is',
   'upset',
   'that',
   'he',
   'cant',
   'update',
   'his',
   'Facebook',
   'by',
   'texting',
   'it',
   'and',
   'might',
   'cry',
   'as',
   'a',
   'result',
   'School',
   'today',
   'also',
   'Blah'],
  ['Kenichan',
   'I',
   'dived',
   'many',
   'times',
   'for',
   'the',
   'ball',
   'Managed',
   'to',
   'save',
   'The',
   'rest',
   'go',
   'out',
   'of',
   'bounds'],
  ['my',
   'whole',
   'body',
   'feels',
   'itchy',
   'and',
   'like',
   'its',
   'on',
   'fire'],
  ['nationwideclass',
   'no',
   'its',
   'not',
   'behaving',
   'at',
   'all',
   'im',
   'mad',
   'why',
   'am',
   'i',
   'here',
   'because',
   'I',
   'cant',
   'see',
   'you',
   'all',
   'over',
   'there'],
  ['Kwesidei', 'not', 'the', 'whole', 'crew']],
 [['is upset that he cant update his Facebook by texting it and might cry as a result School today also Blah'],
  ['Kenichan I dived many times for the ball Managed to save The rest go out of bound

In [13]:
# Display the first five word-token lists and sentence lists of the test set.
test_tokenize_word[:5],test_tokenize_sent[:5]

([['Reading',
   'my',
   'kindle',
   'Love',
   'it',
   'Lee',
   'childs',
   'is',
   'good',
   'read'],
  ['Ok',
   'first',
   'assesment',
   'of',
   'the',
   'kindle',
   'it',
   'fucking',
   'rocks'],
  ['kenburbary',
   'Youll',
   'love',
   'your',
   'Kindle',
   'Ive',
   'had',
   'mine',
   'for',
   'a',
   'few',
   'months',
   'and',
   'never',
   'looked',
   'back',
   'The',
   'new',
   'big',
   'one',
   'is',
   'huge',
   'No',
   'need',
   'for',
   'remorse'],
  ['mikefish',
   'Fair',
   'enough',
   'But',
   'i',
   'have',
   'the',
   'Kindle',
   'and',
   'I',
   'think',
   'its',
   'perfect'],
  ['richardebaker',
   'no',
   'it',
   'is',
   'too',
   'big',
   'Im',
   'quite',
   'happy',
   'with',
   'the',
   'Kindle']],
 [['Reading my kindle Love it Lee childs is good read'],
  ['Ok first assesment of the kindle it fucking rocks'],
  ['kenburbary Youll love your Kindle Ive had mine for a few months and never looked back The new big

In [14]:
# English stop-word set used by the filtering below (and by later cells).
stop_words = set(stopwords.words('english'))

# Lower-case every token and keep it only if it is not a stop word.
train_tokenize_word_cleaned = [
    [token for token in map(str.lower, tokens) if token not in stop_words]
    for tokens in train_tokenize_word
]

# NOTE(review): each element of train_tokenize_sent is a list of sentence
# strings, so the membership test here compares a whole lower-cased sentence
# against the stop-word set rather than individual words.
train_tokenize_sent_cleaned = [
    [sentence for sentence in map(str.lower, sentences) if sentence not in stop_words]
    for sentences in train_tokenize_sent
]


In [15]:
# Lower-case every token and keep it only if it is not a stop word.
test_tokenize_word_cleaned = [
    [token for token in map(str.lower, tokens) if token not in stop_words]
    for tokens in test_tokenize_word
]

# NOTE(review): each element of test_tokenize_sent is a list of sentence
# strings, so the membership test here compares a whole lower-cased sentence
# against the stop-word set rather than individual words.
test_tokenize_sent_cleaned = [
    [sentence for sentence in map(str.lower, sentences) if sentence not in stop_words]
    for sentences in test_tokenize_sent
]


In [16]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# NOTE(review): `tokenizer` is not referenced in the visible part of the notebook.
tokenizer = RegexpTokenizer(r'\w+|[^\w\s]+')

# Apply the stemmer / lemmatizer to every item of every inner list.
# NOTE(review): the inner items of the *_sent_cleaned lists are whole sentence
# strings, so each sentence is passed to stem()/lemmatize() as a single unit.

# --- Train ---
train_tokenize_word_stemmed = [
    list(map(stemmer.stem, items)) for items in train_tokenize_word_cleaned
]
train_tokenize_sent_stemmed = [
    list(map(stemmer.stem, items)) for items in train_tokenize_sent_cleaned
]
train_tokenize_word_lemmatized = [
    list(map(lemmatizer.lemmatize, items)) for items in train_tokenize_word_cleaned
]
train_tokenize_sent_lemmatized = [
    list(map(lemmatizer.lemmatize, items)) for items in train_tokenize_sent_cleaned
]

# --- Test ---
test_tokenize_word_stemmed = [
    list(map(stemmer.stem, items)) for items in test_tokenize_word_cleaned
]
test_tokenize_sent_stemmed = [
    list(map(stemmer.stem, items)) for items in test_tokenize_sent_cleaned
]
test_tokenize_word_lemmatized = [
    list(map(lemmatizer.lemmatize, items)) for items in test_tokenize_word_cleaned
]
test_tokenize_sent_lemmatized = [
    list(map(lemmatizer.lemmatize, items)) for items in test_tokenize_sent_cleaned
]

In [17]:
import re

# Emoji detection function
def contains_emoji(s):
    """Report whether ``s`` contains a character from the listed emoji/symbol ranges.

    Args:
        s (str): The string to inspect.

    Returns:
        bool: True if at least one character of ``s`` falls inside one of the
        Unicode ranges below, False otherwise.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags (iOS)
        "\U00002700-\U000027BF"  # Dingbats
        "\U00002600-\U000026FF"  # Misc symbols
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols
        "\U0001FA70-\U0001FAFF"  # Extended Symbols
        "\U000025A0-\U000025FF"  # Geometric Shapes
    )
    # One character class built from all of the ranges above.
    match = re.search("[" + emoji_ranges + "]", s, flags=re.UNICODE)
    return match is not None

# Function to scan a 2D list for emojis
def check_for_emojis(tokenized_data, label):
    """Look for the first emoji-bearing item in a list of lists and report it.

    Args:
        tokenized_data: An iterable of iterables of strings.
        label (str): Name used for this data set in the printed message.

    Returns:
        bool: True as soon as one item passes ``contains_emoji`` (its outer
        index and value are printed), False if none does.
    """
    # Lazy scan: evaluation stops at the first hit, like an early return.
    hits = (
        (row, item)
        for row, items in enumerate(tokenized_data)
        for item in items
        if contains_emoji(item)
    )
    first_hit = next(hits, None)
    if first_hit is None:
        print(f"No emojis found in {label}.")
        return False

    i, word = first_hit
    print(f"Emoji found in {label} at sentence {i}: '{word}'")
    return True

# Check train and test tokenized data
print("Checking for emojis in train and test data:")

# Scan every stemmed / lemmatized variant: the four train lists first, then the
# four test lists. Each call prints its own result line.
# NOTE(review): all of these lists derive from clean_text(), which keeps only
# ASCII letters and whitespace, so no emoji can be present at this point.
# The return value of the final call is the cell's last expression.
check_for_emojis(train_tokenize_word_stemmed, "train_tokenize_word_stemmed")
check_for_emojis(train_tokenize_sent_stemmed, "train_tokenize_sent_stemmed")
check_for_emojis(train_tokenize_word_lemmatized, "train_tokenize_word_lemmatized")
check_for_emojis(train_tokenize_sent_lemmatized, "train_tokenize_sent_lemmatized")

check_for_emojis(test_tokenize_word_stemmed, "test_tokenize_word_stemmed")
check_for_emojis(test_tokenize_sent_stemmed, "test_tokenize_sent_stemmed")
check_for_emojis(test_tokenize_word_lemmatized, "test_tokenize_word_lemmatized")
check_for_emojis(test_tokenize_sent_lemmatized, "test_tokenize_sent_lemmatized")


Checking for emojis in train and test data:
No emojis found in train_tokenize_word_stemmed.
No emojis found in train_tokenize_sent_stemmed.
No emojis found in train_tokenize_word_lemmatized.
No emojis found in train_tokenize_sent_lemmatized.
No emojis found in test_tokenize_word_stemmed.
No emojis found in test_tokenize_sent_stemmed.
No emojis found in test_tokenize_word_lemmatized.
No emojis found in test_tokenize_sent_lemmatized.


False

In [18]:
from itertools import chain

# Remove one level of nesting from each list of lists, producing flat lists.
# The *_sent_* results hold one sentence string per item; the *_word_* results
# hold one token per item.
train_tokenize_sent_stemmed_flattened = [
    item for group in train_tokenize_sent_stemmed for item in group
]
train_tokenize_sent_lemmatized_flattened = [
    item for group in train_tokenize_sent_lemmatized for item in group
]
train_tokenize_word_clean_lemmatized_flattend = [
    item for group in train_tokenize_word_lemmatized for item in group
]
train_tokenize_word_clean_stemmed_flattend = [
    item for group in train_tokenize_word_stemmed for item in group
]

test_tokenize_sent_stemmed_flattened = [
    item for group in test_tokenize_sent_stemmed for item in group
]
test_tokenize_sent_lemmatized_flattened = [
    item for group in test_tokenize_sent_lemmatized for item in group
]
test_tokenize_word_clean_lemmatized_flattend = [
    item for group in test_tokenize_word_lemmatized for item in group
]
test_tokenize_word_clean_stemmed_flattend = [
    item for group in test_tokenize_word_stemmed for item in group
]



# 1.1 Average Sentence Length

In [19]:
def average_sentence_length(flattened_words, flattened_sentences):
    """Return the number of words divided by the number of sentences.

    Args:
        flattened_words: Sized collection of all words.
        flattened_sentences: Sized collection of all sentences.

    Returns:
        ``len(flattened_words) / len(flattened_sentences)``, or 0 when there
        are no sentences.
    """
    sentence_count = len(flattened_sentences)
    if sentence_count > 0:
        return len(flattened_words) / sentence_count
    return 0
# Pair each flat word list with the matching flat sentence list.
datasets = {
    "Train Sentences Stemmed": (
        train_tokenize_word_clean_stemmed_flattend,
        train_tokenize_sent_stemmed_flattened,
    ),
    "Train Sentences Lemmatized": (
        train_tokenize_word_clean_lemmatized_flattend,
        train_tokenize_sent_lemmatized_flattened,
    ),
    "Test Sentences Stemmed": (
        test_tokenize_word_clean_stemmed_flattend,
        test_tokenize_sent_stemmed_flattened,
    ),
    "Test Sentences Lemmatized": (
        test_tokenize_word_clean_lemmatized_flattend,
        test_tokenize_sent_lemmatized_flattened,
    ),
}

# Report the average and how long the (single-division) computation took.
for name in datasets:
    flattened_words, flattened_sentences = datasets[name]
    start_time = time.time()  # Start timing
    avg_length = average_sentence_length(flattened_words, flattened_sentences)
    end_time = time.time()  # End timing
    runtime = end_time - start_time
    print(f"Average sentence length in {name}: {avg_length:.2f}")
    print(f"  Runtime: {runtime:.6f} seconds\n")


Average sentence length in Train Sentences Stemmed: 7.72
  Runtime: 0.000000 seconds

Average sentence length in Train Sentences Lemmatized: 7.72
  Runtime: 0.000000 seconds

Average sentence length in Test Sentences Stemmed: 8.49
  Runtime: 0.000000 seconds

Average sentence length in Test Sentences Lemmatized: 8.49
  Runtime: 0.000000 seconds



# 1.2 Word and Sentence Counts

In [35]:
# Print the size of each flat word list and flat sentence list.
# The timed region covers only the len() calls and the printing below.
start_time = time.time()
print(f"(Stemming Train set) word count: {len(train_tokenize_word_clean_stemmed_flattend)} , Sentences Count: {len(train_tokenize_sent_stemmed_flattened)}")
print(f"(Lemmatizing Train set) word count: {len(train_tokenize_word_clean_lemmatized_flattend)} , Sentences Count: {len(train_tokenize_sent_lemmatized_flattened)}")
print(f"(Stemming Test set) word count: {len(test_tokenize_word_clean_stemmed_flattend)} , Sentences Count: {len(test_tokenize_sent_stemmed_flattened)}")
print(f"(Lemmatizing Test set) word count: {len(test_tokenize_word_clean_lemmatized_flattend)} , Sentences Count: {len(test_tokenize_sent_lemmatized_flattened)}")
end_time = time.time()  # End timing
runtime = end_time - start_time  # Elapsed wall-clock seconds
print(f"  Runtime: {runtime:.6f} seconds\n")

(Stemming Train set) word count: 12344117 , Sentences Count: 1599962
(Lemmatizing Train set) word count: 12344117 , Sentences Count: 1599962
(Stemming Test set) word count: 4219 , Sentences Count: 497
(Lemmatizing Test set) word count: 4219 , Sentences Count: 497
  Runtime: 0.000000 seconds



# 1.3 Vocabulary size (number of unique words).

In [21]:
def unique_word_count(tokens):
    """Return how many distinct items ``tokens`` contains.

    Args:
        tokens: Iterable of hashable items.

    Returns:
        int: Number of distinct items.
    """
    distinct = set()
    distinct.update(tokens)
    return len(distinct)

In [36]:
start_time = time.time()  # Start timing

# Distinct-item count for every flat train list, keyed by the variable's name.
train_unique_counts = {
    label: unique_word_count(items)
    for label, items in (
        ("train_tokenize_sent_stemmed_flattened", train_tokenize_sent_stemmed_flattened),
        ("train_tokenize_sent_lemmatized_flattened", train_tokenize_sent_lemmatized_flattened),
        ("train_tokenize_word_clean_lemmatized_flattend", train_tokenize_word_clean_lemmatized_flattend),
        ("train_tokenize_word_clean_stemmed_flattend", train_tokenize_word_clean_stemmed_flattend),
    )
}

# Same for the flat test lists.
test_unique_counts = {
    label: unique_word_count(items)
    for label, items in (
        ("test_tokenize_sent_stemmed_flattened", test_tokenize_sent_stemmed_flattened),
        ("test_tokenize_sent_lemmatized_flattened", test_tokenize_sent_lemmatized_flattened),
        ("test_tokenize_word_clean_lemmatized_flattend", test_tokenize_word_clean_lemmatized_flattend),
        ("test_tokenize_word_clean_stemmed_flattend", test_tokenize_word_clean_stemmed_flattend),
    )
}

end_time = time.time()  # End timing
runtime = end_time - start_time
print(f"  Runtime: {runtime:.6f} seconds\n")

  Runtime: 1.710448 seconds



In [23]:
print("Train Unique Word Counts:")
for name, count in train_unique_counts.items():
    print(f"{name}: Unique Word Count = {count}")

print("\nTest Unique Word Counts:")
for name, count in test_unique_counts.items():
    print(f"{name}: Unique Word Count = {count}")

Train Unique Word Counts:
train_tokenize_sent_stemmed_flattened: Unique Word Count = 1566474
train_tokenize_sent_lemmatized_flattened: Unique Word Count = 1566849
train_tokenize_word_clean_lemmatized_flattend: Unique Word Count = 778982
train_tokenize_word_clean_stemmed_flattend: Unique Word Count = 727141

Test Unique Word Counts:
test_tokenize_sent_stemmed_flattened: Unique Word Count = 497
test_tokenize_sent_lemmatized_flattened: Unique Word Count = 497
test_tokenize_word_clean_lemmatized_flattend: Unique Word Count = 2035
test_tokenize_word_clean_stemmed_flattend: Unique Word Count = 1902


# 1.4 Max word length, avg. min/max sentence length.

In [24]:
def max_word_length(tokens):
    """Return the length of the longest item in ``tokens``.

    Args:
        tokens: Iterable of sized items (e.g. strings), or ``None``.

    Returns:
        int: The largest ``len(item)``, or 0 when ``tokens`` is ``None`` or
        yields no items.
    """
    if tokens is None:
        return 0
    # `default=0` also covers empty iterators/generators, which are truthy and
    # would otherwise make max() raise ValueError.
    return max(map(len, tokens), default=0)

In [37]:
# Start timing inside this cell. Without this assignment the runtime printed
# below is measured from whatever `start_time` an earlier cell left behind.
start_time = time.time()

# Train maximum word length
train_max_lengths = {
    "train_tokenize_sent_stemmed_flattened": max_word_length(train_tokenize_sent_stemmed_flattened),
    "train_tokenize_sent_lemmatized_flattened": max_word_length(train_tokenize_sent_lemmatized_flattened),
    "train_tokenize_word_clean_lemmatized_flattend": max_word_length(train_tokenize_word_clean_lemmatized_flattend),
    "train_tokenize_word_clean_stemmed_flattend": max_word_length(train_tokenize_word_clean_stemmed_flattend)
}

# Test maximum word length
test_max_lengths = {
    "test_tokenize_sent_stemmed_flattened": max_word_length(test_tokenize_sent_stemmed_flattened),
    "test_tokenize_sent_lemmatized_flattened": max_word_length(test_tokenize_sent_lemmatized_flattened),
    "test_tokenize_word_clean_lemmatized_flattend": max_word_length(test_tokenize_word_clean_lemmatized_flattend),
    "test_tokenize_word_clean_stemmed_flattend": max_word_length(test_tokenize_word_clean_stemmed_flattend)
}
end_time = time.time()  # End timing
runtime = end_time - start_time  # Elapsed wall-clock seconds for this cell only
print(f"  Runtime: {runtime:.6f} seconds\n")

  Runtime: 8.794518 seconds



In [26]:
print("Train Max Word Lengths:")
for name, length in train_max_lengths.items():
    print(f"{name}: Max Word Length = {length}")

print("\nTest Max Word Lengths:")
for name, length in test_max_lengths.items():
    print(f"{name}: Max Word Length = {length}")

Train Max Word Lengths:
train_tokenize_sent_stemmed_flattened: Max Word Length = 176
train_tokenize_sent_lemmatized_flattened: Max Word Length = 177
train_tokenize_word_clean_lemmatized_flattend: Max Word Length = 125
train_tokenize_word_clean_stemmed_flattend: Max Word Length = 123

Test Max Word Lengths:
test_tokenize_sent_stemmed_flattened: Max Word Length = 138
test_tokenize_sent_lemmatized_flattened: Max Word Length = 139
test_tokenize_word_clean_lemmatized_flattend: Max Word Length = 46
test_tokenize_word_clean_stemmed_flattend: Max Word Length = 46


In [27]:
def min_max_sentence_length(sentences):
    """Return the smallest and largest ``len()`` among the items of ``sentences``.

    Note that the length is whatever ``len(item)`` reports: the number of
    characters for a string item, the number of elements for a list item.

    Args:
        sentences: Iterable of sized items.

    Returns:
        tuple[int, int]: ``(min_length, max_length)``. An empty input yields
        ``(0, 0)`` so that callers can always unpack the result (previously
        the function fell through and returned ``None``).
    """
    sentence_lengths = [len(sentence) for sentence in sentences]
    if not sentence_lengths:
        return 0, 0
    return min(sentence_lengths), max(sentence_lengths)

In [28]:
start_time = time.time()  # Start timing

# (min, max) item length for every flat train list, keyed by the variable's name.
# NOTE(review): the inputs are flat lists of strings, so the lengths measured
# are character counts — of whole sentences for the *_sent_* lists and of
# single tokens for the *_word_* lists.
train_min_max_lengths = {
    label: min_max_sentence_length(items)
    for label, items in (
        ("train_tokenize_sent_stemmed_flattened", train_tokenize_sent_stemmed_flattened),
        ("train_tokenize_sent_lemmatized_flattened", train_tokenize_sent_lemmatized_flattened),
        ("train_tokenize_word_clean_lemmatized_flattend", train_tokenize_word_clean_lemmatized_flattend),
        ("train_tokenize_word_clean_stemmed_flattend", train_tokenize_word_clean_stemmed_flattend),
    )
}

# Same for the flat test lists.
test_min_max_lengths = {
    label: min_max_sentence_length(items)
    for label, items in (
        ("test_tokenize_sent_stemmed_flattened", test_tokenize_sent_stemmed_flattened),
        ("test_tokenize_sent_lemmatized_flattened", test_tokenize_sent_lemmatized_flattened),
        ("test_tokenize_word_clean_lemmatized_flattend", test_tokenize_word_clean_lemmatized_flattend),
        ("test_tokenize_word_clean_stemmed_flattend", test_tokenize_word_clean_stemmed_flattend),
    )
}

end_time = time.time()  # End timing
runtime = end_time - start_time
print(f"  Runtime: {runtime:.6f} seconds\n")

  Runtime: 0.000000 seconds



In [29]:
print("Train Min and Max Sentence Lengths:")
for name, (min_len, max_len) in train_min_max_lengths.items():
    print(f"{name}: Min Sentence Length = {min_len}, Max Sentence Length = {max_len}")

print("\nTest Min and Max Sentence Lengths:")
for name, (min_len, max_len) in test_min_max_lengths.items():
    print(f"{name}: Min Sentence Length = {min_len}, Max Sentence Length = {max_len}")

Train Min and Max Sentence Lengths:
train_tokenize_sent_stemmed_flattened: Min Sentence Length = 2, Max Sentence Length = 176
train_tokenize_sent_lemmatized_flattened: Min Sentence Length = 2, Max Sentence Length = 177
train_tokenize_word_clean_lemmatized_flattend: Min Sentence Length = 1, Max Sentence Length = 125
train_tokenize_word_clean_stemmed_flattend: Min Sentence Length = 1, Max Sentence Length = 123

Test Min and Max Sentence Lengths:
test_tokenize_sent_stemmed_flattened: Min Sentence Length = 10, Max Sentence Length = 138
test_tokenize_sent_lemmatized_flattened: Min Sentence Length = 11, Max Sentence Length = 139
test_tokenize_word_clean_lemmatized_flattend: Min Sentence Length = 1, Max Sentence Length = 46
test_tokenize_word_clean_stemmed_flattend: Min Sentence Length = 1, Max Sentence Length = 46


# 1.5 Preprocessing statistics: emoticons removed, stop words removed, token count, lowercased text, special characters removed

In [30]:
def preprocess_text_with_runtime(data):
    """
    Preprocess the text in the first column of a DataFrame, collect per-row
    metrics, and measure the total runtime.

    For each row the text has emoticons removed, is scanned for phone-number,
    account-number and address-like patterns, is tokenized with
    ``word_tokenize``, has English stop words removed, is lower-cased, and
    finally has special characters (anything that is neither a word character
    nor whitespace) removed.

    Args:
        data (DataFrame): Input DataFrame; the text is read from its first
            column via ``data.iloc[:, 0]``.

    Returns:
        tuple: ``(results_list, runtime)`` where ``results_list`` holds one
        dictionary per row with the keys "Emoticons Removed",
        "Stop Words Removed", "Token Count" (tokens left after stop-word
        removal), "Lowercase Applied" (the final lower-cased text with
        special characters removed), "Special Characters Removed",
        "Phone Numbers Found", "Account Numbers Found" and "Addresses Found";
        ``runtime`` is the elapsed time in seconds.
    """
    # perf_counter() is a high-resolution clock intended for measuring intervals.
    start_time = time.perf_counter()

    # Loop-invariant resources: built once here instead of once per row.
    emoticon_pattern = re.compile("[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F1E0-\U0001F1FF]")
    # Ten digits in 3-3-4 groups, optionally separated by one '-', '.' or
    # whitespace character (a parenthesised area code is NOT matched).
    phone_pattern = re.compile(r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b")
    # Stand-alone runs of 8-20 digits; a 10-digit phone number with no
    # separators is therefore counted here as well.
    account_pattern = re.compile(r"\b\d{8,20}\b")
    # Basic heuristic: a number followed by two alphabetic words.
    address_pattern = re.compile(r"\b\d+\s+[A-Za-z]+\s+[A-Za-z]+")
    special_char_pattern = re.compile(r"[^\w\s]")
    stop_words = set(stopwords.words('english'))

    results_list = []

    for text in data.iloc[:, 0]:  # Access the first column
        # 1. Remove emoticons
        emoticons = emoticon_pattern.findall(text)
        text_no_emoticons = emoticon_pattern.sub("", text)

        # 2-4. Count phone-number, account-number and address-like matches
        phone_count = len(phone_pattern.findall(text_no_emoticons))
        account_count = len(account_pattern.findall(text_no_emoticons))
        address_count = len(address_pattern.findall(text_no_emoticons))

        # 5. Tokenize text
        tokens = word_tokenize(text_no_emoticons)

        # 6. Remove stop words (case-insensitive comparison)
        tokens_no_stopwords = [word for word in tokens if word.lower() not in stop_words]
        stopwords_removed = len(tokens) - len(tokens_no_stopwords)

        # 7. Count tokens that survived stop-word removal
        token_count = len(tokens_no_stopwords)

        # 8. Convert to lowercase and re-join into one string
        lowercase_text = " ".join(word.lower() for word in tokens_no_stopwords)

        # 9. Remove special characters
        special_chars = special_char_pattern.findall(lowercase_text)
        text_no_special_chars = special_char_pattern.sub("", lowercase_text)

        results_list.append({
            "Emoticons Removed": len(emoticons),
            "Stop Words Removed": stopwords_removed,
            "Token Count": token_count,
            "Lowercase Applied": text_no_special_chars,
            "Special Characters Removed": len(special_chars),
            "Phone Numbers Found": phone_count,
            "Account Numbers Found": account_count,
            "Addresses Found": address_count,
        })

    runtime = time.perf_counter() - start_time

    return results_list, runtime

In [31]:
# Run the per-row preprocessing on both frames; each call returns the list of
# per-row result dictionaries together with its elapsed time in seconds.
train_results, train_runtime = preprocess_text_with_runtime(train_dataset)
test_results , test_runtime = preprocess_text_with_runtime(test_dataset)
# Print the train results
print("Train Results:")
for i, result in enumerate(train_results[:5]):  # Print the first 5 rows for brevity
    print(f"Row {i + 1}: {result}")

# Print the train runtime
print(f"Train Processing Runtime: {train_runtime:.4f} seconds\n")

# Print the test results
print("Test Results:")
for i, result in enumerate(test_results[:5]):  # Print the first 5 rows for brevity
    print(f"Row {i + 1}: {result}")

# Print the test runtime
print(f"Test Processing Runtime: {test_runtime:.4f} seconds")


Train Results:
Row 1: {'Emoticons Removed': 0, 'Stop Words Removed': 9, 'Token Count': 16, 'Lowercase Applied': 'upset ca nt update facebook texting  might cry result school today also  blah ', 'Special Characters Removed': 6, 'Phone Numbers Found': 0, 'Account Numbers Found': 0, 'Addresses Found': 0}
Row 2: {'Emoticons Removed': 0, 'Stop Words Removed': 7, 'Token Count': 14, 'Lowercase Applied': ' kenichan dived many times ball  managed save 50  rest go bounds', 'Special Characters Removed': 3, 'Phone Numbers Found': 0, 'Account Numbers Found': 0, 'Addresses Found': 0}
Row 3: {'Emoticons Removed': 0, 'Stop Words Removed': 4, 'Token Count': 6, 'Lowercase Applied': 'whole body feels itchy like fire', 'Special Characters Removed': 0, 'Phone Numbers Found': 0, 'Account Numbers Found': 0, 'Addresses Found': 0}
Row 4: {'Emoticons Removed': 0, 'Stop Words Removed': 16, 'Token Count': 14, 'Lowercase Applied': ' nationwideclass  s behaving  m mad   ca nt see ', 'Special Characters Removed': 9,

# Readability Scores (e.g., Flesch-Kincaid): Ensure text remains interpretable
# Lexical Diversity: ratio of unique words to total words (read and use).

In [33]:
import textstat

start_time = time.time()  # Start timing

# Flesch reading-ease score for every item of the flattened stemmed train
# sentences, in order.
train_stemmed_scores = [
    textstat.flesch_reading_ease(sentence)
    for sentence in train_tokenize_sent_stemmed_flattened
]

# Show the first six scores; this printing is part of the timed region.
for i in range(6):
    print(train_stemmed_scores[i])

end_time = time.time()  # End timing
runtime = end_time - start_time
print(f"  Runtime: {runtime:.6f} seconds\n")

75.54
96.52
112.09
75.54
117.16
119.19
  Runtime: 26.633111 seconds



In [34]:
def lexical_diversity(flattened_words):
    """Return the ratio of distinct words to total words.

    Args:
        flattened_words: Sized collection of hashable words.

    Returns:
        ``len(set(flattened_words)) / len(flattened_words)``, or 0 when the
        collection is empty.
    """
    total_words = len(flattened_words)
    if total_words > 0:
        return len(set(flattened_words)) / total_words
    return 0

# Measure time for train lexical diversity calculation.
# time.perf_counter() is used because these computations can finish faster
# than the resolution of time.time(), which then reports 0.000000 seconds.
start_time_train = time.perf_counter()  # Start timing
train_lexical_diversity = lexical_diversity(train_tokenize_word_clean_stemmed_flattend)
end_time_train = time.perf_counter()  # End timing
train_runtime = end_time_train - start_time_train  # Elapsed seconds

# Measure time for test lexical diversity calculation
start_time_test = time.perf_counter()  # Start timing
test_lexical_diversity = lexical_diversity(test_tokenize_word_clean_stemmed_flattend)
end_time_test = time.perf_counter()  # End timing
test_runtime = end_time_test - start_time_test  # Elapsed seconds

# Print results
print(f"Train Lexical Diversity: {train_lexical_diversity:.4f}")
print(f"Train Runtime: {train_runtime:.6f} seconds\n")

print(f"Test Lexical Diversity: {test_lexical_diversity:.4f}")
print(f"Test Runtime: {test_runtime:.6f} seconds")


Train Lexical Diversity: 0.0589
Train Runtime: 0.700918 seconds

Test Lexical Diversity: 0.4508
Test Runtime: 0.001000 seconds
