# 03 — Data Loading and Preprocessing

This notebook loads the cleaned TED dataset and applies additional preprocessing to prepare the data for modeling and analysis.

Focus areas include:
- Text normalization (lowercasing, punctuation removal, stopword removal, lemmatization)
- Feature extraction (word/sentence/char count, readability)
- Visualizing linguistic structures
- Saving a preprocessed dataset for modeling

In [12]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string

# Put the current working directory on the import path so the project-local
# `utils` package resolves, then run its environment bootstrap.
# NOTE(review): what init_environment() configures is not visible from this
# notebook — see utils/setup.py.
sys.path.append(os.path.abspath(os.curdir))

from utils.setup import init_environment

init_environment()

Environment initialized.


In [16]:
# Load the cleaned dataset produced upstream, report its dimensions, and
# preview the first five rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_data.csv")
print(f"Shape: {df.shape}")
df.head(5)

Shape: (4095, 13)


Unnamed: 0,key_column,speaker_occupation,speaker,transcript,event,description,title,recorded_date,views,duration,url,tags,completeness_score
0,https://www.ted.com/talks/maira_kalman_the_ill...,"Illustrator, author",Maira Kalman,What I am always thinking about is what this s...,TED2007,Author and illustrator Maira Kalman talks abou...,The illustrated woman,2007-03-03,815011,1050,https://www.ted.com/talks/maira_kalman_the_ill...,"['art', 'children', 'culture', 'design', 'ente...",100.0
1,https://www.ted.com/talks/virginia_postrel_on_...,Journalist,Virginia Postrel,You might be wondering why I'm wearing sunglas...,TED2004,"In a timely talk, cultural critic Virginia Pos...",On glamour,2004-02-02,424070,975,https://www.ted.com/talks/virginia_postrel_on_...,"['beauty', 'business', 'culture', 'entertainme...",100.0
2,https://www.ted.com/talks/robert_wright_the_ev...,"Journalist, philosopher",Robert Wright,I'm going to talk about compassion and the gol...,TEDSalon 2009 Compassion,Robert Wright uses evolutionary biology and ga...,The evolution of compassion,2009-10-14,298774,1016,https://www.ted.com/talks/robert_wright_the_ev...,"['charter for compassion', 'compassion', 'evol...",100.0
3,https://www.ted.com/talks/dennis_vanengelsdorp...,Bee expert,Dennis vanEngelsdorp,"What I'd like you to do is, just really quickl...",Taste3 2008,Bees are dying in droves. Why? Leading apiaris...,A plea for bees,2008-07-19,765570,988,https://www.ted.com/talks/dennis_vanengelsdorp...,"['animals', 'bees', 'disease', 'food', 'insect...",100.0
4,https://www.ted.com/talks/barry_schuler_genomi...,Entrepreneur,Barry Schuler,"What's happening in genomics, and how this rev...",Taste3 2008,What is genomics? How will it affect our lives...,Genomics 101,2008-06-30,458222,1286,https://www.ted.com/talks/barry_schuler_genomi...,"['biology', 'entrepreneur', 'food', 'genetics'...",100.0


In [None]:
## 1. Dataset Overview and Structure

In [None]:
# Structure + Summary
# Prints a concise per-column overview of the loaded frame (column names,
# non-null counts, dtypes) so type or completeness problems are visible
# before any preprocessing is applied.
df.info()

In [None]:
# Column Summary Table
# Descriptive statistics for every column (numeric and non-numeric alike),
# transposed so each original column becomes one row of the summary.
column_stats = df.describe(include='all')
column_stats.T

In [None]:
# Missing & Type Overview
# One row per column: completeness, cardinality and dtype, ordered so the
# columns with the highest share of missing values come first.
missing_mask = df.isnull()

summary_df = pd.DataFrame(
    {
        "Column": df.columns,
        "Non-Null Count": df.count(),
        "Missing Count": missing_mask.sum(),
        "Missing %": missing_mask.mean() * 100,
        "Unique Values": df.nunique(),
        "Data Type": df.dtypes.values,
    }
)
summary_df = summary_df.sort_values("Missing %", ascending=False)

# Render with an in-cell bar on the missing-percentage column.
(
    summary_df.style
    .bar(subset=["Missing %"], color='lightcoral')
    .format({"Missing %": "{:.2f}%"})
)


In [None]:
### Missing Values Summary

This dataset has no missing values in any column. No imputation or filling is required.


In [None]:
## 2. Clean Transcript Column


In [None]:
# Base Cleaning
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalize a raw transcript into lowercase, punctuation-free text.

    Steps, in order:
      1. Null values (None / NaN) become the empty string.
      2. The value is coerced to ``str`` and lowercased.
      3. Square-bracketed spans such as ``[music]`` are removed.
      4. ASCII punctuation (``string.punctuation``) is deleted.
      5. Runs of whitespace are collapsed to a single space and the result
         is stripped.

    Whitespace is collapsed *after* punctuation removal: deleting a
    free-standing punctuation mark ("a - b") would otherwise leave a double
    space behind.

    Parameters
    ----------
    text : str or None/NaN
        Raw transcript text. Non-string, non-null values are converted with
        ``str()`` rather than raising.

    Returns
    -------
    str
        The normalized text; ``""`` for null input.
    """
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    # Non-greedy so each bracketed span is removed separately.
    text = re.sub(r"\[.*?\]", "", text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# Base-cleaned copy of every transcript (null transcripts become "").
df["transcript_clean"] = df["transcript"].map(clean_text)


In [None]:
### Advanced Cleaning with Stopword Removal and Lemmatization


In [None]:
# Stopword Removal + Lemmatization
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the corpora used below; quiet=True keeps the download log out of the
# cell output.
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource, quiet=True)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def advanced_clean(text):
    """Drop English stopwords and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Text that has already been through ``clean_text`` (lowercased,
        punctuation removed); it is split on whitespace.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.

    Notes
    -----
    NOTE(review): ``lemmatize`` is called without a part-of-speech tag, which
    NLTK documents as defaulting to noun — verbs and adjectives are likely
    left unchanged. Confirm that is intended.
    NOTE(review): because punctuation is stripped beforehand, contractions
    arrive as e.g. "dont" and will not match apostrophe-bearing stop-list
    entries such as "don't".
    """
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stop_words
    )


# Rebuild the base-cleaned text from the raw transcript instead of reading
# df["transcript_clean"], which this cell overwrites. This keeps the cell
# idempotent: on a re-run, "word_count_original" is still computed from the
# pre-lemmatization text rather than from already-lemmatized output.
transcript_base = df["transcript"].apply(clean_text)

df["word_count_original"] = transcript_base.apply(lambda x: len(x.split()))
df["transcript_clean"] = transcript_base.apply(advanced_clean)
df["word_count"] = df["transcript_clean"].apply(lambda x: len(x.split()))


In [None]:
## 3. Extract Text Features

In [None]:
# Character Count
# Length of the cleaned transcript string, including the single spaces that
# separate its words.
df["char_count"] = df["transcript_clean"].str.len()


In [None]:
# Sentence Count
from nltk.tokenize import sent_tokenize

# Recent NLTK releases load the Punkt sentence model from the "punkt_tab"
# resource, while older releases use "punkt". Requesting both keeps
# sent_tokenize from raising LookupError on a fresh environment with either.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# Sentences are counted on the RAW transcript: the cleaned text has had its
# punctuation removed, so sentence boundaries are no longer detectable there.
# Null transcripts count as zero sentences.
df["sentence_count"] = df["transcript"].apply(lambda x: len(sent_tokenize(x)) if pd.notnull(x) else 0)


In [None]:
## 4. Readability Metrics


In [None]:
# Readability Calculations
#
# Flesch Reading Ease  = 206.835 - 1.015 * (words / sentences) - 84.6 * (syllables / words)
# Flesch-Kincaid Grade = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
#
# Both formulas are defined on the full running text. They therefore take
# their inputs from the RAW transcript (all words, estimated syllables), not
# from the stopword-stripped text, and use syllables per word rather than
# characters per word.

# A word is a run of ASCII letters, optionally with internal apostrophes
# (straight or curly) so contractions count as one word.
_WORD_PATTERN = re.compile(r"[a-z]+(?:['’][a-z]+)*")
_VOWEL_RUN_PATTERN = re.compile(r"[aeiouy]+")


def _estimate_syllables(word):
    """Estimate the syllables in a lowercase word (heuristic, not a dictionary lookup).

    Counts runs of consecutive vowels, discounts a trailing silent "e"
    (but not a final "-le", as in "table"), and never returns less than 1.
    """
    syllables = len(_VOWEL_RUN_PATTERN.findall(word))
    if syllables > 1 and word.endswith("e") and not word.endswith("le"):
        syllables -= 1
    return max(syllables, 1)


def _raw_word_and_syllable_counts(text):
    """Return ``(word_count, syllable_count)`` for a raw transcript; ``(0, 0)`` for null."""
    if pd.isnull(text):
        return 0, 0
    words = _WORD_PATTERN.findall(str(text).lower())
    return len(words), sum(_estimate_syllables(w) for w in words)


raw_counts = pd.DataFrame(
    df["transcript"].apply(_raw_word_and_syllable_counts).tolist(),
    index=df.index,
    columns=["words", "syllables"],
)

# Zero denominators become NaN so the ratios are NaN instead of inf.
sentences = df["sentence_count"].replace(0, np.nan)
words_per_sentence = raw_counts["words"] / sentences
syllables_per_word = raw_counts["syllables"] / raw_counts["words"].replace(0, np.nan)

# Cleaned (stopword-free) words per raw sentence — definition unchanged.
# NOTE(review): this mixes a post-cleaning word count with a raw sentence
# count; it is kept as a feature but is no longer fed into the Flesch formulas.
df["avg_sentence_length"] = df["word_count"] / sentences

# Count only non-whitespace characters so the spaces separating words in
# transcript_clean do not inflate the average by roughly one character.
cleaned_letters = df["transcript_clean"].str.count(r"\S")
df["avg_word_length"] = cleaned_letters / df["word_count"].replace(0, np.nan)

df["flesch_score"] = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
df["flesch_kincaid_grade"] = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59


In [None]:
## 5. Word Count Comparison Before vs. After Cleaning


In [None]:
# Word Count Comparison Plot
wordcount_plot_path = "../plots/wordcount_comparison.png"
# savefig does not create missing directories; make sure the target exists so
# a fresh checkout does not fail here.
os.makedirs(os.path.dirname(wordcount_plot_path), exist_ok=True)

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df["word_count_original"], label="Original", color="gray", bins=50, kde=True, ax=ax)
# The drop in word count comes from stopword removal (lemmatization maps each
# word to one word), so the legend names both steps.
sns.histplot(
    df["word_count"],
    label="After Stopword Removal & Lemmatization",
    color="cornflowerblue",
    bins=50,
    kde=True,
    ax=ax,
)
ax.set_title("Transcript Word Count: Original vs. Cleaned")
ax.set_xlabel("Word Count")
ax.set_ylabel("Frequency")
ax.legend()
fig.tight_layout()
fig.savefig(wordcount_plot_path)
plt.show()


In [None]:
## 6. Visualize Linguistic Distributions


In [None]:
# Histograms
readability_plot_path = "../plots/readability_distributions.png"
# savefig does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(readability_plot_path), exist_ok=True)

# (column, panel title, bar color) — one entry per panel, filled row by row.
histogram_panels = [
    ("sentence_count", "Sentence Count", "skyblue"),
    ("avg_sentence_length", "Average Sentence Length", "salmon"),
    ("avg_word_length", "Average Word Length", "lightgreen"),
    ("flesch_score", "Flesch Reading Ease Score", "orchid"),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for ax, (column, title, color) in zip(axes.flat, histogram_panels):
    # The ratio columns are NaN where a denominator was zero; drop those rows
    # before binning.
    sns.histplot(df[column].dropna(), bins=50, ax=ax, color=color)
    ax.set_title(title)

fig.tight_layout()
fig.savefig(readability_plot_path)
plt.show()


In [None]:
## 7. Correlation Between Linguistic Features


In [None]:
# Correlation Heatmap
features_to_plot = [
    "word_count", "sentence_count", "char_count",
    "avg_sentence_length", "avg_word_length",
    "flesch_score", "flesch_kincaid_grade"
]

correlation_plot_path = "../plots/linguistic_feature_correlation.png"
# savefig does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(correlation_plot_path), exist_ok=True)

# Pairwise correlation between the engineered features (DataFrame.corr with
# its default method), annotated with two-decimal coefficients.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df[features_to_plot].corr(), annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation of Linguistic Features")
fig.tight_layout()
fig.savefig(correlation_plot_path)
plt.show()


In [None]:
## 8. Save Preprocessed Data


In [None]:
# Persist the frame with all engineered columns for the downstream notebooks.
# The row index is omitted so reloading does not pick up an extra unnamed column.
output_path = "../data/preprocessed_data.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"Preprocessed data saved to: {output_path}")


In [None]:
-----------------------