# Part 2: Preprocess Data

This notebook loads the raw form responses and preprocesses them for clustering analysis.

## Input
- `raw_form_responses.pickle`: Raw form data from Part 1
- `form_metadata.json`: Form metadata from Part 1 (read here for the form title and total response count)

## Output  
- `preprocessed_data.pickle`: Cleaned text and its TF-IDF matrix, saved together with the filtered DataFrame and the fitted vectorizer, ready for clustering

In [None]:
# Install required packages for text processing
!pip install pandas scikit-learn nltk textblob

import pandas as pd
import numpy as np
import pickle
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk

# Download required NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize looks up on recent
# NLTK releases; 'punkt' is kept so older releases keep working as before.
# NOTE(review): 'wordnet' is not used by any code visible in this notebook —
# confirm whether a later part needs it before removing the download.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

print("✅ Libraries imported and NLTK data downloaded!")

In [None]:
# Load data from Part 1
print("📥 Loading data from Part 1...")

try:
    # Read the pickled bundle and pull out the three entries it carries.
    with open('raw_form_responses.pickle', 'rb') as pickle_file:
        data = pickle.load(pickle_file)

    df = data['dataframe']
    form_structure = data['form_structure']
    raw_responses = data['raw_responses']

    # The form metadata lives in a separate JSON file.
    with open('form_metadata.json', 'r') as metadata_file:
        metadata = json.load(metadata_file)
except FileNotFoundError:
    # A missing input file is reported with a hint, then re-raised.
    print("❌ Data files not found. Please run Part 1 first!")
    raise
else:
    # Summary of what was just loaded.
    print(f"✅ Data loaded successfully!")
    print(f"📊 DataFrame shape: {df.shape}")
    print(f"📋 Form: {metadata['form_title']}")
    print(f"🔢 Total responses: {metadata['total_responses']}")

# Display data overview
print(f"\n📂 Available columns:")
for position, column_name in enumerate(df.columns, start=1):
    print(f"  {position}. {column_name}")

display(df.head(2))

In [None]:
# Cache for the helpers every preprocess_text call needs; filled on first use.
_TEXT_TOOLS = {}


def _text_tools():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
    return _TEXT_TOOLS['stop_words'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Clean and preprocess text data.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize, drop English stop words and tokens of two characters or fewer,
    then Porter-stem what remains.

    Args:
        text: Raw text (any value accepted by ``str``); NaN/None and the
            empty string are treated as "no text".

    Returns:
        The surviving stems joined by single spaces, or ``''`` when the input
        is missing or nothing survives the filtering.
    """
    if pd.isna(text) or text == '':
        return ''

    # Built once and reused: the stop-word list and stemmer do not change
    # between calls, so there is no reason to rebuild them per response.
    stop_words, stemmer = _text_tools()

    # Convert to lowercase
    text = str(text).lower()

    # Replace special characters and digits with a space. Deleting them
    # outright would glue neighbouring words together whenever they are
    # separated only by punctuation (e.g. "good/bad" -> "goodbad").
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stop words and very short tokens, then stem
    return ' '.join(
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and len(token) > 2
    )

def combine_text_responses(df, text_columns=None):
    """Merge each row's answers into one space-separated string.

    Args:
        df: DataFrame of responses, one row per response.
        text_columns: Columns to merge. When None, every column except
            'response_id', 'create_time' and 'last_submitted_time' is used.

    Returns:
        A list with one string per row of ``df``, in row order. Missing values
        and values that are blank after stripping whitespace are skipped; a
        row with nothing left yields ''.
    """
    metadata_columns = ['response_id', 'create_time', 'last_submitted_time']
    if text_columns is None:
        # Default: treat every non-metadata column as text.
        text_columns = [name for name in df.columns if name not in metadata_columns]

    def _join_row(record):
        # Keep only the answers that are present and not blank.
        answers = (record[name] for name in text_columns)
        return ' '.join(
            str(answer)
            for answer in answers
            if pd.notna(answer) and str(answer).strip()
        )

    return [_join_row(record) for _, record in df.iterrows()]

# Confirmation message: reaching this line means both helper definitions above ran.
print("🔧 Text preprocessing functions defined!")

In [None]:
# Identify text columns for clustering
text_columns = [col for col in df.columns 
                if col not in ['response_id', 'create_time', 'last_submitted_time']]

print(f"🔍 Text columns identified: {text_columns}")

# Start from empty results so that later cells can always test
# len(processed_texts_filtered), even when there is no data to process.
# Without these defaults an empty DataFrame leaves the names unbound and the
# vectorization cell fails with NameError instead of reporting "no texts".
processed_texts_filtered = []
original_texts = []
filtered_df = df.iloc[0:0].copy()

# Combine and preprocess text responses
if len(df) > 0:
    print(f"\n🔄 Processing {len(df)} responses...")
    
    combined_texts = combine_text_responses(df, text_columns)
    processed_texts = [preprocess_text(text) for text in combined_texts]
    
    # Filter out responses whose processed text is empty; the positions are
    # kept so the DataFrame rows and both text lists stay aligned.
    non_empty_indices = [i for i, text in enumerate(processed_texts) if text.strip()]
    processed_texts_filtered = [processed_texts[i] for i in non_empty_indices]
    filtered_df = df.iloc[non_empty_indices].copy()
    original_texts = [combined_texts[i] for i in non_empty_indices]
    
    print(f"📊 After filtering: {len(processed_texts_filtered)} non-empty responses")
    
    # Add processed text to DataFrame
    filtered_df['processed_text'] = processed_texts_filtered
    filtered_df['original_combined_text'] = original_texts
    
    print("\n📝 Sample processed texts:")
    for i in range(min(3, len(processed_texts_filtered))):
        print(f"  {i+1}. Original: {original_texts[i][:100]}...")
        print(f"     Processed: {processed_texts_filtered[i][:100]}...")
        print()
        
else:
    print("❌ No data available for processing.")

In [None]:
# Vectorize text using TF-IDF
if len(processed_texts_filtered) > 0:
    print("🔢 Vectorizing text using TF-IDF...")
    
    # Settings shared by the primary vectorizer and the small-corpus fallback
    tfidf_common = dict(
        max_features=1000,  # Limit to top 1000 features
        ngram_range=(1, 2)  # Include both unigrams and bigrams
    )
    
    # Create TF-IDF vectorizer
    vectorizer = TfidfVectorizer(
        min_df=2,          # Ignore terms that appear in less than 2 documents
        max_df=0.8,        # Ignore terms that appear in more than 80% of documents
        **tfidf_common
    )
    
    # Fit and transform the text
    try:
        tfidf_matrix = vectorizer.fit_transform(processed_texts_filtered)
    except ValueError as error:
        # scikit-learn raises ValueError when the document-frequency limits
        # cannot be met, e.g. with only one or two responses (80% of the
        # documents is then fewer than the 2 required by min_df) or when
        # pruning leaves no terms. Retry without pruning rather than crash.
        print(f"⚠️ TF-IDF pruning failed ({error}); retrying with min_df=1, max_df=1.0")
        vectorizer = TfidfVectorizer(min_df=1, max_df=1.0, **tfidf_common)
        tfidf_matrix = vectorizer.fit_transform(processed_texts_filtered)
    
    # Computed once and reused for the printout, the saved bundle and the sample
    feature_names = vectorizer.get_feature_names_out()
    
    print(f"📊 TF-IDF matrix shape: {tfidf_matrix.shape}")
    print(f"🔤 Number of features: {len(feature_names)}")
    
    # Convert to dense array for clustering (if not too large)
    DENSE_CELL_LIMIT = 100000  # maximum rows * columns that is densified
    if tfidf_matrix.shape[0] * tfidf_matrix.shape[1] < DENSE_CELL_LIMIT:
        X_dense = tfidf_matrix.toarray()
        print("📦 Converted to dense matrix for clustering")
    else:
        # Despite its name, X_dense stays sparse in this branch.
        X_dense = tfidf_matrix
        print("📦 Keeping sparse matrix due to size")
    
    # Save preprocessed data
    preprocessed_data = {
        'dataframe': filtered_df,
        'processed_texts': processed_texts_filtered,
        'original_texts': original_texts,
        'tfidf_matrix': X_dense,
        'vectorizer': vectorizer,
        'text_columns': text_columns,
        'feature_names': feature_names
    }
    
    with open('preprocessed_data.pickle', 'wb') as f:
        pickle.dump(preprocessed_data, f)
    
    print("✅ Preprocessed data saved!")
    print("📁 File created: preprocessed_data.pickle")
    print(f"📊 Ready for clustering: {len(processed_texts_filtered)} responses")
    print(f"🔤 Features: {tfidf_matrix.shape[1]} TF-IDF terms")
    
    # Show sample features
    print(f"\n🔍 Sample features: {feature_names[:10]}")
    
else:
    print("❌ No processed texts available for vectorization.")