# Feature Extraction with TF-IDF Vectorization

This notebook covers feature extraction from preprocessed text:
- Loading preprocessed data
- Converting text to numerical features using TF-IDF
- Exploring the feature matrix
- Preparing data for machine learning models

## Import Required Libraries

In [None]:
# Standard library
import os
import pickle

# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Plot appearance: matplotlib's default style with seaborn's "husl" palette
sns.set_palette("husl")
plt.style.use('default')

# Seed NumPy's global random generator so NumPy-based randomness is repeatable
np.random.seed(42)

print("Libraries imported successfully!")

## Load Preprocessed Data

In [None]:
# Read the preprocessed reviews from CSV (path is relative to the notebook)
df = pd.read_csv('../data/preprocessed_reviews.csv')

# Report the basic structure in one print call, one item per line
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    "\nFirst few rows:",
    sep="\n",
)

# Last expression of the cell, so the preview is rendered as a table
df.head()

In [None]:
# Check data quality: total missing cells, class balance and a short preview
print("Data Quality Check:")
print(f"Missing values: {df.isnull().sum().sum()}")
print("Sentiment distribution:")
print(df['sentiment'].value_counts())

# Preview up to three reviews. Taking head(3) instead of indexing rows 0..2
# directly keeps the cell from raising IndexError when the dataset holds
# fewer than three rows.
print("\nSample of cleaned reviews:")
preview = df.head(3)
for position, (review, label) in enumerate(
    zip(preview['cleaned_review'], preview['sentiment']), start=1
):
    print(f"{position}. {review} (Sentiment: {label})")

## TF-IDF Vectorization

Convert text data to numerical features using Term Frequency-Inverse Document Frequency (TF-IDF).

In [None]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=1000,        # Keep at most 1000 terms in the vocabulary
    min_df=2,                 # Ignore terms that appear in fewer than 2 documents
    max_df=0.8,               # Ignore terms that appear in more than 80% of documents
    ngram_range=(1, 2),       # Use unigrams and bigrams
    lowercase=True            # Convert to lowercase (already done in preprocessing)
)

# Read the settings back from the estimator instead of repeating the literals,
# so the printed report cannot drift from the actual configuration when the
# parameters above are tuned.
print("TF-IDF Vectorizer initialized with parameters:")
print(f"- Max features: {tfidf_vectorizer.max_features}")
print(f"- Min document frequency: {tfidf_vectorizer.min_df}")
print(f"- Max document frequency: {tfidf_vectorizer.max_df}")
print(f"- N-gram range: {tfidf_vectorizer.ngram_range}")

In [None]:
# Fit and transform the cleaned text data
print("Applying TF-IDF vectorization...")

# pandas reads empty CSV fields back as NaN, and the vectorizer raises a
# ValueError on NaN documents. Missing reviews are replaced with empty
# strings (rather than dropped) so the rows of X stay aligned with y.
missing_reviews = int(df['cleaned_review'].isna().sum())
if missing_reviews:
    print(f"Note: {missing_reviews} missing reviews treated as empty text")
review_text = df['cleaned_review'].fillna('').astype(str)

# Use cleaned_review column for feature extraction
X_tfidf = tfidf_vectorizer.fit_transform(review_text)
y = df['sentiment'].values

# Sparsity = share of matrix cells that hold no stored value
n_cells = X_tfidf.shape[0] * X_tfidf.shape[1]
tfidf_sparsity = 1 - (X_tfidf.nnz / n_cells)

print(f"✓ TF-IDF matrix shape: {X_tfidf.shape}")
print(f"✓ Number of features: {X_tfidf.shape[1]}")
print(f"✓ Number of samples: {X_tfidf.shape[0]}")
print(f"✓ Target variable shape: {y.shape}")
print(f"✓ Matrix sparsity: {tfidf_sparsity:.3f}")

## Explore Feature Matrix

In [None]:
# Terms learned by the fitted TF-IDF vectorizer, one per matrix column
feature_names = tfidf_vectorizer.get_feature_names_out()

# Show the vocabulary size, then both ends of the vocabulary array
print(f"Feature vocabulary size: {len(feature_names)}")
print("\nSample features (first 20):", feature_names[:20], sep="\n")
print("\nSample features (last 20):", feature_names[-20:], sep="\n")

In [None]:
# Rank vocabulary terms by their average TF-IDF weight across all documents.
# The column-wise mean is taken on the sparse matrix itself; the size guard
# skips the analysis when the vocabulary exceeds 1000 terms.
if X_tfidf.shape[1] > 1000:
    print("Feature matrix too large for detailed analysis. Skipping feature importance calculation.")
else:
    # One mean score per feature, flattened from a 1 x n_features result
    mean_tfidf_scores = np.asarray(X_tfidf.mean(axis=0)).ravel()

    # Pair every term with its score, highest score first
    feature_importance_df = pd.DataFrame(
        {'feature': feature_names, 'mean_tfidf': mean_tfidf_scores}
    ).sort_values('mean_tfidf', ascending=False)

    top_features = feature_importance_df.head(15)
    print("Top 15 features by mean TF-IDF score:")
    print(top_features)

    # Horizontal bar chart of the same 15 terms, best term at the top
    fig, ax = plt.subplots(figsize=(12, 6))
    bar_positions = range(len(top_features))
    ax.barh(bar_positions, top_features['mean_tfidf'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features['feature'])
    ax.set_xlabel('Mean TF-IDF Score')
    ax.set_title('Top 15 Features by Mean TF-IDF Score')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

## Compare TF-IDF vs Count Vectorization

In [None]:
# Also create Count Vectorizer for comparison (same settings as the TF-IDF one)
count_vectorizer = CountVectorizer(
    max_features=1000,
    min_df=2,
    max_df=0.8,
    ngram_range=(1, 2),
    lowercase=True
)

# Fit and transform with Count Vectorizer.
# pandas reads empty CSV fields back as NaN and the vectorizer raises a
# ValueError on NaN documents, so missing reviews are replaced with empty
# strings; no rows are dropped, keeping the matrix aligned with the labels.
count_input_text = df['cleaned_review'].fillna('').astype(str)
X_count = count_vectorizer.fit_transform(count_input_text)

# Sparsity = share of matrix cells that hold no stored value
count_sparsity = 1 - (X_count.nnz / (X_count.shape[0] * X_count.shape[1]))

print("Count Vectorization Results:")
print(f"✓ Count matrix shape: {X_count.shape}")
print(f"✓ Matrix sparsity: {count_sparsity:.3f}")

# Compare vocabulary overlap between the two fitted vectorizers
tfidf_vocab = set(tfidf_vectorizer.get_feature_names_out())
count_vocab = set(count_vectorizer.get_feature_names_out())
overlap = len(tfidf_vocab & count_vocab)

print("\nVocabulary Comparison:")
print(f"TF-IDF vocabulary size: {len(tfidf_vocab)}")
print(f"Count vocabulary size: {len(count_vocab)}")
print(f"Vocabulary overlap: {overlap} features ({overlap/len(tfidf_vocab)*100:.1f}%)")

## Split Data for Model Training

In [None]:
# Split the data into training and testing sets.
# Both feature matrices and the labels go through ONE train_test_split call,
# so every output shares the same row partition by construction instead of
# relying on two separate calls being given identical arguments.
(
    X_train_tfidf, X_test_tfidf,
    X_train_count, X_test_count,
    y_train, y_test,
) = train_test_split(
    X_tfidf, X_count, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: features and labels must have matching row counts
assert X_train_tfidf.shape[0] == X_train_count.shape[0] == y_train.shape[0]
assert X_test_tfidf.shape[0] == X_test_count.shape[0] == y_test.shape[0]

print("Data split completed:")
print(f"Training set - TF-IDF: {X_train_tfidf.shape}, Count: {X_train_count.shape}")
print(f"Test set - TF-IDF: {X_test_tfidf.shape}, Count: {X_test_count.shape}")
print(f"Training labels: {y_train.shape}")
print(f"Test labels: {y_test.shape}")

# Check class distribution in splits (stratify=y should keep them proportional)
print("\nTraining set sentiment distribution:")
print(pd.Series(y_train).value_counts())
print("\nTest set sentiment distribution:")
print(pd.Series(y_test).value_counts())

## Save Processed Features

In [None]:
# Save the vectorizers and processed data for next steps.
# `os` and `pickle` come from the notebook's import cell at the top.

# Create results directory if it doesn't exist
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

# Save the fitted vectorizers so the same vocabulary can be reused later
fitted_vectorizers = {
    'tfidf_vectorizer': tfidf_vectorizer,
    'count_vectorizer': count_vectorizer,
}
for artifact_name, vectorizer in fitted_vectorizers.items():
    with open(f'{results_dir}/{artifact_name}.pkl', 'wb') as f:
        pickle.dump(vectorizer, f)

# Save processed arrays.
# NOTE(review): .toarray() densifies each sparse matrix before saving, which
# can need a lot of memory on large corpora. The dense .npy format is kept
# because consumers of these files presumably load them with np.load —
# confirm before switching to a sparse format.
sparse_splits = {
    'X_train_tfidf': X_train_tfidf,
    'X_test_tfidf': X_test_tfidf,
    'X_train_count': X_train_count,
    'X_test_count': X_test_count,
}
for artifact_name, matrix in sparse_splits.items():
    np.save(f'{results_dir}/{artifact_name}.npy', matrix.toarray())

np.save(f'{results_dir}/y_train.npy', y_train)
np.save(f'{results_dir}/y_test.npy', y_test)

print("✓ Vectorizers saved to results/")
print("✓ Processed feature matrices saved to results/")
print("✓ Train/test splits saved to results/")

# Save feature names for reference. Names are read from the vectorizers
# themselves so this cell does not depend on a variable from an earlier cell.
feature_info = {
    'tfidf_features': tfidf_vectorizer.get_feature_names_out().tolist(),
    'count_features': count_vectorizer.get_feature_names_out().tolist(),
    'feature_matrix_shape': X_tfidf.shape,
    'train_test_split': {
        'test_size': 0.2,
        'random_state': 42,
        'stratify': True
    }
}

with open(f'{results_dir}/feature_info.pkl', 'wb') as f:
    pickle.dump(feature_info, f)

print("✓ Feature information saved")

In [None]:
# Summary of feature extraction step
print("\n=== FEATURE EXTRACTION COMPLETED ===")
print("✓ TF-IDF vectorization applied successfully")
print("✓ Count vectorization applied for comparison")
# Report the actual vocabulary size: max_features is only an upper bound, so
# the matrix can have fewer columns than the configured limit.
print(f"✓ Feature matrix created with {X_tfidf.shape[1]} features")
print("✓ Data split into train/test sets (80/20)")
print("✓ All processed data saved for model training")
print(f"✓ Ready for training with {X_train_tfidf.shape[0]} training samples")
print("\nNext: Model training with Logistic Regression and Naive Bayes")