# Advanced Feature Engineering for Student Explanations

This notebook performs comprehensive feature engineering on student explanation text data, including:
- Text statistics features
- Mathematical expression features
- TF-IDF vectorization
- Bag of Words features
- Categorical encoding

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

print("Starting advanced feature engineering...")

## 2. Load and Explore Dataset

In [None]:
# Load the cleaned training data. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv('cleaned_train.csv')
print(f"Dataset loaded: {df.shape[0]} rows, {df.columns.tolist()}")

# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset Info:")
df.info()

# The bare last expression gives the rich (HTML) preview of the first rows.
print("\nFirst few rows:")
df.head()

## 3. Prepare Text Data

In [None]:
# Build the plain-Python list of explanation strings used by the vectorizers.
# Missing explanations become empty strings and every value is coerced to str.
explanation_series = df['StudentExplanation'].fillna('')
text_data = explanation_series.astype(str).tolist()

sample_total = len(text_data)
print(f"Text data prepared: {sample_total} samples")

# Preview only the first 100 characters of the first explanation.
first_preview = text_data[0][:100]
print(f"Sample explanation: {first_preview}...")

## 4. Create Text Statistics Features

In [None]:
# Basic text statistics features
print("Creating text statistics features...")

# Normalise the column once (missing -> '', everything -> str) instead of
# repeating the same fillna/astype chain for every feature.
explanation_text = df['StudentExplanation'].fillna('').astype(str)

# Length in characters and in whitespace-separated tokens.
df['text_length'] = explanation_text.str.len()
df['word_count'] = explanation_text.str.split().str.len()

# Regex patterns are raw strings: '\.', '\?' and '\d' in a normal string literal
# are invalid escape sequences and trigger a warning on recent Python versions.
# Note that sentence_count simply counts '.' characters (decimal points included).
df['sentence_count'] = explanation_text.str.count(r'\.')
df['exclamation_count'] = explanation_text.str.count(r'!')
df['question_count'] = explanation_text.str.count(r'\?')
df['uppercase_count'] = explanation_text.str.count(r'[A-Z]')
df['digit_count'] = explanation_text.str.count(r'\d')

# Display statistics
text_features = ['text_length', 'word_count', 'sentence_count', 'exclamation_count',
                 'question_count', 'uppercase_count', 'digit_count']
print("\nText Statistics Summary:")
df[text_features].describe()

## 5. Create Mathematical Expression Features

In [None]:
# Mathematical expression features
print("Creating mathematical expression features...")

# Normalise the column once (missing -> '', everything -> str) and reuse it.
math_source_text = df['StudentExplanation'].fillna('').astype(str)

# Binary flags: 1 when the pattern occurs anywhere in the explanation.
df['has_fraction'] = math_source_text.str.contains(r'\d+/\d+').astype(int)
df['has_decimal'] = math_source_text.str.contains(r'\d+\.\d+').astype(int)
df['has_percentage'] = math_source_text.str.contains(r'\d+%').astype(int)

# Count feature: number of + - * / = characters in the explanation.
df['math_operations'] = math_source_text.str.count(r'[\+\-\*\/\=]')

# Display mathematical features summary
math_features = ['has_fraction', 'has_decimal', 'has_percentage', 'math_operations']
print("\nMathematical Features Summary:")
for feature in math_features:
    # Count rows where the feature is present (> 0). Summing the column is only
    # a sample count for the 0/1 flags; for math_operations it would report the
    # total number of operator characters instead of the number of samples.
    samples_with_feature = int((df[feature] > 0).sum())
    print(f"{feature}: {samples_with_feature} samples have this feature")

## 6. Encode Categorical Features

In [None]:
# Turn the Category column into integer codes with a LabelEncoder.
print("Encoding categorical features...")

le_category = LabelEncoder()
category_codes = le_category.fit_transform(df['Category'])
df['Category_encoded'] = category_codes

# classes_ is ordered by assigned code, so its position is the encoded value.
print("\nCategory encoding mapping:")
for code, label in enumerate(le_category.classes_):
    print(f"{label}: {code}")

## 7. Create TF-IDF Features

In [None]:
# Fit a TF-IDF vectorizer on the explanations and expose the result as a DataFrame.
print("Creating TF-IDF features...")

tfidf_settings = {
    'lowercase': True,
    'stop_words': 'english',
    'max_features': 500,
    'ngram_range': (1, 2),   # unigrams and bigrams
    'min_df': 2,             # drop terms seen in fewer than 2 documents
    'max_df': 0.95,          # drop terms seen in more than 95% of documents
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# Learn the vocabulary and produce the sparse document-term matrix in one pass.
tfidf_matrix = vectorizer.fit_transform(text_data)

# Densify and label each column with a 'tfidf_' prefix plus the term itself.
tfidf_column_names = [f'tfidf_{term}' for term in vectorizer.get_feature_names_out()]
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_column_names)

print(f"TF-IDF features created: {tfidf_df.shape[1]} features")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

## 8. Create Bag of Words Features

In [None]:
# Raw term-count (Bag of Words) features, built alongside TF-IDF for comparison.
print("Creating Bag of Words features...")

bow_settings = {
    'lowercase': True,
    'stop_words': 'english',
    'max_features': 200,
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.95,
}
count_vectorizer = CountVectorizer(**bow_settings)

# Learn the vocabulary and count occurrences per explanation.
bow_matrix = count_vectorizer.fit_transform(text_data)

# Densify and label each column with a 'bow_' prefix plus the term itself.
bow_column_names = [f'bow_{term}' for term in count_vectorizer.get_feature_names_out()]
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_column_names)

print(f"Bag of Words features created: {bow_df.shape[1]} features")
print(f"BoW matrix shape: {bow_matrix.shape}")

## 9. Combine All Features

In [None]:
# Assemble the final wide table: original + engineered columns, TF-IDF, BoW.
print("Combining all features...")

# Names of the hand-engineered columns that were added to df.
feature_columns = [
    # text statistics
    'text_length', 'word_count', 'sentence_count', 'exclamation_count',
    'question_count', 'uppercase_count', 'digit_count',
    # mathematical expression features
    'has_fraction', 'has_decimal', 'has_percentage', 'math_operations',
    # encoded label
    'Category_encoded',
]

# Each frame gets a fresh 0..n-1 index so the column-wise concat lines rows up
# by position.
frames_to_join = (df, tfidf_df, bow_df)
combined_df = pd.concat(
    [frame.reset_index(drop=True) for frame in frames_to_join],
    axis=1,
)

print(f"Combined dataset shape: {combined_df.shape}")
print(f"Feature columns: {len(feature_columns)} engineered features")

## 10. Save Results and Models

In [None]:
# Persist the feature tables, the feature metadata and the fitted transformers.
print("Saving results...")

# Feature tables, written without the index column.
csv_outputs = [
    ('tfidf_vectors.csv', tfidf_df),
    ('bow_vectors.csv', bow_df),
    ('combined_data_with_tfidf.csv', combined_df),
]
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path, index=False)

# Feature metadata: one row per engineered feature with its type label.
# The repeat counts follow the order of feature_columns:
# 7 text statistics, 4 math features, 1 categorical code.
feature_types = ['text_stat'] * 7 + ['math_feature'] * 4 + ['categorical']
feature_info = pd.DataFrame({
    'feature_name': feature_columns,
    'feature_type': feature_types,
})
feature_info.to_csv('feature_info.csv', index=False)

# Fitted transformers, saved with joblib.
joblib_outputs = [
    (vectorizer, 'tfidf_vectorizer.joblib'),
    (count_vectorizer, 'bow_vectorizer.joblib'),
    (le_category, 'category_encoder.joblib'),
]
for fitted_object, joblib_path in joblib_outputs:
    joblib.dump(fitted_object, joblib_path)

print("All files saved successfully!")

## 11. Feature Engineering Summary

In [None]:
# Print summary statistics
print("\n=== Feature Engineering Summary ===")

# df has been extended with the engineered columns by this point, so they are
# excluded here; len(df.columns) alone would overstate the original column count.
original_columns = [col for col in df.columns if col not in feature_columns]
print(f"Original features: {len(original_columns)}")
print(f"TF-IDF features: {tfidf_df.shape[1]}")
print(f"Bag of Words features: {bow_df.shape[1]}")
print(f"Total features in combined dataset: {combined_df.shape[1]}")

# Engineered features are grouped by the substrings their names contain.
text_stat_markers = ('text_', 'word_', 'sentence_', 'exclamation_',
                     'question_', 'uppercase_', 'digit_')
math_markers = ('has_', 'math_')
text_stat_columns = [col for col in feature_columns
                     if any(marker in col for marker in text_stat_markers)]
math_columns = [col for col in feature_columns
                if any(marker in col for marker in math_markers)]
print(f"Text statistics features: {len(text_stat_columns)}")
print(f"Mathematical features: {len(math_columns)}")

print("\nFiles created:")
print("- tfidf_vectors.csv (TF-IDF features)")
print("- bow_vectors.csv (Bag of Words features)")
print("- combined_data_with_tfidf.csv (All features combined)")
print("- feature_info.csv (Feature metadata)")
print("- tfidf_vectorizer.joblib (TF-IDF model)")
print("- bow_vectorizer.joblib (Bag of Words model)")
print("- category_encoder.joblib (Category encoder)")

print("\nFeature engineering completed successfully!")

## 12. Data Visualization (Optional)

In [None]:
# Optional overview figure: five panels laid out on a 2x3 grid.
fig = plt.figure(figsize=(15, 10))

# Panels 1-2: histograms of the two length-style text statistics.
histogram_panels = [
    (1, 'text_length', 'Distribution of Text Length', 'Text Length'),
    (2, 'word_count', 'Distribution of Word Count', 'Word Count'),
]
for position, column, panel_title, x_label in histogram_panels:
    hist_ax = fig.add_subplot(2, 3, position)
    hist_ax.hist(df[column], bins=50, alpha=0.7)
    hist_ax.set_title(panel_title)
    hist_ax.set_xlabel(x_label)
    hist_ax.set_ylabel('Frequency')

# Panel 3: share of each category label.
pie_ax = fig.add_subplot(2, 3, 3)
category_counts = df['Category'].value_counts()
pie_ax.pie(category_counts.values, labels=category_counts.index, autopct='%1.1f%%')
pie_ax.set_title('Distribution of Categories')

# Panel 4: how many explanations carry each binary math flag.
bar_ax = fig.add_subplot(2, 3, 4)
math_feature_sums = df[['has_fraction', 'has_decimal', 'has_percentage']].sum()
bar_ax.bar(math_feature_sums.index, math_feature_sums.values)
bar_ax.set_title('Mathematical Features Frequency')
bar_ax.tick_params(axis='x', labelrotation=45)

# Panel 5: character length against word count.
scatter_ax = fig.add_subplot(2, 3, 5)
scatter_ax.scatter(df['text_length'], df['word_count'], alpha=0.5)
scatter_ax.set_title('Text Length vs Word Count')
scatter_ax.set_xlabel('Text Length')
scatter_ax.set_ylabel('Word Count')

fig.tight_layout()
plt.show()

print("Visualizations completed!")