# Data Cleaning Notebook

In [None]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import ast

# Load the raw review data
reviews = pd.read_csv('../data/raw/stanford_reviews.csv')

# Basic text cleaning
# Step 1: swap the literal HTML line-break tag for a space. regex=False states
# explicitly that this is a plain substring match.
reviews['cleaned_text'] = reviews['text'].str.replace('<br />', ' ', regex=False)
# Step 2: drop every character that is neither a word character nor whitespace.
# regex=True is required here: since pandas 2.0, Series.str.replace treats the
# pattern as a literal string by default, which would leave punctuation intact.
reviews['cleaned_text'] = reviews['cleaned_text'].str.replace(
    r'[^\w\s]', '', regex=True
)

# Sentiment analysis with TextBlob
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The argument is converted with ``str()`` before analysis, so non-string
    values are scored on their string representation.
    """
    return TextBlob(str(text)).sentiment.polarity

# Score every cleaned review, then bucket the scores into three labels.
reviews['sentiment_score'] = reviews['cleaned_text'].apply(get_sentiment)
# include_lowest=True closes the first bin on the left, so a score of exactly
# -1 is labelled 'negative'. With pd.cut's default right-closed bins the first
# interval is (-1, -0.1] and such rows would be labelled NaN instead.
reviews['sentiment'] = pd.cut(reviews['sentiment_score'],
                             bins=[-1, -0.1, 0.1, 1],
                             labels=['negative', 'neutral', 'positive'],
                             include_lowest=True)

# Save processed data
reviews.to_csv('../data/processed/cleaned_reviews.csv', index=False)

print("Data cleaning complete. Saved cleaned data.")