In [3]:
# Import libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below
# ('punkt_tab' is included to avoid errors with newer tokenizer versions)
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Read the input data and report its shape and column names
df = pd.read_csv("sentiment_cleaned.csv")
print("Initial shape:", df.shape)
print("\nAvailable columns:", df.columns.tolist())

# Pick the first column whose lowercased name is one of the known
# feedback-style headers; None when no column qualifies
candidate_names = ['feedback', 'review', 'comment', 'text', 'message']
text_column = next(
    (name for name in df.columns if name.lower() in candidate_names),
    None,
)

# Stop early when there is nothing to clean
if text_column is None:
    raise KeyError("No feedback text column found. Please rename it manually.")

print(f"\nDetected feedback column: {text_column}")

# Drop fully duplicated rows, then rows with no value in the feedback column
df.drop_duplicates(inplace=True)
df.dropna(subset=[text_column], inplace=True)

# Build the lemmatizer and English stopword set used by the cleaning step
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text):
    """Normalize one feedback entry into a string of lemmatized keywords.

    The value is stripped of everything except letters, lowercased,
    tokenized with NLTK, filtered against the module-level ``stop_words``
    set, and lemmatized with the module-level ``lemmatizer``.

    Args:
        text: Raw feedback value. Non-string values are converted with
            ``str()``; ``None`` and NaN are treated as empty feedback.

    Returns:
        The remaining lemmas joined by single spaces, or ``""`` when
        nothing is left after cleaning.
    """
    # Missing values would otherwise be stringified into the literal words
    # "none" / "nan" and leak into the cleaned text. NaN is the only float
    # that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # Replace non-letters with a space rather than deleting them, so words
    # separated only by punctuation or digits stay separate
    # ("good.Bad" -> "good bad", not "goodbad") and contractions break into
    # pieces ("don't" -> "don t") that the stopword filter can match.
    text = re.sub(r'[^a-zA-Z\s]', ' ', str(text))
    text = text.lower().strip()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize the remaining words
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(cleaned)

# Derive the cleaned column by running clean_text on every feedback entry
raw_feedback = df[text_column]
df['clean_text'] = raw_feedback.map(clean_text)

# Preview the first few cleaned rows
print("\nSample cleaned feedback:\n")
preview = df[['clean_text']].head()
print(preview)

# Write the result to disk without the index column
df.to_csv("cleaned_feedback.csv", index=False)
print("\n✅ Cleaned dataset saved as cleaned_feedback.csv")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial shape: (2000, 6)

Available columns: ['ProductName', 'ProductPrice', 'Rate', 'Review', 'Summary', 'Sentiment']

Detected feedback column: Review

Sample cleaned feedback:

        clean_text
0        wonderful
1         terrific
2  useless product
3             okay
4      worth money

✅ Cleaned dataset saved as cleaned_feedback.csv
