In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import pickle

# Download necessary NLTK data.
# 'punkt' covers older NLTK releases; NLTK 3.9+ makes word_tokenize() load
# the pickle-free 'punkt_tab' resource instead, so fetch both to keep
# clean_text() working regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load data (clean_text and the training steps below read the 'comment'
# and 'rating' columns of this frame)
df = pd.read_csv("feedback_sample.csv")

# Shared text-cleaning resources used by clean_text():
# a set gives O(1) stop-word membership tests per token.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    """Normalize one comment into a space-separated string of stemmed tokens.

    Missing values (anything ``pd.isnull`` reports as null) become ``""``.
    Otherwise the text is lower-cased and tokenized with ``word_tokenize``;
    a token is kept only if it is purely alphabetic, is not in the
    module-level ``stop_words`` set, and is longer than two characters.
    Kept tokens are stemmed with the module-level ``stemmer``.
    """
    if pd.isnull(text):
        return ""

    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation/numbers, stop words, and very short tokens.
        if not token.isalpha() or token in stop_words or len(token) <= 2:
            continue
        kept.append(stemmer.stem(token))
    return " ".join(kept)

# Drop any rows where rating or comment is missing.
# This must happen on the raw 'comment' column *before* cleaning:
# clean_text() turns a missing comment into "", so a dropna() on
# 'clean_comment' afterwards would never remove anything.
# .copy() gives an independent frame so the column assignment below
# does not write into a view of the original data.
df = df.dropna(subset=['rating', 'comment']).copy()

# Apply cleaning
df['clean_comment'] = df['comment'].apply(clean_text)

# Prepare X and y (labels are normalized so "Positive " == "positive")
X = df['clean_comment']
y = df['rating'].str.strip().str.lower()

# Split into train and test sets (stratify to maintain class balance).
# The split is done on the raw cleaned text, before vectorizing, so that
# nothing about the test documents leaks into the fitted features.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF Vectorizer with better settings.
# Fit on the training text only: vocabulary, min_df/max_df pruning and IDF
# weights are learned from the training documents, and the test documents
# are merely transformed with that fitted state.
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=2, max_df=0.8)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train a Logistic Regression model with better parameters
model = LogisticRegression(C=5, max_iter=1000)
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Save model and vectorizer (both are needed at inference time: new text
# must go through clean_text() and this same fitted vectorizer)
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)


Accuracy: 0.9
              precision    recall  f1-score   support

    negative       1.00      0.80      0.89         5
    positive       0.83      1.00      0.91         5

    accuracy                           0.90        10
   macro avg       0.92      0.90      0.90        10
weighted avg       0.92      0.90      0.90        10



[nltk_data] Downloading package punkt to /home/klaus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/klaus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
