In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

## Create model - sentiment classification

In [8]:
df = pd.read_csv("processed_dataset.csv")

# Extract TextBlob features
#df['polarity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)
#df['subjectivity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

# Generate TF-IDF features with N-grams
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X_tfidf = tfidf.fit_transform(df['Sentence'])
#print(X_tfidf)

# Combine with TextBlob features
import scipy.sparse as sp
#X_textblob = df[['polarity', 'subjectivity']].values
#X_combined = sp.hstack((X_tfidf, X_textblob))

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Sentiment'], test_size=0.2, random_state=42) #X_combined

# Train classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Classification report
print("\nClassification report:")
print(classification_report(y_test, y_pred))

Model Accuracy: 82.86%

Classification report:
              precision    recall  f1-score   support

       Anger       0.81      0.75      0.78        81
    Euphoria       0.88      0.77      0.82        60
   Happiness       0.81      0.74      0.78        70
     Neutral       0.83      0.96      0.89       186
     Sadness       0.81      0.74      0.78        93

    accuracy                           0.83       490
   macro avg       0.83      0.79      0.81       490
weighted avg       0.83      0.83      0.83       490



## Classify example sentece

In [9]:
# Preprocessing function
def preprocess_text(sentence):
    sentence = sentence.lower()  # Convert to lowercase
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(sentence)  # Tokenization
    #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    processed_sentence = " ".join(words)
    
    # Extract TextBlob features
    blob = TextBlob(processed_sentence)
    polarity = blob.sentiment.polarity  # Sentiment polarity (-1 to 1)
    subjectivity = blob.sentiment.subjectivity  # Subjectivity (0 = factual, 1 = subjective)
    
    return processed_sentence#, polarity, subjectivity


sample_sentence = "This environment doesn't feel inviting."
processed_sentence = preprocess_text(sample_sentence)

tfidf_features = tfidf.transform([processed_sentence])

result = model.predict(tfidf_features)
print(result)

['Anger']
