In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

## Create model - sentiment classification

In [2]:
df = pd.read_csv("processed_dataset.csv")

# Extract TextBlob features
#df['polarity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)
#df['subjectivity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

# Generate TF-IDF features with N-grams
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X_tfidf = tfidf.fit_transform(df['Sentence'])
#print(X_tfidf)

# Combine with TextBlob features
import scipy.sparse as sp
#X_textblob = df[['polarity', 'subjectivity']].values
#X_combined = sp.hstack((X_tfidf, X_textblob))

# Train sentiment model
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(
    X_tfidf, df['Sentiment'], test_size=0.2, random_state=42)

sentiment_model = RandomForestClassifier(n_estimators=100, random_state=42)
sentiment_model.fit(X_train_sentiment, y_train_sentiment)

# Evaluate sentiment model
y_pred_sentiment = sentiment_model.predict(X_test_sentiment)
sentiment_accuracy = accuracy_score(y_test_sentiment, y_pred_sentiment)
print(f"Sentiment Model Accuracy: {sentiment_accuracy * 100:.2f}%")
print("\nSentiment Classification report:")
print(classification_report(y_test_sentiment, y_pred_sentiment))

Sentiment Model Accuracy: 82.86%

Sentiment Classification report:
              precision    recall  f1-score   support

       Anger       0.81      0.75      0.78        81
    Euphoria       0.88      0.77      0.82        60
   Happiness       0.81      0.74      0.78        70
     Neutral       0.83      0.96      0.89       186
     Sadness       0.81      0.74      0.78        93

    accuracy                           0.83       490
   macro avg       0.83      0.79      0.81       490
weighted avg       0.83      0.83      0.83       490



In [3]:
# Train type model (affirmation/negation)
X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(
    X_tfidf, df['Type'], test_size=0.2, random_state=42)

type_model = RandomForestClassifier(n_estimators=100, random_state=42)
type_model.fit(X_train_type, y_train_type)

# Evaluate type model
y_pred_type = type_model.predict(X_test_type)
type_accuracy = accuracy_score(y_test_type, y_pred_type)
print(f"\nType Model Accuracy: {type_accuracy * 100:.2f}%")
print("\nType Classification report:")
print(classification_report(y_test_type, y_pred_type))


Type Model Accuracy: 97.14%

Type Classification report:
              precision    recall  f1-score   support

 Affirmation       0.97      0.98      0.98       285
    Negation       0.98      0.96      0.97       205

    accuracy                           0.97       490
   macro avg       0.97      0.97      0.97       490
weighted avg       0.97      0.97      0.97       490



In [4]:
# Train factual/subjective model
X_train_factual, X_test_factual, y_train_factual, y_test_factual = train_test_split(
    X_tfidf, df['Factual/Subjective'], test_size=0.2, random_state=42)

factual_model = RandomForestClassifier(n_estimators=100, random_state=42)
factual_model.fit(X_train_factual, y_train_factual)

# Evaluate factual/subjective model
y_pred_factual = factual_model.predict(X_test_factual)
factual_accuracy = accuracy_score(y_test_factual, y_pred_factual)
print(f"\nFactual/Subjective Model Accuracy: {factual_accuracy * 100:.2f}%")
print("\nFactual/Subjective Classification report:")
print(classification_report(y_test_factual, y_pred_factual))


Factual/Subjective Model Accuracy: 93.06%

Factual/Subjective Classification report:
              precision    recall  f1-score   support

     Factual       0.92      0.93      0.92       224
  Subjective       0.94      0.93      0.94       266

    accuracy                           0.93       490
   macro avg       0.93      0.93      0.93       490
weighted avg       0.93      0.93      0.93       490



In [5]:
# Save all models and the vectorizer
joblib.dump(sentiment_model, 'sentiment_model.pkl')
joblib.dump(type_model, 'type_model.pkl')
joblib.dump(factual_model, 'factual_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

## Classify example sentece

In [10]:
# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


# Preprocessing function
def preprocess_text(sentence):
    sentence = sentence.lower()  # Convert to lowercase
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(sentence)  # Tokenization
    #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    #words = [lemmatizer.lemmatize(word) for word in words]     # Lemmatization
    #words = [word for word in words if word not in stop_words]      # Stopword removal
    processed_sentence = " ".join(words)
    
    # Extract TextBlob features
    blob = TextBlob(processed_sentence)
    polarity = blob.sentiment.polarity  # Sentiment polarity (-1 to 1)
    subjectivity = blob.sentiment.subjectivity  # Subjectivity (0 = factual, 1 = subjective)
    
    return processed_sentence#, polarity, subjectivity


def analyze_sentence(sentence, sentiment_model, type_model, factual_model, tfidf):
    # Preprocess
    processed = preprocess_text(sentence)
    # Transform using the fitted vectorizer
    features = tfidf.transform([processed])
    # Get predictions from all models
    sentiment_prediction = sentiment_model.predict(features)[0]
    type_prediction = type_model.predict(features)[0]
    factual_prediction = factual_model.predict(features)[0]


    return {
        'sentence': sentence,
        'sentiment': sentiment_prediction,
        'type': type_prediction,
        'factual_subjective': factual_prediction
    }

"""
sample_sentence = "This environment doesn't feel inviting."
processed_sentence = preprocess_text(sample_sentence)

tfidf_features = tfidf.transform([processed_sentence])

result = model.predict(tfidf_features)
print(result)
"""


new_sentence = "Winning the championship was a dream come true!"
result = analyze_sentence(new_sentence, sentiment_model, type_model, factual_model, tfidf)
print("\nAnalysis for:", result['sentence'])
print(f"Sentiment: {result['sentiment']}")
print(f"Type: {result['type']}")
print(f"Factual/Subjective: {result['factual_subjective']}")


Analysis for: Winning the championship was a dream come true!
Sentiment: Euphoria
Type: Affirmation
Factual/Subjective: Subjective
