In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import joblib
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score


# Import Cross-validation methods
import optuna
from optuna.integration import OptunaSearchCV
from optuna.distributions import IntDistribution, FloatDistribution, CategoricalDistribution
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, recall_score

  from .autonotebook import tqdm as notebook_tqdm


## Pre-process dataset sentences

In [2]:
# Load dataset
file_path = "dataset.csv"  # Change this to your actual dataset path
df = pd.read_csv(file_path)

# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

# Preprocessing function
def preprocess_text(sentence):
    sentence = sentence.lower()  # Convert to lowercase
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(sentence)  # Tokenization
    #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    #words = [lemmatizer.lemmatize(word) for word in words]     # Lemmanization
    #words = [word for word in words if word not in stop_words]     # Stop-word removal
    processed_sentence = " ".join(words)
    
    # Extract TextBlob features
    blob = TextBlob(processed_sentence)
    polarity = blob.sentiment.polarity  # Sentiment polarity (-1 to 1)
    subjectivity = blob.sentiment.subjectivity  # Subjectivity (0 = factual, 1 = subjective)
    
    return processed_sentence#, polarity, subjectivity

# Apply preprocessing
# Create a copy of the original dataframe
df_processed = df.copy()

# Apply preprocessing to the 'Sentence' column
df_processed['Sentence'] = df['Sentence'].apply(preprocess_text)

# Save processed data
df_processed.to_csv("processed_dataset.csv", index=False)

print("Preprocessing complete. Processed dataset saved as 'processed_dataset.csv'")
df_processed.head()

Preprocessing complete. Processed dataset saved as 'processed_dataset.csv'


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment
0,the sky is blue,Affirmation,Factual,Neutral
1,i love sunny days,Affirmation,Subjective,Happiness
2,this pizza is disgusting,Affirmation,Subjective,Anger
3,water boils at 100 degrees celsius,Affirmation,Factual,Neutral
4,i dont think this is a good idea,Negation,Subjective,Sadness


## Create model - sentiment classification

In [3]:
df = pd.read_csv("processed_dataset.csv")

# Shuffling: frac=1 shuffles all rows, random_state for reproducibility
df = df.sample(frac=1, random_state=42)
# Reset index after shuffling
df.reset_index(drop=True, inplace=True)

# Generate TF-IDF features with N-grams
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
X = tfidf.fit_transform(df['Sentence'])
#print(X)

# Train sentiment model
X_train_sentiment, X_test_sentiment, y_train_sentiment, y_test_sentiment = train_test_split(
    X, df['Sentiment'], test_size=0.2, random_state=42)

base_model = RandomForestClassifier()


# Custom scoring metric
scorer = make_scorer(accuracy_score)

param_space_random_forest = {
    "n_estimators": IntDistribution(50, 200, step=50),
    "max_depth": CategoricalDistribution([None, 10, 20]),
    "min_samples_split": IntDistribution(2, 10, step=1),
    "min_samples_leaf": IntDistribution(1, 4, step=1),
    "bootstrap": CategoricalDistribution([True, False]),
    "max_features": CategoricalDistribution([None, 'sqrt', 'log2'])
}

optuna_random_forest = OptunaSearchCV(
    base_model,
    param_space_random_forest,
    cv=StratifiedKFold(n_splits=5),
    n_trials=20,
    scoring=scorer,
    random_state=68,
    timeout=600
)

# Train the model using Optuna optimization
optuna_random_forest.fit(X_train_sentiment, y_train_sentiment)


# Get the best parameters and score
print("\nBest Parameters:", optuna_random_forest.best_params_)
print(f"Best Cross-Validation Score: {optuna_random_forest.best_score_:.4f}")

# Get the best model
sentiment_model = optuna_random_forest.best_estimator_

# Evaluate sentiment model
y_pred_sentiment = sentiment_model.predict(X_test_sentiment)
sentiment_accuracy = accuracy_score(y_test_sentiment, y_pred_sentiment)
print(f"Sentiment Model Accuracy: {sentiment_accuracy * 100:.2f}%")
print("\nSentiment Classification report:")
print(classification_report(y_test_sentiment, y_pred_sentiment))

  optuna_random_forest = OptunaSearchCV(
[I 2025-02-28 11:12:06,391] A new study created in memory with name: no-name-2f47a27a-5b4c-4a4b-b2d0-edec1d44c661
[I 2025-02-28 11:12:34,952] Trial 0 finished with value: 0.7069836630304295 and parameters: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 3, 'bootstrap': True, 'max_features': None}. Best is trial 0 with value: 0.7069836630304295.
[I 2025-02-28 11:12:35,530] Trial 1 finished with value: 0.32720783965760214 and parameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 9, 'min_samples_leaf': 4, 'bootstrap': True, 'max_features': 'log2'}. Best is trial 0 with value: 0.7069836630304295.
[I 2025-02-28 11:12:38,057] Trial 2 finished with value: 0.36089305287332324 and parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 3, 'bootstrap': False, 'max_features': 'log2'}. Best is trial 0 with value: 0.7069836630304295.
[I 2025-02-28 11:13:35,032] Trial 3


Best Parameters: {'n_estimators': 50, 'max_depth': None, 'min_samples_split': 5, 'min_samples_leaf': 1, 'bootstrap': True, 'max_features': 'sqrt'}
Best Cross-Validation Score: 0.8096
Sentiment Model Accuracy: 77.14%

Sentiment Classification report:
              precision    recall  f1-score   support

       Anger       0.77      0.70      0.73        73
    Euphoria       0.82      0.60      0.69        52
   Happiness       0.78      0.62      0.69        74
     Neutral       0.78      0.94      0.85       189
     Sadness       0.73      0.72      0.72       102

    accuracy                           0.77       490
   macro avg       0.78      0.71      0.74       490
weighted avg       0.77      0.77      0.77       490



## Polarity classification

In [4]:
# Train type model (affirmation/negation)
X_train_type, X_test_type, y_train_type, y_test_type = train_test_split(
    X, df['Type'], test_size=0.2, random_state=42)

type_model = RandomForestClassifier(n_estimators=100, random_state=42)
type_model.fit(X_train_type, y_train_type)

# Evaluate type model
y_pred_type = type_model.predict(X_test_type)
type_accuracy = accuracy_score(y_test_type, y_pred_type)
print(f"\nType Model Accuracy: {type_accuracy * 100:.2f}%")
print("\nType Classification report:")
print(classification_report(y_test_type, y_pred_type))


Type Model Accuracy: 95.51%

Type Classification report:
              precision    recall  f1-score   support

 Affirmation       0.94      0.99      0.97       306
    Negation       0.99      0.89      0.94       184

    accuracy                           0.96       490
   macro avg       0.96      0.94      0.95       490
weighted avg       0.96      0.96      0.95       490



## Subjectiveness classification

In [5]:
# Train factual/subjective model
X_train_factual, X_test_factual, y_train_factual, y_test_factual = train_test_split(
    X, df['Factual/Subjective'], test_size=0.2, random_state=42)

factual_model = RandomForestClassifier(n_estimators=100, random_state=42)
factual_model.fit(X_train_factual, y_train_factual)

# Evaluate factual/subjective model
y_pred_factual = factual_model.predict(X_test_factual)
factual_accuracy = accuracy_score(y_test_factual, y_pred_factual)
print(f"\nFactual/Subjective Model Accuracy: {factual_accuracy * 100:.2f}%")
print("\nFactual/Subjective Classification report:")
print(classification_report(y_test_factual, y_pred_factual))


Factual/Subjective Model Accuracy: 94.69%

Factual/Subjective Classification report:
              precision    recall  f1-score   support

     Factual       0.92      0.96      0.94       212
  Subjective       0.97      0.94      0.95       278

    accuracy                           0.95       490
   macro avg       0.94      0.95      0.95       490
weighted avg       0.95      0.95      0.95       490



## Export models and vectorizers

In [6]:
# Create the "saved_models" folder if it doesn't exist
folder_name = "saved_models"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

# Save the models and vectorizer to the "saved_models" folder
joblib.dump(sentiment_model, os.path.join(folder_name, 'sentiment_model.pkl'))
joblib.dump(type_model, os.path.join(folder_name, 'type_model.pkl'))
joblib.dump(factual_model, os.path.join(folder_name, 'factual_model.pkl'))
joblib.dump(tfidf, os.path.join(folder_name, 'tfidf_vectorizer.pkl'))

print(f"Models and vectorizer saved to the '{folder_name}' folder.")

Models and vectorizer saved to the 'saved_models' folder.


## Classify example sentece

In [7]:
# Initialize tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


# Preprocessing function
def preprocess_text(sentence):
    sentence = sentence.lower()  # Convert to lowercase
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    words = word_tokenize(sentence)  # Tokenization
    #words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]  # Lemmatization & stopword removal
    #words = [lemmatizer.lemmatize(word) for word in words]     # Lemmatization
    #words = [word for word in words if word not in stop_words]      # Stopword removal
    processed_sentence = " ".join(words)
    
    # Extract TextBlob features
    blob = TextBlob(processed_sentence)
    polarity = blob.sentiment.polarity  # Sentiment polarity (-1 to 1)
    subjectivity = blob.sentiment.subjectivity  # Subjectivity (0 = factual, 1 = subjective)
    
    return processed_sentence#, polarity, subjectivity


def analyze_sentence(sentence, sentiment_model, type_model, factual_model, tfidf):
    # Preprocess
    processed = preprocess_text(sentence)
    # Transform using the fitted vectorizer
    features = tfidf.transform([processed])
    # Get predictions from all models
    sentiment_prediction = sentiment_model.predict(features)[0]
    type_prediction = type_model.predict(features)[0]
    factual_prediction = factual_model.predict(features)[0]


    return {
        'sentence': sentence,
        'sentiment': sentiment_prediction,
        'type': type_prediction,
        'factual_subjective': factual_prediction
    }

"""
sample_sentence = "This environment doesn't feel inviting."
processed_sentence = preprocess_text(sample_sentence)

tfidf_features = tfidf.transform([processed_sentence])

result = model.predict(tfidf_features)
print(result)
"""


new_sentence = "Winning the championship was a dream come true!"
result = analyze_sentence(new_sentence, sentiment_model, type_model, factual_model, tfidf)
print("\nAnalysis for:", result['sentence'])
print(f"Sentiment: {result['sentiment']}")
print(f"Type: {result['type']}")
print(f"Factual/Subjective: {result['factual_subjective']}")


Analysis for: Winning the championship was a dream come true!
Sentiment: Euphoria
Type: Affirmation
Factual/Subjective: Subjective
