In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the CSV file
df = pd.read_csv('dataset.csv')

# Shuffle the data
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment
0,I don't like it when plans change unexpectedly.,Negation,Subjective,Anger
1,I never get tired of this feeling!,Negation,Subjective,Euphoria
2,Stars are born in nebulae.,Affirmation,Factual,Neutral
3,I can't believe they did that to me.,Negation,Subjective,Anger
4,I don't like being underestimated.,Negation,Subjective,Anger


In [2]:
# Verify Class balance "Type" column
df['Type'].value_counts()  

Type
Affirmation    1435
Negation       1014
Name: count, dtype: int64

In [3]:
# Verify Class balance "Factual/Subjective" column
df['Factual/Subjective'].value_counts()  

Factual/Subjective
Subjective    1457
Factual        992
Name: count, dtype: int64

In [4]:
# Verify Class balance "Sentiment" column
df['Sentiment'].value_counts()  

Sentiment
Neutral      830
Anger        465
Sadness      442
Happiness    392
Euphoria     320
Name: count, dtype: int64

In [5]:
# Check for missing values
df.isnull().sum()

Sentence              0
Type                  0
Factual/Subjective    0
Sentiment             0
dtype: int64

In [6]:
# Verify Class balance "Type" column
print(df['Type'].value_counts())
print("-----------------------------")
# Verify Class balance "Factual/Subjective" column
print(df['Factual/Subjective'].value_counts())
print("-----------------------------")
# Verify Class balance "Sentiment" column
print(df['Sentiment'].value_counts())
print("-----------------------------")

Type
Affirmation    1435
Negation       1014
Name: count, dtype: int64
-----------------------------
Factual/Subjective
Subjective    1457
Factual        992
Name: count, dtype: int64
-----------------------------
Sentiment
Neutral      830
Anger        465
Sadness      442
Happiness    392
Euphoria     320
Name: count, dtype: int64
-----------------------------


In [7]:
# verify and print all repeated values in the column Sentence
print(df['Sentence'].value_counts())

Sentence
I don't like being rushed.                           5
I don't like being ignored.                          5
The stars twinkle in the night sky.                  4
I don't like being misunderstood.                    4
I don't like loud music.                             4
                                                    ..
Earth's core is liquid iron.                         1
I dislike how she treats people.                     1
I don’t feel rested after sleep.                     1
Crying is a healthy way to express emotions.         1
This is the most frustrating experience I've had!    1
Name: count, Length: 2094, dtype: int64


In [8]:
# Drop duplicates
df = df.drop_duplicates(subset='Sentence')

In [9]:
df

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment
0,I don't like it when plans change unexpectedly.,Negation,Subjective,Anger
1,I never get tired of this feeling!,Negation,Subjective,Euphoria
2,Stars are born in nebulae.,Affirmation,Factual,Neutral
3,I can't believe they did that to me.,Negation,Subjective,Anger
4,I don't like being underestimated.,Negation,Subjective,Anger
...,...,...,...,...
2439,The endeavor was a complete bust.,Negation,Subjective,Anger
2442,Infinity is not a number in mathematics.,Negation,Factual,Neutral
2444,I do not feel any discomfort.,Negation,Factual,Euphoria
2445,I don’t enjoy waking up early.,Negation,Subjective,Sadness


In [10]:
# Verify Class balance "Type" column
print(df['Type'].value_counts())
print("-----------------------------")
# Verify Class balance "Factual/Subjective" column
print(df['Factual/Subjective'].value_counts())
print("-----------------------------")
# Verify Class balance "Sentiment" column
print(df['Sentiment'].value_counts())
print("-----------------------------")

Type
Affirmation    1271
Negation        823
Name: count, dtype: int64
-----------------------------
Factual/Subjective
Subjective    1254
Factual        840
Name: count, dtype: int64
-----------------------------
Sentiment
Neutral      720
Anger        376
Sadness      363
Happiness    339
Euphoria     296
Name: count, dtype: int64
-----------------------------


In [15]:
# implement textblob library to calculate the polarity of each sentence
from textblob import TextBlob
df['polarity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.polarity)
df.head()


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity
0,I despise waking up early.,Affirmation,Subjective,Anger,0.1
1,Chocolate is my favorite dessert.,Affirmation,Subjective,Happiness,0.5
2,I never thought I'd feel this heartbroken.,Negation,Subjective,Sadness,0.0
3,Childhood memories warm my heart.,Affirmation,Subjective,Happiness,0.6
4,I don't like feeling useless.,Negation,Factual,Sadness,-0.5


In [16]:
# textblob library to calculate the subjectivity of each sentence
df['subjectivity'] = df['Sentence'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
df.head()


Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity,subjectivity
0,I despise waking up early.,Affirmation,Subjective,Anger,0.1,0.3
1,Chocolate is my favorite dessert.,Affirmation,Subjective,Happiness,0.5,1.0
2,I never thought I'd feel this heartbroken.,Negation,Subjective,Sadness,0.0,0.0
3,Childhood memories warm my heart.,Affirmation,Subjective,Happiness,0.6,0.6
4,I don't like feeling useless.,Negation,Factual,Sadness,-0.5,0.2


In [17]:
df

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity,subjectivity
0,I despise waking up early.,Affirmation,Subjective,Anger,0.100000,0.3
1,Chocolate is my favorite dessert.,Affirmation,Subjective,Happiness,0.500000,1.0
2,I never thought I'd feel this heartbroken.,Negation,Subjective,Sadness,0.000000,0.0
3,Childhood memories warm my heart.,Affirmation,Subjective,Happiness,0.600000,0.6
4,I don't like feeling useless.,Negation,Factual,Sadness,-0.500000,0.2
...,...,...,...,...,...,...
2442,"Their unmitigated, blatant disrespect is makin...",Affirmation,Subjective,Anger,-0.500000,0.5
2443,I don’t like cold weather.,Negation,Subjective,Sadness,-0.600000,1.0
2444,Sharks don’t live in freshwater.,Negation,Factual,Neutral,0.136364,0.5
2446,I can't stand cold coffee.,Negation,Subjective,Anger,-0.600000,1.0


In [18]:
# implement textblob to make the polarity and subjectivity columns more readable
def sentiment(x):
    if x < 0:
        return 'Negative'
    elif x == 0:
        return 'Neutral'
    else:
        return 'Positive'
    
df['polarity'] = df['polarity'].apply(lambda x: sentiment(x))
df['subjectivity'] = df['subjectivity'].apply(lambda x: sentiment(x))
df.head()

Unnamed: 0,Sentence,Type,Factual/Subjective,Sentiment,polarity,subjectivity
0,I despise waking up early.,Affirmation,Subjective,Anger,Positive,Positive
1,Chocolate is my favorite dessert.,Affirmation,Subjective,Happiness,Positive,Positive
2,I never thought I'd feel this heartbroken.,Negation,Subjective,Sadness,Neutral,Neutral
3,Childhood memories warm my heart.,Affirmation,Subjective,Happiness,Positive,Positive
4,I don't like feeling useless.,Negation,Factual,Sadness,Negative,Positive


In [11]:
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [26]:
# Load dataset
csv_path = "dataset.csv"  # Update this path
df = pd.read_csv(csv_path)

In [12]:
# Initialize sentence embedding model
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# Encode sentences into embeddings
embeddings = model.encode(df["Sentence"].tolist())

# Label encoding
type_mapping = {"Affirmation": 0, "Negation": 1}
fact_subj_mapping = {"Factual": 0, "Subjective": 1}
sentiment_mapping = {"Sadness": 0, "Anger": 1, "Neutral": 2, "Happiness": 3, "Euphoria": 4}

df["Type"] = df["Type"].map(type_mapping).fillna(-1).astype(int)
df["Factual/Subjective"] = df["Factual/Subjective"].map(fact_subj_mapping).fillna(-1).astype(int)
df["Sentiment"] = df["Sentiment"].map(sentiment_mapping).fillna(-1).astype(int)

# Convert labels to NumPy arrays
type_labels = df["Type"].values
fact_subj_labels = df["Factual/Subjective"].values
sentiment_labels = df["Sentiment"].values

In [13]:
# Stratified Train-Test Split (based on Sentiment)
X_train, X_test, y_type_train, y_type_test, y_fact_train, y_fact_test, y_sent_train, y_sent_test = train_test_split(
    embeddings, type_labels, fact_subj_labels, sentiment_labels,
    test_size=0.2, random_state=42, stratify=sentiment_labels  # Ensuring class balance
)

In [14]:
# Hyperparameter Grid for XGBoost
param_grid = {
    "n_estimators": [50, 100, 200, 500],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.1, 0.3],
    "subsample": [0.7, 1.0]
}

In [15]:
# Function to train and evaluate XGBoost with hyperparameter search
def train_xgboost(X_train, y_train, X_test, y_test, name):
    model = XGBClassifier(eval_metric="mlogloss")
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"Best XGBoost Model for {name}: {grid_search.best_params_}")
    print(f"Accuracy for {name}: {acc:.4f}\n")

    return best_model

In [16]:
# Train and evaluate XGBoost models
type_model = train_xgboost(X_train, y_type_train, X_test, y_type_test, "Type Classification")

Best XGBoost Model for Type Classification: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
Accuracy for Type Classification: 0.9451



In [17]:
fact_model = train_xgboost(X_train, y_fact_train, X_test, y_fact_test, "Factual/Subjective Classification")



Best XGBoost Model for Factual/Subjective Classification: {'learning_rate': 0.3, 'max_depth': 7, 'n_estimators': 500, 'subsample': 1.0}
Accuracy for Factual/Subjective Classification: 0.9260



In [18]:
sentiment_model = train_xgboost(X_train, y_sent_train, X_test, y_sent_test, "Sentiment Classification")

Best XGBoost Model for Sentiment Classification: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500, 'subsample': 0.7}
Accuracy for Sentiment Classification: 0.8663

