In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# ---- Data ----
# Read the emotion CSV, then pull out the text column (model input) and the
# sentiment column (target label).
emotion_data = pd.read_csv("DataSet/emotion_recog.csv")
text_content, sentiment = emotion_data['content'], emotion_data['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    text_content,
    sentiment,
    test_size=0.2,
    random_state=42,
)

# ---- Features ----
# The vocabulary is learned from the training text only; the test text is
# encoded with that same vocabulary.
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# ---- Model 1: multiclass logistic regression ----
# fit() hands back the fitted estimator, so construction and training are chained.
logreg_model = LogisticRegression(max_iter=1000).fit(X_train_vect, y_train)
y_pred_logreg = logreg_model.predict(X_test_vect)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Accuracy using Multiclass Logistic Regression:", accuracy_logreg)

# ---- Model 2: Gaussian Naïve Bayes ----
# The count matrices are converted to dense arrays before being passed to GaussianNB.
nb_model = GaussianNB().fit(X_train_vect.toarray(), y_train)
y_pred_nb = nb_model.predict(X_test_vect.toarray())
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print("Accuracy using Gaussian Naïve Bayes:", accuracy_nb)


Accuracy using Multiclass Logistic Regression: 0.3333333333333333
Accuracy using Gaussian Naïve Bayes: 0.27941176470588236


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the dataset
data = pd.read_csv("DataSet/emotion_recog.csv")

# Keep only the emotions (sentiment labels) that occur in at least 50 rows;
# rows belonging to rarer labels are dropped before splitting.
filtered_data = data.groupby('sentiment').filter(lambda x: len(x) >= 50)

# Text is the model input, the sentiment label is the target
X = filtered_data['content']
y = filtered_data['sentiment']

# Split the data into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline: raw text -> token counts -> TF-IDF weighting -> logistic regression.
# Keeping the vectoriser inside the pipeline means each cross-validation fold
# learns its vocabulary from that fold's training portion only.
logreg_pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression())
])

# Parameter grid explored by the search (2 x 2 x 3 = 12 combinations)
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only, or unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with / without inverse-document-frequency weighting
    'clf__max_iter': [100, 500, 1000]       # solver iteration cap
}

# 5-fold cross-validated grid search over the pipeline parameters
grid_search = GridSearchCV(logreg_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# re-trained on the whole training split with the winning parameters. Fitting
# it a second time here would only repeat that work and yield the same model.
best_model = grid_search.best_estimator_

# Predict using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy for the best model on the held-out test split
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy using Multiclass Logistic Regression (with Grid Search):", accuracy)


Accuracy using Multiclass Logistic Regression (with Grid Search): 0.45294117647058824
