### NRC Lexicon

Load dataset

In [1]:
import pandas as pd

# load the dataset
dataset_path = 'amazon_sales_2023_cleaned.csv'
df = pd.read_csv(dataset_path)

# product categories to keep; options noted for this dataset:
# Health and Personal Care, Electronics, Home and Kitchen, Toys and Games,
# Clothing Shoes and Jewelry
selected_categories = ['Clothing Shoes and Jewelry']

# sentiment labels to keep; options noted for this dataset:
# positive, neutral, negative
selected_sentiment = ['negative']

# Apply both filters with a single boolean mask and take an explicit copy.
# Later cells add columns to `df`; without the copy pandas treats the frame
# as a slice of the loaded data and emits SettingWithCopyWarning.
df = df[
    df['category'].isin(selected_categories)
    & df['sentiment'].isin(selected_sentiment)
].copy()

MLflow Server

In [2]:
import mlflow
from mlflow.tracking import MlflowClient

# define the model name
model_name = "NRCLex"

# experiment that groups the runs created by this notebook
experiment_name = "sentiment-emotion"

# start MLflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment(experiment_name)

# initialize MLflow client
client = MlflowClient()

# experiment ID
experiment_id = client.get_experiment_by_name(experiment_name).experiment_id

# NOTE(review): search_runs returns a single page of results; confirm the
# experiment holds fewer runs than the page size, otherwise older versions
# could be missed when computing the next version number.
runs = client.search_runs(experiment_ids=[experiment_id])

# Only run names of the exact form "<model_name>_v<N>" belong to this model.
# Matching on the bare model name would also pick up runs of other models
# whose name merely starts with the same text (e.g. "NRCLexTuned_v7").
version_prefix = f"{model_name}_v"

# highest version already used for this model (0 when there is none)
max_version = 0
for run in runs:
    run_name = run.data.tags.get('mlflow.runName')
    if not run_name or not run_name.startswith(version_prefix):
        continue

    # everything after the prefix must be the integer version
    try:
        version = int(run_name[len(version_prefix):])
    except ValueError:
        continue  # skip if version is not an integer

    max_version = max(max_version, version)

# increase the version by adding 1
new_version = max_version + 1

# new run name
new_run_name = f"{model_name}_v{new_version}"

# Re-executing this cell in a live kernel would otherwise raise, because the
# run started by the previous execution is still active. Close it first.
if mlflow.active_run() is not None:
    mlflow.end_run()

# The run is intentionally left open so that later cells log to it.
mlflow.start_run(run_name=new_run_name)

mlflow.log_param("model name", model_name)
mlflow.log_param("dataset_name", dataset_path)
mlflow.log_param("category", selected_categories)
mlflow.log_param("sentiment", selected_sentiment)
mlflow.log_param("data size", df.shape)

(49969, 5)

NRC algorithm

In [3]:
from nrclex import NRCLex

# NRC lexicon
def classify_emotion(text):
    """Return the dominant NRC emotion label for ``text``.

    The text is scored with ``NRCLex`` and the 'positive' / 'negative'
    entries are dropped, so only the remaining emotion labels compete.

    Parameters
    ----------
    text : str
        The review text to score. Any non-string value (for example the
        NaN or None of a missing review) and blank strings are treated as
        "no emotion" instead of being passed to NRCLex.

    Returns
    -------
    str or None
        The label with the highest raw score, or None when the input is
        not usable or no emotion label was scored. On a tie, the first
        such label in the score dict's iteration order is returned.
    """
    # Guard first: a missing review would otherwise be handed to NRCLex as a
    # non-string and abort the whole DataFrame apply.
    if not isinstance(text, str) or not text.strip():
        return None

    emotions = NRCLex(text).raw_emotion_scores

    # filter out positive and negative from the emotion list
    core_emotions = {
        emotion: score
        for emotion, score in emotions.items()
        if emotion not in ('positive', 'negative')
    }

    if not core_emotions:
        return None

    # label with the highest score is the dominant emotion
    return max(core_emotions, key=core_emotions.get)

# # Alternative (disabled): threshold-based variant of classify_emotion using the NRC lexicon
# def classify_emotion(text, threshold=0.1, return_score=False):
#     emotion_obj = NRCLex(text)
    
#     # Get the emotion scores and filter out 'positive' and 'negative'
#     emotions = {k: v for k, v in emotion_obj.raw_emotion_scores.items() if k not in ['positive', 'negative']}
    
#     if not emotions:
#         return None  # Return None if no core emotions are found
    
#     # Normalize scores to sum up to 1 for easier interpretation
#     total_score = sum(emotions.values())
#     normalized_emotions = {k: v / total_score for k, v in emotions.items()} if total_score > 0 else emotions

#     # Filter emotions below the threshold (optional)
#     filtered_emotions = {k: v for k, v in normalized_emotions.items() if v >= threshold}

#     if not filtered_emotions:
#         return None  # Return None if no emotions meet the threshold
    
#     # Identify dominant emotion(s) with the highest score
#     max_score = max(filtered_emotions.values())
#     dominant_emotions = [k for k, v in filtered_emotions.items() if v == max_score]
    
#     # If return_score is True, return the dominant emotion(s) and their score(s)
#     if return_score:
#         return {emotion: filtered_emotions[emotion] for emotion in dominant_emotions}
    
#     # Return single emotion if there's only one dominant emotion, otherwise a list of emotions
#     return dominant_emotions[0] if len(dominant_emotions) == 1 else dominant_emotions

# classify every review and keep the result in a new column
df['dominant_emotion'] = df['reviewText'].map(classify_emotion)

# preview the first rows: review text next to its assigned emotion
print(df.loc[:, ['reviewText', 'dominant_emotion']].head())

                                               reviewText dominant_emotion
599540  spyglass made china original boy spyglass buy ...          disgust
599541  ordered lens fade got plain black much large l...            trust
599542  material pack cotton boy short stretch much ma...          disgust
599543  product evening one item missing instruction t...          sadness
599544  sized return get different style still love mi...              joy


Log the top 3 emotions to MLflow

In [4]:
# the three most frequent dominant emotions, most common first
top_emotions = list(df['dominant_emotion'].value_counts().nlargest(3).index)
top_emotions_str = ", ".join(top_emotions)

# log them to MLflow as a single comma-separated parameter
mlflow.log_param(key="top_emotions", value=top_emotions_str)

'anticipation, trust, sadness'

Testing

In [5]:
# from nrclex import NRCLex

# # The sentence to analyze
# text = "My 4 yr old daughter got this doll for Christmas b/c she loves play with hair & make-up. However after about a week, the hair was so unmanageable that she has not been able to play with it since. I have tried several products...conditioner, fabric softener, etc...to try to detangle the hair, but nothing works. This product was a huge disappointment & a huge waste of money. I do NOT recommend it."

# # Function to classify emotions based on NRC lexicon
# def classify_emotion(text):
#     emotion_obj = NRCLex(text)
#     # Get the emotion scores
#     emotions = emotion_obj.raw_emotion_scores

#     # Filter out 'positive' and 'negative' from the emotion list
#     core_emotions = {key: value for key, value in emotions.items() if key not in ['positive', 'negative']}
    
#     if not core_emotions:
#         return None  # If no emotion is found, return None
    
#     # Find the dominant emotion with the highest score
#     dominant_emotion = max(core_emotions, key=core_emotions.get)
#     return dominant_emotion

# # Detect the dominant emotion in the sentence
# dominant_emotion = classify_emotion(text)
# print("Dominant Emotion:", dominant_emotion)
