In [55]:
pip install nltk pandas requests

Note: you may need to restart the kernel to use updated packages.


In [57]:
import nltk
nltk.download('punkt')       # For sentence and word tokenization
nltk.download('wordnet')     # For accessing WordNet
nltk.download('omw-1.4')    # Open Multilingual Wordnet (needed for some WordNet functions)
nltk.download('averaged_perceptron_tagger')  # For part-of-speech tagging
nltk.download('punkt_tab')  # Download punkt_tab resource

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\menno\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\menno\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\menno\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\menno\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\menno\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [70]:
import requests
import io
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string

# --- Step 2: Load and Inspect the Emotion Lexicon ---

def load_nrc_lexicon(file_path=None):
    """Loads the NRC Emotion Lexicon from a local file and returns a DataFrame.
       Handles both XLSX and TSV/CSV formats.
    """
    if file_path is None:
        raise ValueError("Must provide a file path.")

    try:
        # Load from a local file
        if file_path.endswith('.xlsx'):
            excel_file = pd.ExcelFile(file_path)
            print("Available sheets:", excel_file.sheet_names)
            sheet_name = excel_file.sheet_names[0] # Get the *actual* first sheet name
            print(f"Using sheet: {sheet_name}")
            df = excel_file.parse(sheet_name)
        elif file_path.endswith('.txt') or file_path.endswith('.csv'):
            df = pd.read_csv(filepath_or_buffer=file_path,sep='\t', header=0) # Added header=0 for CSV with header row

            if len(df.columns) < 11:
              print(f"Warning the file has an invalid amount of columns: {len(df.columns)} expected at least 11")
              return None
        else:
            print("Unsupported file format.  Please provide a .xlsx or .tsv/.csv file.")
            return None



        # --- Common Preprocessing (after loading) ---

        # 1. Handle inconsistent column names (KEY FIX)
        # We'll rename the columns to a consistent set, *regardless* of
        # whether they have spaces, parentheses, etc.
        column_mapping = {
            'English (en)': 'word',
            'English': 'word',  # Handle case where it's just 'English'
            'English Word': 'word',  # ***CORRECT MAPPING***
             # Add other variations if needed, based on Step 1 output
            'Positive': 'positive',
            'Negative': 'negative',
            'Anger': 'anger',
            'Anticipation': 'anticipation',
            'Disgust': 'disgust',
            'Fear': 'fear',
            'Joy': 'joy',
            'Sadness': 'sadness',
            'Surprise': 'surprise',
            'Trust': 'trust'
        }

        # Rename columns, only if they exist in the DataFrame
        for original, new in column_mapping.items():
            if original in df.columns:
                df = df.rename(columns={original: new})

        # 2. Filter for English Words (if the column exists) and remove rows with missing 'word'
        if 'word' in df.columns:
            # Check if other language identifier columns also exist
            if 'English (en)' in df.columns:
                df = df[df['English (en)'] == 1]  # Keep only English words
            df = df[df['word'].notna()]  # Drop rows with missing 'word' values
        else:
            print("Error: 'word' column not found after renaming.")
            return None

        # 3. Select only the required columns.
        required_columns = ['word', 'positive', 'negative', 'anger', 'anticipation',
                         'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
        # Get a list of the columns present in the dataframe, from the required columns.
        existing_columns = [col for col in required_columns if col in df.columns]

        df = df[existing_columns]



        return df  # Return the DataFrame

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def preprocess_text(text):
    """Lowercase, tokenize, remove stop words and punctuation, and lemmatize."""
    try:
        # Explicitly load the Punkt sentence tokenizer
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        # Tokenize into sentences, *then* into words.
        sentences = sent_tokenizer.tokenize(text.lower())
        tokens = []
        for sent in sentences:
            words = word_tokenize(sent, language='english')  # Pass language here
            tokens.extend(words)

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens

    except LookupError as e:
        print(f"LookupError in preprocess_text: {e}")
        #Print helpful information:
        print(f"NLTK Data Path: {nltk.data.path}")
        import os
        print(f"NLTK_DATA environment variable: {os.environ.get('NLTK_DATA')}")
        print(f"Does the punkt file exist where expected? {os.path.exists(nltk.data.find('tokenizers/punkt/PY3/english.pickle'))}")
        return []  # Return an empty list on error
    except Exception as e:
        print(f"Unexpected error in preprocess_text: {e}")
        return []


def analyze_sentiment(text, lexicon_df):
    """Analyzes the sentiment of a text using the loaded lexicon data.

    Args:
        text (str): The text to analyze.
        lexicon_df (pd.DataFrame): The emotion lexicon DataFrame.

    Returns:
        dict: A dictionary of emotion scores for the text.
    """
    tokens = preprocess_text(text)
    emotion_scores = {
        'positive': 0,
        'negative': 0,
        'anger': 0,
        'anticipation': 0,
        'disgust': 0,
        'fear': 0,
        'joy': 0,
        'sadness': 0,
        'surprise': 0,
        'trust': 0
    }

    if lexicon_df.empty:
        print("Warning: Lexicon is empty. Returning zero scores.")
        return emotion_scores

    for word in tokens:
        # Find *all* rows matching the word (case-insensitive)
        # Convert the word column to lowercase for matching, and match word with lowercase.
        matching_rows = lexicon_df[lexicon_df['word'].str.lower() == word.lower()]

        for _, row in matching_rows.iterrows():  # Iterate through matching rows
            for emotion in emotion_scores.keys():
                # Get the emotion score directly from the DataFrame
                emotion_scores[emotion] += int(row[emotion])  # Ensure it's an integer

    return emotion_scores

def expand_lexicon(lexicon_df):
    """Expands the lexicon DataFrame with synonyms from WordNet."""
    new_rows = []  # List to store new rows

    for _, row in lexicon_df.iterrows():  # Iterate over rows directly
        word = row['word']
        # Check if the word is a string
        if isinstance(word, str):
            emotions = row.drop('word').to_dict() #drop word, to iterate over emotions
            for synset in wordnet.synsets(word):
                for lemma in synset.lemmas():
                    lemma_name = lemma.name().replace("_", " ")  # Clean up lemma name
                    # Check if the lemma already exists (case-insensitive)
                    if lemma_name.lower() not in lexicon_df['word'].str.lower().values:
                      new_row = {'word': lemma_name}
                      new_row.update(emotions) # add all emotions columns
                      new_rows.append(new_row) #add to the new rows

    # Create a DataFrame from the new rows
    new_df = pd.DataFrame(new_rows)
    # Concatenate the original DataFrame with the new rows, ignore original index.
    expanded_lexicon_df = pd.concat([lexicon_df, new_df], ignore_index=True)
    expanded_lexicon_df = expanded_lexicon_df.drop_duplicates(subset=['word'], keep='first')
    return expanded_lexicon_df

# --- Main Program ---

# Local file path (USE THIS FOR NOW)
file_path = r"C:\Users\menno\Source\Repos\ML Emotions\NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx"
emotion_lexicon_df = load_nrc_lexicon(file_path=file_path)

if emotion_lexicon_df is not None:
    # LIMIT TO A SAMPLE FOR TESTING
    sample_lexicon_df = emotion_lexicon_df.head(100)  # First 100 rows
    #sample_lexicon_df = emotion_lexicon_df.sample(n=100) # you can use random sample instead of head.

    print(f"Original Lexicon Size: {len(emotion_lexicon_df)}")
    print(f"Sample Lexicon Size: {len(sample_lexicon_df)}")

    expanded_lexicon_df = expand_lexicon(sample_lexicon_df)  # Use the SAMPLE
    print(f"Expanded Lexicon Size: {len(expanded_lexicon_df)}")

    # Example Usage (using the sample)
    text1 = "This is a wonderfully happy and joyful day!"
    text2 = "I am feeling sad, angry, and filled with fear."
    text3 = "The movie was okay.  It wasn't amazing, but not terrible."

    scores1 = analyze_sentiment(text1, expanded_lexicon_df)  # Use expanded lexicon
    scores2 = analyze_sentiment(text2, expanded_lexicon_df)
    scores3 = analyze_sentiment(text3, expanded_lexicon_df)

    print(f"Text 1 Scores: {scores1}")
    print(f"Text 2 Scores: {scores2}")
    print(f"Text 3 Scores: {scores3}")

    print(emotion_lexicon_df[emotion_lexicon_df['word'] == 'grief'])
    print(expanded_lexicon_df[expanded_lexicon_df['word'] == 'sorrow'])

else:
    print("Failed to load emotion lexicon.")

Available sheets: ['NRC-Emotion-Lexicon-v0.92-InMan']
Using sheet: NRC-Emotion-Lexicon-v0.92-InMan


  warn(msg)


Original Lexicon Size: 14181
Sample Lexicon Size: 100
Expanded Lexicon Size: 521
Text 1 Scores: {'positive': 0, 'negative': 0, 'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 0, 'joy': 0, 'sadness': 0, 'surprise': 0, 'trust': 0}
Text 2 Scores: {'positive': 0, 'negative': 0, 'anger': 0, 'anticipation': 0, 'disgust': 0, 'fear': 0, 'joy': 0, 'sadness': 0, 'surprise': 0, 'trust': 0}
Text 3 Scores: {'positive': 0, 'negative': 1, 'anger': 0, 'anticipation': 0, 'disgust': 1, 'fear': 1, 'joy': 0, 'sadness': 0, 'surprise': 0, 'trust': 0}
       word  positive  negative  anger  anticipation  disgust  fear  joy  \
5627  grief         0         1      0             0        0     0    0   

      sadness  surprise  trust  
5627        1         0      0  
Empty DataFrame
Columns: [word, positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, trust]
Index: []


In [71]:
# ... (All your import statements) ...

# ... (Your load_nrc_lexicon function - as in the previous, corrected code) ...
# ... (Your expand_lexicon function - as in the previous, corrected code) ...
# NO NEED TO REDEFINE THESE FUNCTIONS EVERY TIME

# --- Lexicon Creation (Run ONCE) ---
#
# Load the original lexicon from your file.  Use *either* the URL
# or the file_path, but NOT both at the same time.
#
# FOR URL (if you get the server issues sorted out):
# emotion_lexicon_df = load_nrc_lexicon(url="https://saifmohammad.com/WebDocs/NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx")

# FOR LOCAL FILE (use this for now):
emotion_lexicon_df = load_nrc_lexicon(file_path=r"C:\Users\menno\Source\Repos\ML Emotions\NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx")

if emotion_lexicon_df is not None:
    print(f"Original Lexicon Size: {len(emotion_lexicon_df)}")

    # Expand with synonyms (this is the slow part)
    expanded_lexicon_df = expand_lexicon(emotion_lexicon_df)
    print(f"Expanded Lexicon Size: {len(expanded_lexicon_df)}")

    # Save the expanded lexicon to a CSV file
    expanded_lexicon_df.to_csv("expanded_nrc_lexicon.csv", index=False)
    print("Expanded lexicon saved to expanded_nrc_lexicon.csv")

else:
    print("Failed to load the emotion lexicon.")

Available sheets: ['NRC-Emotion-Lexicon-v0.92-InMan']
Using sheet: NRC-Emotion-Lexicon-v0.92-InMan


  warn(msg)


Original Lexicon Size: 14181
Expanded Lexicon Size: 35348
Expanded lexicon saved to expanded_nrc_lexicon.csv


In [76]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# --- Function Definitions (Keep these) ---

def preprocess_text(text):
    """Lowercase, tokenize, remove stop words and punctuation, and lemmatize."""
    try:
        # Explicitly load the Punkt sentence tokenizer
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        # Tokenize into sentences, *then* into words.  PASS THE TOKENIZER!
        sentences = sent_tokenizer.tokenize(text.lower())
        tokens = []
        for sent in sentences:
            words = word_tokenize(sent, language='english')  # Pass language here
            tokens.extend(words)

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens

    except LookupError as e:
        print(f"LookupError in preprocess_text: {e}")
        #Print helpful information:
        print(f"NLTK Data Path: {nltk.data.path}")
        import os
        print(f"NLTK_DATA environment variable: {os.environ.get('NLTK_DATA')}")
        print(f"Does the punkt file exist where expected? {os.path.exists(nltk.data.find('tokenizers/punkt/PY3/english.pickle'))}")
        return []  # Return an empty list on error
    except Exception as e:
        print(f"Unexpected error in preprocess_text: {e}")
        return []


def analyze_sentiment(text, lexicon_df):
    """Analyzes the sentiment of a text using the loaded lexicon data.

    Args:
        text (str): The text to analyze.
        lexicon_df (pd.DataFrame): The emotion lexicon DataFrame.

    Returns:
        dict: A dictionary of emotion scores for the text.
    """
    tokens = preprocess_text(text)
    emotion_scores = {
        'positive': 0,
        'negative': 0,
        'anger': 0,
        'anticipation': 0,
        'disgust': 0,
        'fear': 0,
        'joy': 0,
        'sadness': 0,
        'surprise': 0,
        'trust': 0
    }

    if lexicon_df.empty:
        print("Warning: Lexicon is empty. Returning zero scores.")
        return emotion_scores

    for word in tokens:
        # Case-insensitive check if the word exists in the lexicon
        if word.lower() in lexicon_df['word'].str.lower().values:
            word_row = lexicon_df[lexicon_df['word'].str.lower() == word.lower()].iloc[0]
            for emotion in emotion_scores.keys():
                emotion_scores[emotion] += int(word_row[emotion])

    return emotion_scores
# --- Main Program: Sentiment Analysis ---

# Load the *expanded* lexicon from the CSV file
expanded_lexicon_df = pd.read_csv("expanded_nrc_lexicon.csv")  # Load the SAVED lexicon

# Example Usage (using the sample)
text1 = "This is a wonderfully happy and joyful day!"
text2 = "I am feeling sad, angry, and filled with fear."
text3 = "The movie was okay.  It wasn't amazing, but not terrible."
text4 = "The unexpected gift filled me with joy and surprise! I was so grateful."
text5 = "He felt abandoned and betrayed by his closest friends.  The injustice of it all made him furious."
text6 = "The looming deadline and the overwhelming workload created a sense of dread and anxiety."

scores1 = analyze_sentiment(text1, expanded_lexicon_df)  # Use expanded lexicon
scores2 = analyze_sentiment(text2, expanded_lexicon_df)
scores3 = analyze_sentiment(text3, expanded_lexicon_df)
scores4 = analyze_sentiment(text4, expanded_lexicon_df)
scores5 = analyze_sentiment(text5, expanded_lexicon_df)
scores6 = analyze_sentiment(text6, expanded_lexicon_df)


print(f"Text 1 Scores: {scores1}")
print(f"Text 2 Scores: {scores2}")
print(f"Text 3 Scores: {scores3}")
print(f"Text 4 Scores: {scores4}")
print(f"Text 5 Scores: {scores5}")
print(f"Text 6 Scores: {scores6}")


print(emotion_lexicon_df[emotion_lexicon_df['word'] == 'grief'])
print(expanded_lexicon_df[expanded_lexicon_df['word'] == 'sorrow'])

Text 1 Scores: {'positive': 3, 'negative': 0, 'anger': 0, 'anticipation': 1, 'disgust': 0, 'fear': 0, 'joy': 3, 'sadness': 0, 'surprise': 1, 'trust': 2}
Text 2 Scores: {'positive': 1, 'negative': 4, 'anger': 4, 'anticipation': 1, 'disgust': 3, 'fear': 3, 'joy': 1, 'sadness': 2, 'surprise': 1, 'trust': 1}
Text 3 Scores: {'positive': 1, 'negative': 2, 'anger': 2, 'anticipation': 0, 'disgust': 2, 'fear': 2, 'joy': 1, 'sadness': 2, 'surprise': 0, 'trust': 1}
Text 4 Scores: {'positive': 5, 'negative': 1, 'anger': 0, 'anticipation': 2, 'disgust': 0, 'fear': 2, 'joy': 4, 'sadness': 0, 'surprise': 3, 'trust': 0}
Text 5 Scores: {'positive': 1, 'negative': 3, 'anger': 3, 'anticipation': 0, 'disgust': 1, 'fear': 1, 'joy': 1, 'sadness': 1, 'surprise': 0, 'trust': 1}
Text 6 Scores: {'positive': 2, 'negative': 2, 'anger': 1, 'anticipation': 2, 'disgust': 0, 'fear': 2, 'joy': 0, 'sadness': 1, 'surprise': 0, 'trust': 0}
       word  positive  negative  anger  anticipation  disgust  fear  joy  \
5627  

Here's a plan to do that, combining best practices for both version control and continued development:

1. Save Point (Working Code - Local File Loading):

This is the complete, working code that loads the lexicon from a local Excel file, expands it with synonyms, and performs basic sentiment analysis with negation handling. This is your stable baseline.

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
import os

# --- Function Definitions ---

def load_nrc_lexicon(file_path=None):
    """Loads the NRC Emotion Lexicon from a local file and returns a DataFrame.
       Handles both XLSX and TSV/CSV formats.
    """
    if file_path is None:
        raise ValueError("Must provide a file path.")

    try:
        # Load from a local file
        if file_path.endswith('.xlsx'):
            excel_file = pd.ExcelFile(file_path)
            print("Available sheets:", excel_file.sheet_names)
            sheet_name = excel_file.sheet_names[0] # Get the *actual* first sheet name
            print(f"Using sheet: {sheet_name}")
            df = excel_file.parse(sheet_name)
        elif file_path.endswith('.txt') or file_path.endswith('.csv'):
            df = pd.read_csv(filepath_or_buffer=file_path,sep='\t', header=0) # Added header=0 for CSV with header row

            if len(df.columns) < 11:
              print(f"Warning the file has an invalid amount of columns: {len(df.columns)} expected at least 11")
              return None
        else:
            print("Unsupported file format.  Please provide a .xlsx or .tsv/.csv file.")
            return None



        # --- Data Cleaning and Preparation ---

        # 1. Handle inconsistent column names (KEY FIX)
        # We'll rename the columns to a consistent set, *regardless* of
        # whether they have spaces, parentheses, etc.
        column_mapping = {
            'English (en)': 'word',
            'English': 'word',  # Handle case where it's just 'English'
            'English Word': 'word',  # ***CORRECT MAPPING***
             # Add other variations if needed, based on Step 1 output
            'Positive': 'positive',
            'Negative': 'negative',
            'Anger': 'anger',
            'Anticipation': 'anticipation',
            'Disgust': 'disgust',
            'Fear': 'fear',
            'Joy': 'joy',
            'Sadness': 'sadness',
            'Surprise': 'surprise',
            'Trust': 'trust'
        }

        # Rename columns, only if they exist in the DataFrame
        for original, new in column_mapping.items():
            if original in df.columns:
                df = df.rename(columns={original: new})

        # 2. Filter for English Words (if the column exists) and remove rows with missing 'word'
        if 'word' in df.columns:
            # Check if other language identifier columns also exist
            if 'English (en)' in df.columns:
                df = df[df['English (en)'] == 1]  # Keep only English words
            df = df[df['word'].notna()]  # Drop rows with missing 'word' values
        else:
            print("Error: 'word' column not found after renaming.")
            return None

        # 3. Select only the required columns.
        required_columns = ['word', 'positive', 'negative', 'anger', 'anticipation',
                         'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
        # Get a list of the columns present in the dataframe, from the required columns.
        existing_columns = [col for col in required_columns if col in df.columns]

        df = df[existing_columns]



        return df  # Return the DataFrame

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def preprocess_text(text):
    """Lowercase, tokenize, remove stop words and punctuation, and lemmatize."""
    try:
        # Explicitly load the Punkt sentence tokenizer
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        # Tokenize into sentences, *then* into words.
        sentences = sent_tokenizer.tokenize(text.lower())
        tokens = []
        for sent in sentences:
            words = word_tokenize(sent, language='english')  # Pass language here
            tokens.extend(words)

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens

    except LookupError as e:
        print(f"LookupError in preprocess_text: {e}")
        #Print helpful information:
        print(f"NLTK Data Path: {nltk.data.path}")
        import os
        print(f"NLTK_DATA environment variable: {os.environ.get('NLTK_DATA')}")
        print(f"Does the punkt file exist where expected? {os.path.exists(nltk.data.find('tokenizers/punkt/PY3/english.pickle'))}")
        return []  # Return an empty list on error
    except Exception as e:
        print(f"Unexpected error in preprocess_text: {e}")
        return []



def analyze_sentiment(text, lexicon_df):
    """Analyzes the sentiment of a text using the loaded lexicon data."""
    tokens = preprocess_text(text)
    print(f"Final tokens for analysis: {tokens}")  # Debugging print

    emotion_scores = {
        'positive': 0,
        'negative': 0,
        'anger': 0,
        'anticipation': 0,
        'disgust': 0,
        'fear': 0,
        'joy': 0,
        'sadness': 0,
        'surprise': 0,
        'trust': 0
    }

    if lexicon_df.empty:
        print("Warning: Lexicon is empty. Returning zero scores.")
        return emotion_scores

    for word in tokens:
        # ***CRITICAL FIX: Lowercase BOTH the word and the lexicon words***
        word_lower = word.lower()
        matching_rows = lexicon_df[lexicon_df['word'].str.lower() == word_lower]

        # Iterate through the matching row(s) and sum the scores
        for _, row in matching_rows.iterrows():
            for emotion in emotion_scores.keys():
                try:
                    emotion_scores[emotion] += int(row[emotion])
                except KeyError as e:
                  print(f"KeyError: {e} not found in DataFrame. Check your column names!")
                  return {} #return empty dictionary on error.
                except ValueError as e:
                  print("ValueError")

    return emotion_scores

def expand_lexicon(lexicon_df):
    """Expands the lexicon DataFrame with synonyms from WordNet."""
    new_rows = []  # List to store new rows

    for _, row in lexicon_df.iterrows():  # Iterate over rows directly
        word = row['word']
        # Check if the word is a string
        if isinstance(word, str):
            emotions = row.drop('word').to_dict() #drop word, to iterate over emotions
            for synset in wordnet.synsets(word):
                for lemma in synset.lemmas():
                    lemma_name = lemma.name().replace("_", " ")  # Clean up lemma name
                    # Check if the lemma already exists (case-insensitive)
                    if lemma_name.lower() not in lexicon_df['word'].str.lower().values:
                      new_row = {'word': lemma_name}
                      new_row.update(emotions) # add all emotions columns
                      new_rows.append(new_row) #add to the new rows

    # Create a DataFrame from the new rows
    new_df = pd.DataFrame(new_rows)
    # Concatenate the original DataFrame with the new rows, ignore original index.
    expanded_lexicon_df = pd.concat([lexicon_df, new_df], ignore_index=True)
    expanded_lexicon_df = expanded_lexicon_df.drop_duplicates(subset=['word'], keep='first')
    return expanded_lexicon_df

# --- Main Program ---

# 1. Load Lexicon (from local file within Fabric)
file_path = r"C:\Users\menno\Source\Repos\ML Emotions\NRC-Emotion-Lexicon-v0.92-InManyLanguages-web.xlsx"  # Correct path within "Files"
emotion_lexicon_df = load_nrc_lexicon(file_path=file_path)

if emotion_lexicon_df is not None:
    # 2. Limit for Testing (Optional - remove for full processing)
    #sample_lexicon_df = emotion_lexicon_df.head(100)  # First 100 rows #Removed to use full dataset
    #sample_lexicon_df = emotion_lexicon_df.sample(n=100) # you can use random sample instead of head.

    print(f"Original Lexicon Size: {len(emotion_lexicon_df)}")
    #print(f"Sample Lexicon Size: {len(sample_lexicon_df)}") #Removed sample size

    # 3. Expand Lexicon (Optional - you can skip this initially for even faster testing)
    expanded_lexicon_df = expand_lexicon(emotion_lexicon_df)  # Use the SAMPLE
    print(f"Expanded Lexicon Size: {len(expanded_lexicon_df)}")

    # 4. Example Usage (using the *expanded* lexicon)
    text1 = "This is a wonderfully happy and joyful day!"
    text2 = "I am feeling sad, angry, and filled with fear."
    text3 = "The movie was okay.  It wasn't amazing, but not terrible."
    text4 = "I can't believe I got the promotion, but now I have so much more responsibility."
    text5 = "The movie was incredibly well-made, but it left me feeling empty inside."
    text6 = "She said she loves me, but I don't know if I can trust her anymore."
    text7 = "Winning the lottery was the best thing that ever happened to me, but now everyone wants a piece of my fortune."
    text8 = "I finally finished the marathon, but my legs feel like they're going to fall off."
    text9 = "The concert was amazing, but the crowd was overwhelming."
    text10 = "I got an A on my exam, but I feel like I didn't really learn anything."
    text11 = "He apologized for his mistake, but I still feel hurt."
    text12 = "The new job pays well, but I miss my old colleagues."
    text13 = "I love my new house, but the neighborhood is a bit too quiet for my liking."
    text14 = "I am thrilled to announce that I got the job!"
    text15 = "I am devastated by the loss of my pet."
    text16 = "I am so proud of my daughter's achievements."
    text17 = "I am furious about the unfair treatment I received."
    text18 = "I am anxious about the upcoming exam."
    text19 = "I am grateful for all the support from my friends."
    text20 = "I am disgusted by the behavior I witnessed."
    text21 = "I am excited to travel to a new country."
    text22 = "I am relieved that the surgery went well."
    text23 = "I am disappointed with the results of the project."

    scores1 = analyze_sentiment(text1, expanded_lexicon_df)  # Use expanded lexicon
    scores2 = analyze_sentiment(text2, expanded_lexicon_df)
    scores3 = analyze_sentiment(text3, expanded_lexicon_df)
    scores4 = analyze_sentiment(text4, expanded_lexicon_df)
    scores5 = analyze_sentiment(text5, expanded_lexicon_df)
    scores6 = analyze_sentiment(text6, expanded_lexicon_df)
    scores7 = analyze_sentiment(text7, expanded_lexicon_df)
    scores8 = analyze_sentiment(text8, expanded_lexicon_df)
    scores9 = analyze_sentiment(text9, expanded_lexicon_df)
    scores10 = analyze_sentiment(text10, expanded_lexicon_df)
    scores11 = analyze_sentiment(text11, expanded_lexicon_df)
    scores12 = analyze_sentiment(text12, expanded_lexicon_df)
    scores13 = analyze_sentiment(text13, expanded_lexicon_df)
    scores14 = analyze_sentiment(text14, expanded_lexicon_df)
    scores15 = analyze_sentiment(text15, expanded_lexicon_df)
    scores16 = analyze_sentiment(text16, expanded_lexicon_df)
    scores17 = analyze_sentiment(text17, expanded_lexicon_df)
    scores18 = analyze_sentiment(text18, expanded_lexicon_df)
    scores19 = analyze_sentiment(text19, expanded_lexicon_df)
    scores20 = analyze_sentiment(text20, expanded_lexicon_df)
    scores21 = analyze_sentiment(text21, expanded_lexicon_df)
    scores22 = analyze_sentiment(text22, expanded_lexicon_df)
    scores23 = analyze_sentiment(text23, expanded_lexicon_df)
    print(f"Text 1 Scores: {scores1}")
    print(f"Text 2 Scores: {scores2}")
    print(f"Text 3 Scores: {scores3}")
    print(f"Text 4 Scores: {scores4}")
    print(f"Text 5 Scores: {scores5}")
    print(f"Text 6 Scores: {scores6}")
    print(f"Text 7 Scores: {scores7}")
    print(f"Text 8 Scores: {scores8}")
    print(f"Text 9 Scores: {scores9}")
    print(f"Text 10 Scores: {scores10}")
    print(f"Text 11 Scores: {scores11}")
    print(f"Text 12 Scores: {scores12}")
    print(f"Text 13 Scores: {scores13}")
    print(f"Text 14 Scores: {scores14}")
    print(f"Text 15 Scores: {scores15}")
    print(f"Text 16 Scores: {scores16}")
    print(f"Text 17 Scores: {scores17}")
    print(f"Text 18 Scores: {scores18}")
    print(f"Text 19 Scores: {scores19}")
    print(f"Text 20 Scores: {scores20}")
    print(f"Text 21 Scores: {scores21}")
    print(f"Text 22 Scores: {scores22}")
    print(f"Text 23 Scores: {scores23}")

    print(emotion_lexicon_df[emotion_lexicon_df['word'] == 'grief'])
    print(expanded_lexicon_df[expanded_lexicon_df['word'] == 'sorrow'])

else:
    print("Failed to load emotion lexicon.")

Next steps:

Run the code.

Add a line to save the expanded lexicon to a csv file: expanded_lexicon_df.to_csv("expanded_nrc_lexicon.csv", index=False)

Change the file loading to use this new file.

Summary for Next Prompt:

To move this project forward and start a new, focused prompt, here's a summary of where we are and what the next logical steps would be:

Current Status:

Working Baseline: We have a functional Python script that:

Loads the NRC Emotion Lexicon from a local Excel file.

Preprocesses text data (lowercasing, tokenization, stop word removal, punctuation removal, lemmatization).

Expands the lexicon with synonyms using WordNet.

Calculates basic emotion scores for input text by summing the emotion scores of matching words (and their synonyms) in the lexicon.

Includes basic negation handling (inverting sentiment for words following negation terms).

Runs in a Fabric PySpark notebook environment (though we are not yet fully leveraging Spark's distributed processing capabilities).

Includes basic error handling.

Limitations:

Simple Sentiment Scoring: The current scoring method is a simple sum of emotion scores. This doesn't account for word frequency, sentence structure, or more complex linguistic phenomena.

Basic Negation: Negation handling is rudimentary, only affecting the immediately following word.

No Contextual Understanding: The system treats words in isolation, without considering their context. This leads to inaccuracies with words that have multiple meanings (polysemy) or where sentiment is expressed indirectly (sarcasm, irony).

Pandas DataFrames (Not Fully Spark): We are loading and processing the lexicon using Pandas DataFrames, which are not distributed. While this works for the current lexicon size, it won't scale efficiently to very large text datasets or lexicons. We have a working baseline, then we can move to a fully Spark-based solution.

Next Steps (for a New Prompt - Prioritized):

Here's a prioritized list of next steps, suitable for framing a new prompt:

Improved Negation Handling:

Goal: More accurately handle negation to avoid misinterpreting the sentiment of phrases like "not happy" or "didn't enjoy."

Methods:

Wider Negation Window: Extend the negation effect to more than just the immediately following word (e.g., a window of 2-3 words).

Dependency Parsing: Use a dependency parser (like spaCy) to identify the grammatical relationships between words. This is much more accurate than a fixed window. For example, in "I did not find the movie enjoyable," a dependency parser would correctly link "not" to "enjoyable," even though they are not adjacent.

Example Prompt: "How can I improve the negation handling in my Python sentiment analysis code? I'm currently using a simple flag to invert the sentiment of the next word after a negation word (like 'not'), but this isn't accurate enough. I'd like to explore using a wider window, and ideally, I'd like to use dependency parsing with spaCy to identify the words being negated more accurately. Provide code examples using spaCy, and explain how to integrate it into my existing preprocess_text and analyze_sentiment functions."

Contextual Word Embeddings (BERT):

Goal: Move beyond simple word matching and capture the meaning of words in context.

Method: Use a pre-trained BERT model (or similar transformer-based model) to generate contextual word embeddings.

Example Prompt: "I want to improve my sentiment analysis by using contextual word embeddings. I've heard that BERT is a good choice. How can I integrate a pre-trained BERT model into my existing Python code to generate word embeddings, and how would I use those embeddings to calculate sentiment scores? Provide a code example that shows how to load a BERT model (using the transformers library), get embeddings for words in a sentence, and then use those embeddings, along with my existing emotion lexicon, to calculate sentiment. I want the approach to be compatible with eventual use in a PySpark environment."

Weighted Scoring and Normalization:

Goal: Improve the scoring mechanism to be more nuanced than a simple sum.

Methods:

TF-IDF: Weight words by their Term Frequency-Inverse Document Frequency (TF-IDF). This gives more weight to words that are frequent in a document but relatively rare in the overall corpus.

Normalization: Divide the emotion scores by the total number of (processed) words in the text to account for different text lengths.

Custom Weights: Experiment with assigning different weights to different emotion categories, or to specific words.

Example Prompt: "How can I improve the sentiment scoring in my Python code? Currently, I'm just summing the emotion scores from my lexicon. I'd like to explore weighting words by TF-IDF and normalizing the scores by the length of the text. Show me how to calculate TF-IDF scores for the words in my text and use those scores to weight the emotion scores from the lexicon."

Converting fully to PySpark Dataframes

This allows for larger files and datasets to be processed

Make use of the full functionality of the Fabric environment.

Machine Learning:

Goal: Move beyond lexicon-based analysis to a more powerful, data-driven approach.

Methods: Train a machine learning classifier (e.g., Naive Bayes, SVM, Random Forest, or a neural network) on a labeled dataset of text with known sentiment/emotion labels. Use the lexicon scores, word embeddings, and other features (n-grams, POS tags) as input to the classifier.

Example prompt: What would be the best approach to use machine learning with the current code, so that the emotion of a text is detected.

Choose one of these areas to focus on in your next prompt. Don't try to do everything at once. Start with negation handling, as that's a relatively self-contained improvement that will have a noticeable impact on accuracy. Then, you can move on to more advanced techniques like contextual embeddings and machine learning.

In [7]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Using cached spacy-3.8.2.tar.gz (1.3 MB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'error'


  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [101 lines of output]
      Ignoring numpy: markers 'python_version < "3.9"' don't match your environment
      Collecting setuptools
        Using cached setuptools-76.0.0-py3-none-any.whl.metadata (6.7 kB)
      Collecting cython<3.0,>=0.25
        Using cached Cython-0.29.37-py2.py3-none-any.whl.metadata (3.1 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.11-cp313-cp313-win_amd64.whl.metadata (8.8 kB)
      Collecting preshed<3.1.0,>=3.0.2
        Using cached preshed-3.0.9.tar.gz (14 kB)
        Installing build dependencies: started
        Installing build dependencies: finished with status 'done'
        Getting requirements to build wheel: started
        Getting requirements to build wheel: finished with status 'done'
        Preparing metadata (pyproject.toml): started
        Preparing metadata (pyproject.toml): f

In [6]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string
import os
import spacy  # Import spaCy

# --- Function Definitions ---

def load_nrc_lexicon(file_path=None):
    """Loads the NRC Emotion Lexicon from a local file and returns a DataFrame.
       Handles both XLSX and TSV/CSV formats.
    """
    if file_path is None:
        raise ValueError("Must provide a file path.")

    try:
        # Load from a local file
        if file_path.endswith('.xlsx'):
            excel_file = pd.ExcelFile(file_path)
            print("Available sheets:", excel_file.sheet_names)
            sheet_name = excel_file.sheet_names[0] # Get the *actual* first sheet name
            print(f"Using sheet: {sheet_name}")
            df = excel_file.parse(sheet_name)
        elif file_path.endswith('.txt') or file_path.endswith('.csv'):
            df = pd.read_csv(filepath_or_buffer=file_path,sep='\t', header=0) # Added header=0 for CSV with header row

            if len(df.columns) < 11:
              print(f"Warning the file has an invalid amount of columns: {len(df.columns)} expected at least 11")
              return None
        else:
            print("Unsupported file format.  Please provide a .xlsx or .tsv/.csv file.")
            return None



        # --- Data Cleaning and Preparation ---

        # 1. Handle inconsistent column names (KEY FIX)
        # We'll rename the columns to a consistent set, *regardless* of
        # whether they have spaces, parentheses, etc.
        column_mapping = {
            'English (en)': 'word',
            'English': 'word',  # Handle case where it's just 'English'
            'English Word': 'word',  # ***CORRECT MAPPING***
             # Add other variations if needed, based on Step 1 output
            'Positive': 'positive',
            'Negative': 'negative',
            'Anger': 'anger',
            'Anticipation': 'anticipation',
            'Disgust': 'disgust',
            'Fear': 'fear',
            'Joy': 'joy',
            'Sadness': 'sadness',
            'Surprise': 'surprise',
            'Trust': 'trust'
        }

        # Rename columns, only if they exist in the DataFrame
        for original, new in column_mapping.items():
            if original in df.columns:
                df = df.rename(columns={original: new})

        # 2. Filter for English Words (if the column exists) and remove rows with missing 'word'
        if 'word' in df.columns:
            # Check if other language identifier columns also exist
            if 'English (en)' in df.columns:
                df = df[df['English (en)'] == 1]  # Keep only English words
            df = df[df['word'].notna()]  # Drop rows with missing 'word' values
        else:
            print("Error: 'word' column not found after renaming.")
            return None

        # 3. Select only the required columns.
        required_columns = ['word', 'positive', 'negative', 'anger', 'anticipation',
                         'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust']
        # Get a list of the columns present in the dataframe, from the required columns.
        existing_columns = [col for col in required_columns if col in df.columns]

        df = df[existing_columns]



        return df  # Return the DataFrame

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def preprocess_text(text):
    """Lowercase, tokenize, remove stop words and punctuation, and lemmatize."""
    try:
        # Explicitly load the Punkt sentence tokenizer
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

        # Tokenize into sentences, *then* into words.
        sentences = sent_tokenizer.tokenize(text.lower())
        tokens = []
        for sent in sentences:
            words = word_tokenize(sent, language='english')  # Pass language here
            tokens.extend(words)

        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        return tokens

    except LookupError as e:
        print(f"LookupError in preprocess_text: {e}")
        #Print helpful information:
        print(f"NLTK Data Path: {nltk.data.path}")
        import os
        print(f"NLTK_DATA environment variable: {os.environ.get('NLTK_DATA')}")
        print(f"Does the punkt file exist where expected? {os.path.exists(nltk.data.find('tokenizers/punkt/PY3/english.pickle'))}")
        return []  # Return an empty list on error
    except Exception as e:
        print(f"Unexpected error in preprocess_text: {e}")
        return []



def analyze_sentiment(text, lexicon_df):
    """Analyzes the sentiment of a text using the loaded lexicon data."""
    tokens = preprocess_text(text)
    print(f"Final tokens for analysis: {tokens}")  # Debugging print

    emotion_scores = {
        'positive': 0,
        'negative': 0,
        'anger': 0,
        'anticipation': 0,
        'disgust': 0,
        'fear': 0,
        'joy': 0,
        'sadness': 0,
        'surprise': 0,
        'trust': 0
    }

    if lexicon_df.empty:
        print("Warning: Lexicon is empty. Returning zero scores.")
        return emotion_scores

    for word in tokens:
        # ***CRITICAL FIX: Lowercase BOTH the word and the lexicon words***
        word_lower = word.lower()
        matching_rows = lexicon_df[lexicon_df['word'].str.lower() == word_lower]

        # Iterate through the matching row(s) and sum the scores
        for _, row in matching_rows.iterrows():
            for emotion in emotion_scores.keys():
                try:
                    emotion_scores[emotion] += int(row[emotion])
                except KeyError as e:
                  print(f"KeyError: {e} not found in DataFrame. Check your column names!")
                  return {} #return empty dictionary on error.
                except ValueError as e:
                  print("ValueError")

    return emotion_scores

def expand_lexicon(lexicon_df):
    """Expands the lexicon DataFrame with synonyms from WordNet."""
    new_rows = []  # List to store new rows

    for _, row in lexicon_df.iterrows():  # Iterate over rows directly
        word = row['word']
        # Check if the word is a string
        if isinstance(word, str):
            emotions = row.drop('word').to_dict() #drop word, to iterate over emotions
            for synset in wordnet.synsets(word):
                for lemma in synset.lemmas():
                    lemma_name = lemma.name().replace("_", " ")  # Clean up lemma name
                    # Check if the lemma already exists (case-insensitive)
                    if lemma_name.lower() not in lexicon_df['word'].str.lower().values:
                      new_row = {'word': lemma_name}
                      new_row.update(emotions) # add all emotions columns
                      new_rows.append(new_row) #add to the new rows

    # Create a DataFrame from the new rows
    new_df = pd.DataFrame(new_rows)
    # Concatenate the original DataFrame with the new rows, ignore original index.
    expanded_lexicon_df = pd.concat([lexicon_df, new_df], ignore_index=True)
    expanded_lexicon_df = expanded_lexicon_df.drop_duplicates(subset=['word'], keep='first')
    return expanded_lexicon_df

# --- Main Program ---
# (Rest of your main program using the expanded lexicon)

# Load the expanded lexicon
expanded_lexicon_df = pd.read_csv("expanded_nrc_lexicon.csv")

text1 = "This is a wonderfully happy and joyful day!"
text2 = "I am feeling sad, angry, and filled with fear."
text3 = "The movie was okay.  It wasn't amazing, but not terrible."
text4 = "I can't believe I got the promotion, but now I have so much more responsibility."
text5 = "The movie was incredibly well-made, but it left me feeling empty inside."
text6 = "She said she loves me, but I don't know if I can trust her anymore."
text7 = "Winning the lottery was the best thing that ever happened to me, but now everyone wants a piece of my fortune."
text8 = "I finally finished the marathon, but my legs feel like they're going to fall off."
text9 = "The concert was amazing, but the crowd was overwhelming."
text10 = "I got an A on my exam, but I feel like I didn't really learn anything."
text11 = "He apologized for his mistake, but I still feel hurt."
text12 = "The new job pays well, but I miss my old colleagues."
text13 = "I love my new house, but the neighborhood is a bit too quiet for my liking."
text14 = "I am thrilled to announce that I got the job!"
text15 = "I am devastated by the loss of my pet."
text16 = "I am so proud of my daughter's achievements."
text17 = "I am furious about the unfair treatment I received."
text18 = "I am anxious about the upcoming exam."
text19 = "I am grateful for all the support from my friends."
text20 = "I am disgusted by the behavior I witnessed."
text21 = "I am excited to travel to a new country."
text22 = "I am relieved that the surgery went well."
text23 = "I am disappointed with the results of the project."

scores1 = analyze_sentiment(text1, expanded_lexicon_df)  # Use expanded lexicon
scores2 = analyze_sentiment(text2, expanded_lexicon_df)
scores3 = analyze_sentiment(text3, expanded_lexicon_df)
scores4 = analyze_sentiment(text4, expanded_lexicon_df)
scores5 = analyze_sentiment(text5, expanded_lexicon_df)
scores6 = analyze_sentiment(text6, expanded_lexicon_df)
scores7 = analyze_sentiment(text7, expanded_lexicon_df)
scores8 = analyze_sentiment(text8, expanded_lexicon_df)
scores9 = analyze_sentiment(text9, expanded_lexicon_df)
scores10 = analyze_sentiment(text10, expanded_lexicon_df)
scores11 = analyze_sentiment(text11, expanded_lexicon_df)
scores12 = analyze_sentiment(text12, expanded_lexicon_df)
scores13 = analyze_sentiment(text13, expanded_lexicon_df)
scores14 = analyze_sentiment(text14, expanded_lexicon_df)
scores15 = analyze_sentiment(text15, expanded_lexicon_df)
scores16 = analyze_sentiment(text16, expanded_lexicon_df)
scores17 = analyze_sentiment(text17, expanded_lexicon_df)
scores18 = analyze_sentiment(text18, expanded_lexicon_df)
scores19 = analyze_sentiment(text19, expanded_lexicon_df)
scores20 = analyze_sentiment(text20, expanded_lexicon_df)
scores21 = analyze_sentiment(text21, expanded_lexicon_df)
scores22 = analyze_sentiment(text22, expanded_lexicon_df)
scores23 = analyze_sentiment(text23, expanded_lexicon_df)
print(f"Text 1 Scores: {scores1}")
print(f"Text 2 Scores: {scores2}")
print(f"Text 3 Scores: {scores3}")
print(f"Text 4 Scores: {scores4}")
print(f"Text 5 Scores: {scores5}")
print(f"Text 6 Scores: {scores6}")
print(f"Text 7 Scores: {scores7}")
print(f"Text 8 Scores: {scores8}")
print(f"Text 9 Scores: {scores9}")
print(f"Text 10 Scores: {scores10}")
print(f"Text 11 Scores: {scores11}")
print(f"Text 12 Scores: {scores12}")
print(f"Text 13 Scores: {scores13}")
print(f"Text 14 Scores: {scores14}")
print(f"Text 15 Scores: {scores15}")
print(f"Text 16 Scores: {scores16}")
print(f"Text 17 Scores: {scores17}")
print(f"Text 18 Scores: {scores18}")
print(f"Text 19 Scores: {scores19}")
print(f"Text 20 Scores: {scores20}")
print(f"Text 21 Scores: {scores21}")
print(f"Text 22 Scores: {scores22}")
print(f"Text 23 Scores: {scores23}")

print(expanded_lexicon_df[expanded_lexicon_df['word'] == 'grief'])
print(expanded_lexicon_df[expanded_lexicon_df['word'] == 'sorrow'])

ModuleNotFoundError: No module named 'spacy'