In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import gensim

# Download the NLTK resources required by this notebook.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# pos_tag change
def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The tag is matched by its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.

    Args:
        tag: POS tag string (as produced by ``nltk.pos_tag`` in this notebook).

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN``, ``wordnet.ADV``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN


def load_sentiment_words(sentiment_dic):
    """Load the sentiment word lists from an Excel workbook, keeping their casing.

    The workbook is parsed with the default ``ExcelFile.parse()`` settings.
    Column 0 is read as the negative word list and column 1 as the positive
    word list; empty cells are dropped and the remaining values are converted
    to ``str``.

    Args:
        sentiment_dic: Path (or any input accepted by ``pd.ExcelFile``) of the
            sentiment dictionary workbook.

    Returns:
        tuple[set[str], set[str]]: ``(positive_words, negative_words)``.
    """
    # Use a context manager so the workbook handle is closed even when
    # parsing raises; the original left the file open.
    with pd.ExcelFile(sentiment_dic) as workbook:
        df = workbook.parse()
    positive_words = set(df.iloc[:, 1].dropna().astype(str))
    negative_words = set(df.iloc[:, 0].dropna().astype(str))
    return positive_words, negative_words
    


In [3]:
def load_sentiment_words_2(sentiment_dic):
    """Load the sentiment word lists from an Excel workbook, lower-cased.

    The workbook is parsed with the default ``ExcelFile.parse()`` settings.
    Column 0 is read as the negative word list and column 1 as the positive
    word list. Empty cells are dropped, the remaining values are converted to
    ``str`` and lower-cased.

    Args:
        sentiment_dic: Path (or any input accepted by ``pd.ExcelFile``) of the
            sentiment dictionary workbook.

    Returns:
        tuple[set[str], set[str]]: ``(positive_words, negative_words)``, all
        lower-case.
    """
    # Close the workbook handle as soon as the sheet has been read.
    with pd.ExcelFile(sentiment_dic) as workbook:
        df = workbook.parse()

    def _clean(column):
        # Drop missing cells on the extracted Series itself. Writing the
        # result of dropna() back into the DataFrame column cannot shorten the
        # column, so the missing cells used to survive and astype(str) turned
        # them into the bogus word 'nan'.
        return set(column.dropna().astype(str).str.lower())

    positive_words = _clean(df.iloc[:, 1])
    negative_words = _clean(df.iloc[:, 0])
    return positive_words, negative_words
    

#### **1. `gensim.utils.simple_preprocess()`**

- **優點**：
  - 自動將文字轉換為小寫。
  - 只保留由字母組成的詞（去除標點符號、數字等非字母字符）；設定 `deacc=True` 時會另外去除重音符號。
  - 高效，適合處理大規模文本。

- **缺點**：
  - 不保留數字和標點符號。
  - 不會自動去除停用詞。
  - 靈活性有限。

#### **2. `nltk.tokenize.word_tokenize()`**

- **優點**：
  - 能夠保留標點符號、數字等字符。
  - 適合需要進一步自定義處理的情況。
  - 支援多語言文本。

- **缺點**：
  - 僅做分詞，不進行其他預處理 (如轉換小寫或去除標點)。

- **適用場景**：
  - 需要更精細的分詞操作，尤其是保留標點符號、數字或在多語言環境下進行處理時。



In [4]:
# English stop words, converted from the list NLTK returns into a set; used
# for the `word not in stop_words` filter in lemmatize_sentence.
stop_words = set(stopwords.words('english')) # list -> set
# Show the container type and its contents as a sanity check.
print(type(stop_words))
print(stop_words)

<class 'set'>
{"didn't", 'weren', 'both', 'no', 'over', "she's", 'hasn', "shouldn't", 'doesn', "mustn't", 'for', 'him', 'before', "hadn't", "hasn't", 'such', 'having', 'below', 'if', 'were', 'on', 'there', 'those', 'wouldn', 'other', 'o', 'll', 'some', 'me', 'just', "needn't", 'shouldn', 'it', "you're", 'and', "you'd", 'does', 'hers', 'off', 'again', 'couldn', 'too', 'her', 'against', 'while', "should've", "isn't", 'won', 'under', 'what', 'herself', 'he', "you'll", 'm', 'will', 'aren', 'wasn', 'an', 'between', 'out', 'all', 'most', 'is', 'our', 'my', 're', 'yourselves', 'the', 've', 'yourself', 'to', 'these', 'further', 'didn', 'only', "won't", 'because', 'themselves', "it's", 'at', 'had', "couldn't", 'where', 'why', 'can', 'of', 'am', 'by', 'above', 'd', 'or', 'doing', 'which', 'who', 'each', 'ourselves', "weren't", 'about', 'once', 'are', 'as', "you've", 'their', 'until', 'how', 'haven', 'own', 'up', 't', 'needn', 'same', 'be', 'did', 'have', 'any', 'mustn', 'has', "wouldn't", 'mysel

In [16]:
# Module-level WordNetLemmatizer instance, reused by lemmatize_sentence.
lemmatizer = WordNetLemmatizer()
def lemmatize_sentence(sentence): # input str
    """Tokenize a sentence, drop stop words, and lemmatize the remaining words.

    Steps:
      1. ``gensim.utils.simple_preprocess(..., deacc=True)`` splits the text
         into tokens.
      2. Tokens found in the module-level ``stop_words`` set are discarded.
      3. Each remaining word is POS-tagged on its own (without sentence
         context), the tag is mapped with ``get_wordnet_pos``, and the word is
         lemmatized with the module-level ``lemmatizer``.

    Args:
        sentence: Text to process. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as empty text.

    Returns:
        list[str]: Lemmatized words in their original order.
    """
    # Missing values (None, or the float NaN pandas uses for empty cells)
    # carry no text. Without this guard str() would turn them into "nan",
    # which then flows through the pipeline as if it were a real word.
    if sentence is None or (isinstance(sentence, float) and sentence != sentence):
        return []

    words = gensim.utils.simple_preprocess(str(sentence), deacc=True) # list of strings
    content_words = [word for word in words if word not in stop_words] # exclude stopwords
    if not content_words:
        return []

    # Tag every word as its own one-token "sentence": this yields the same
    # tags as calling nltk.pos_tag([word]) per word, but with a single
    # tagging call instead of repeating the tagger set-up for each word.
    tagged_sentences = nltk.pos_tag_sents([[word] for word in content_words])

    lemmatized_words = []
    for tagged_sentence in tagged_sentences:
        word, word_class = tagged_sentence[0] # ("word", "word class")
        wordnet_pos = get_wordnet_pos(word_class)
        lemmatized_words.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmatized_words


def determine_sentiment(sentence, positive_words, negative_words): # input str
    """Classify a sentence by counting its positive and negative words.

    The sentence is lemmatized with ``lemmatize_sentence``; every resulting
    word found in ``positive_words`` / ``negative_words`` adds one to the
    respective count (repeated words count each time).

    Args:
        sentence: Text to classify.
        positive_words: Collection of positive words (membership-tested).
        negative_words: Collection of negative words (membership-tested).

    Returns:
        tuple[str, int, int]: ``(label, positive_count, negative_count)`` where
        ``label`` is 'Positive', 'Negative', or 'Neutral' when the counts tie.
    """
    tokens = lemmatize_sentence(sentence)
    positive_count = len([token for token in tokens if token in positive_words])
    negative_count = len([token for token in tokens if token in negative_words])

    if positive_count == negative_count:
        label = 'Neutral'
    elif positive_count > negative_count:
        label = 'Positive'
    else:
        label = 'Negative'
    return label, positive_count, negative_count


In [6]:
# Test: run lemmatize_sentence on one sample headline and print the resulting
# word list together with its type.
test_sentence = "Elon Musk's daily $1 million giveaway to registered voters could be illegal, experts say"
result = lemmatize_sentence(test_sentence)
print(result,'\n',type(result))

['elon', 'musk', 'daily', 'million', 'giveaway', 'register', 'voter', 'could', 'illegal', 'expert', 'say'] 
 <class 'list'>


In [9]:
# Location of the sentiment dictionary workbook.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere without editing this line.
sentiment_dic_path = r"C:\Users\user\Desktop\Python\Thesis\NLP_update\LM字典情緒詞庫.xlsx"

# original function (keeps the casing found in the workbook)
positive_set, negative_set = load_sentiment_words(sentiment_dic_path)
# NOTE(review): this prints both complete word sets.
print(positive_set, negative_set)

# new function (lower-cases every word)
positive_set_new, negative_set_new = load_sentiment_words_2(sentiment_dic_path)
print(positive_set_new, negative_set_new)



In [10]:
import re

# Words from the original loader that contain at least one lower-case letter.
words_lower = [word for word in positive_set if re.search(r'[a-z]', word)]
print(words_lower)

# Words from the lower-casing loader that still contain an upper-case letter.
words_upper = [word for word in positive_set_new if re.search(r'[A-Z]', word)]
print(words_upper)

[]
[]


In [17]:
# Check a single lower-cased word against the lower-cased dictionaries and
# print the label with the positive/negative counts.
sentence = "HAPPINESS".lower()
sentiment, pos_num, neg_num = determine_sentiment(sentence, positive_set_new, negative_set_new)
print(sentiment, pos_num, neg_num)

Positive 1 0


In [18]:
def analyze_csv(input_file, output_file, positive_words, negative_words):
    """Label every sentence in a CSV file and save the labelled table.

    Reads ``input_file``, drops columns whose name starts with "Unnamed",
    adds an 'LM label' column holding the tuple returned by
    ``determine_sentiment`` for each value of the 'sentence' column, writes
    the table to ``output_file`` without the index, then re-reads that file
    and prints its first five rows as a check.

    Read and write failures are reported with ``print`` and end the function
    early, so later steps never run on a missing table or a stale output
    file.

    Args:
        input_file: Path of the CSV to label; must contain a 'sentence' column.
        output_file: Path the labelled CSV is written to.
        positive_words: Positive word collection passed to ``determine_sentiment``.
        negative_words: Negative word collection passed to ``determine_sentiment``.

    Returns:
        None.

    Raises:
        KeyError: If the input table has no 'sentence' column.
    """
    try:
        df = pd.read_csv(input_file)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Nothing was loaded; carrying on would fail on an unbound `df`.
        return

    # Copy the filtered frame so adding a column below works on an
    # independent table rather than a slice of the one just read.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')].copy()
    df['LM label'] = df['sentence'].apply(lambda x: determine_sentiment(x, positive_words, negative_words))

    try:
        df.to_csv(output_file, index=False)
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        # Do not read back a file that was not (fully) written.
        return

    try:
        saved = pd.read_csv(output_file)
        print(saved.head(5))
    except Exception as e:
        # The original printed `{e}`, i.e. a set literal, not a message.
        print(f"An unexpected error occurred: {e}")


In [19]:
# Input CSV to label and the name of the labelled output file.
# NOTE(review): input_file is an absolute, machine-specific path; output_file
# is relative, so it is written to the current working directory.
input_file = r"C:\Users\user\Desktop\Python\Thesis\NLP_update\1.csv"
output_file = r"LM labeled file.csv"

# Label the sentences using the lower-cased dictionaries.
analyze_csv(input_file, output_file, positive_set_new, negative_set_new)


                                            sentence  label  \
0  $2.26 Billion Growth in Global Smart Waste Man...      2   
1  NEW YORK, March 25, 2021 /PRNewswire/ -- Techn...      0   
2  The market is segmented by application (collec...      0   
3  The report offers an in-depth analysis of rece...      0   
4                   Download Sample Report Instantly      0   

             LM label  
0   ('Neutral', 0, 0)  
1  ('Negative', 0, 1)  
2   ('Neutral', 0, 0)  
3   ('Neutral', 0, 0)  
4   ('Neutral', 0, 0)  
