In [16]:
import pandas as pd
from Load_Dic import load_masterdictionary
from sklearn.metrics import f1_score, recall_score, precision_score

## Load Data

In [17]:
file_path = 'Data/data.csv'  
df = pd.read_csv(file_path)

In [18]:
df

Unnamed: 0,Sentence,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
3,According to the Finnish-Russian Chamber of Co...,neutral
4,The Swedish buyout firm has sold its remaining...,neutral
...,...,...
5837,RISING costs have forced packaging producer Hu...,negative
5838,Nordic Walking was first used as a summer trai...,neutral
5839,"According shipping company Viking Line , the E...",neutral
5840,"In the building and home improvement trade , s...",neutral


In [19]:
nan_counts = df.isna().sum()

print(nan_counts)

Sentence     0
Sentiment    0
dtype: int64


In [20]:
len(df)

5842

## Load Dictionary

In [22]:
dictionary_path = r'Data/Loughran-McDonald_MasterDictionary_1993-2023.csv'

In [23]:
master_dictionary, md_header, sentiment_categories, sentiment_dictionaries, stopwords, total_documents = load_masterdictionary(dictionary_path, True, None, True)



 ...Loading Master Dictionary 5,000

 ...Loading Master Dictionary 85,000
Master Dictionary loaded from file:
  Data/Loughran-McDonald_MasterDictionary_1993-2023.csv

  master_dictionary has 86,553 words.



## Classification of sentences

In [24]:
def classify_sentence(sentence, sentiment_dictionaries):
    words = sentence.upper().split()
    sentiment_scores = {'positive': 0, 'negative': 0}

    for word in words:
        if word in sentiment_dictionaries['positive']:
            sentiment_scores['positive'] += 1
        elif word in sentiment_dictionaries['negative']:
            sentiment_scores['negative'] += 1

    if sentiment_scores['positive'] > sentiment_scores['negative']:
        return 'positive'
    elif sentiment_scores['negative'] > sentiment_scores['positive']:
        return 'negative'
    else:
        return 'neutral'

In [25]:
df['predicted_sentiment'] = df['Sentence'].apply(lambda x: classify_sentence(x, sentiment_dictionaries))


In [26]:
accuracy = (df['predicted_sentiment'] == df['Sentiment']).mean()
print(f"Accuracy: {accuracy}")

Accuracy: 0.5619650804519001


In [27]:
# Compute F1 score and recall
f1 = f1_score(df['Sentiment'], df['predicted_sentiment'], average='weighted', labels=['positive', 'negative', 'neutral'])
recall = recall_score(df['Sentiment'], df['predicted_sentiment'], average='weighted', labels=['positive', 'negative', 'neutral'])
precision = precision_score(df['Sentiment'], df['predicted_sentiment'], average='weighted', labels=['positive', 'negative', 'neutral'])

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 0.587716029393037
Recall: 0.5619650804519001
F1 Score: 0.5237982522174053


In [28]:
# Sample data simulating the Loughran-McDonald dictionary structure
data = {
    'Word': ['GOOD', 'BAD', 'ENTHOUSIASTIC', 'DOUBT'],
    'Negative': [0, 1, 0, 0],
    'Positive': [1, 0, 1, 0],
    'Uncertainty': [0, 0, 0, 1]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Filter relevant columns
relevant_columns = ['Word', 'Negative', 'Positive', 'Uncertainty']
filtered_df = df[relevant_columns]



filtered_df

Unnamed: 0,Word,Negative,Positive,Uncertainty
0,GOOD,0,1,0
1,BAD,1,0,0
2,ENTHOUSIASTIC,0,1,0
3,DOUBT,0,0,1
