# Libraries

In [55]:
import pandas as pd
from nrclex import NRCLex
import numpy as np
import nltk
import textblob
from textblob import TextBlob
import stopwords
from nltk.corpus import stopwords
from afinn import Afinn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# nltk.download('punkt')
from textblob.classifiers import NaiveBayesClassifier


In [53]:
# AFINN scorer used by the demo loop that follows the sample sentences.
afinn = Afinn()

# Hand-written sample sentences for a quick sanity check of the AFINN scores
# and the score-to-category mapping.
texts = [
    "I absolutely loved the new movie! The acting was superb, and the storyline kept me engaged from start to finish",
    "This product is terrible and a waste of money. I'm very disappointed.",
    "The new google phone is okay, not great but not bad either.",
    "New google buds are a no go. It's a complete failure."
]

def categorize_sentiment(score):
    """Map a numeric sentiment score onto a coarse category name.

    Scores above 5 are "Joy", other positive scores are "Positive", exactly 0
    is "Neutral", scores below -5 are "Anger", and everything else is
    "Negative".
    """
    if score > 5:
        return "Joy"
    if score > 0:
        return "Positive"
    if score == 0:
        return "Neutral"
    # Only non-positive, non-zero scores reach this point.
    return "Anger" if score < -5 else "Negative"
# `tabulate` is called below but is not imported in the notebook's import cell,
# so import it here to keep this cell runnable on a fresh kernel.
from tabulate import tabulate

# Score every sample sentence with AFINN and attach its category.
results = []
for text in texts:
    score = afinn.score(text)
    category = categorize_sentiment(score)
    results.append([text, score, category])

headers = ["Text", "Sentiment Score", "Sentiment Category"]
table = tabulate(results, headers, tablefmt="grid")

# Print the table
print(table)


+-----------------------------------------------------------------------------------------------------------------+-------------------+----------------------+
| Text                                                                                                            |   Sentiment Score | Sentiment Category   |
| I absolutely loved the new movie! The acting was superb, and the storyline kept me engaged from start to finish |                 8 | Joy                  |
+-----------------------------------------------------------------------------------------------------------------+-------------------+----------------------+
| This product is terrible and a waste of money. I'm very disappointed.                                           |                -6 | Anger                |
+-----------------------------------------------------------------------------------------------------------------+-------------------+----------------------+
| The new google phone is okay, not great but 

In [None]:
# Load the comments dataset: semicolon-separated, decoded as Latin-1.
file = "new_df.csv"
df = pd.read_csv(file, sep=';', encoding="latin1")
# display(df)

# NLP Emotions Lexicon-based

## NRCLEX


In [None]:
# Print NLTK's English stop-word list.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded first - confirm.
print(stopwords.words('english'))

In [None]:
# Run NRCLex on every comment and keep its `affect_frequencies` value per row.
# NOTE(review): assumes every value in `comment` is a string - confirm there are no missing comments.
emotion_frequencies = df['comment'].apply(lambda text: NRCLex(text).affect_frequencies)
df['emotions_NRCLex'] = emotion_frequencies
display(df)

In [None]:
# Expand the per-row emotion values into separate columns, then swap the
# original `emotions_NRCLex` column for those expanded columns.
emotion_columns = df['emotions_NRCLex'].apply(pd.Series)
remaining_columns = df.drop(columns=['emotions_NRCLex'])
df = pd.concat([remaining_columns, emotion_columns], axis=1)
df.head()


In [None]:
# Save the NRCLex-augmented frame as a semicolon-separated CSV.
# index=False is not passed, so the DataFrame index is written as the first column.
df.to_csv("NrclexLexicon.csv", sep=';')

In [None]:

# def get_sentiment(sentiment):
#     # Check if sentiment is a dictionary
#     if isinstance(sentiment, dict):
#         if sentiment.get('positive', 0) > sentiment.get('negative', 0):
#             return 'Positive'
#         elif sentiment.get('positive', 0) < sentiment.get('negative', 0):
#             return 'Negative'
#         else:
#             return 'Neutral'
   
    
    
# df['sentiment'] = df['comment'].apply(get_sentiment)
# display(df)
# df.to_csv("NrclexLexicon.csv", sep=';')

## TextBlob

In [None]:

# Function to get the polarity and subjectivity
def analyze_sentiment(comment):
    """Return TextBlob's polarity and subjectivity for ``comment``.

    The two values are returned as a ``pd.Series`` in the order
    (polarity, subjectivity).
    """
    sentiment = TextBlob(comment).sentiment
    return pd.Series([sentiment.polarity, sentiment.subjectivity])

# Function to label the sentiment based on polarity
def label_sentiment(polarity):
    """Return "Positive", "Negative" or "Neutral" from the sign of ``polarity``.

    Note: a function with the same name is defined again in the AFINN section
    of this notebook, and that later definition replaces this one once its
    cell has run.
    """
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"

# Apply the function to the comments column
# (analyze_sentiment returns a two-element Series per row, which fills both columns).
df[['polarity', 'subjectivity']] = df['comment'].apply(analyze_sentiment)

# Apply the labeling function to the polarity column
df['sentiment'] = df['polarity'].apply(label_sentiment)
display(df)
# Pass the separator by keyword: positional arguments after the path are
# deprecated in recent pandas, and every other to_csv call in this notebook
# already uses sep=';'.
df.to_csv('TextBlobLexicon.csv', sep=';')

In [None]:
# comments = np.array(df['comment'])
# test_comments = comments[25000:]
# sample_comments_ids = [1485, 7778, 12397,500, 200]

In [None]:
# Iterate over the sample comments and calculate their sentiment polarity
# for sample_id in sample_comments_ids:
#     comment = test_comments[sample_id]
#     print('Comment ID:', sample_id)
#     print("Comment:", comment)
#     print('Predicted sentiment polarity:', textblob.TextBlob(comment).sentiment.polarity)
#     print('*'*50)


In [None]:
# sentiment_polarity = [textblob.TextBlob(comment).sentiment.polarity for comment in test_comments]
# predicted_sentiment = ['pozitiv' if score> 0.01 else 'negativ' for score in sentiment_polarity]

In [None]:

# df['sentiment'] = df['comment'].apply(lambda x: TextBlob(str(x)).sentiment.polarity if pd.notnull(x) else None)
# df_sorted_descending= df.sort_values(by="sentiment", ascending=False)
# df.to_csv("TextBlobLexicon.csv", sep=';')
# display(df_sorted_descending)


## AFINN LEXICON ANALYSIS

In [None]:

# AFINN scorer used by calculate_sentiment.
afn = Afinn()
# Cut-offs applied by label_sentiment: scores above positive_threshold are
# labelled Positive, scores below negative_threshold are labelled Negative.
positive_threshold = 0.1
negative_threshold = -0.1


def calculate_sentiment(comment):
    """Return the mean AFINN score of the sentiment-bearing words in ``comment``.

    The comment is split on whitespace and each token is scored on its own
    with ``afn``. Tokens that score 0 are ignored, so the mean is taken only
    over words that carry sentiment.

    Returns 0 when no token has a non-zero score, and also when ``comment``
    is not a string (for example NaN from a missing value).
    """
    # Non-string values have no .split(); treat them like a comment with no scored words.
    if not isinstance(comment, str):
        return 0

    # Score each word exactly once, then keep only the non-zero scores.
    word_scores = (afn.score(word) for word in comment.split())
    scores = [value for value in word_scores if value != 0]
    if not scores:
        return 0
    return sum(scores) / len(scores)
# Mean AFINN score per comment.
df['sentiment_score'] = df['comment'].apply(calculate_sentiment)

# Store the score as text: the bare string '0' for a zero score, otherwise the
# value formatted with two decimals and thousands separators.
df['sentiment_score'] = df['sentiment_score'].apply(
    lambda value: '0' if value == 0 else f'{value:,.2f}'
)

def label_sentiment(score):
    """Label ``score`` using the module-level thresholds.

    Returns 'Positive' above ``positive_threshold``, 'Negative' below
    ``negative_threshold`` and 'Neutral' otherwise. This replaces the
    sign-based ``label_sentiment`` defined earlier in the TextBlob section.
    """
    if score > positive_threshold:
        return 'Positive'
    if score < negative_threshold:
        return 'Negative'
    return 'Neutral'


# `sentiment_score` holds formatted text at this point, so strip the thousands
# separators and parse each value back to float before applying the thresholds.
df['sentiment_label'] = df['sentiment_score'].apply(
    lambda formatted: label_sentiment(float(formatted.replace(',', '')))
)

# Write the labelled frame (no index column) as a semicolon-separated CSV.
output_file_with_labels = 'AfininLexicon.csv'
df.to_csv(output_file_with_labels, sep=';', index=False)

print("Sentiment-labeled comments saved to:", output_file_with_labels)
display(df)

## Vader

In [None]:

# VADER analyzer instance used by sentiment_scores for every comment.
sid_obj = SentimentIntensityAnalyzer()

def sentiment_scores(sentence):
    """Return the result of ``sid_obj.polarity_scores`` for ``sentence``."""
    return sid_obj.polarity_scores(sentence)

# Score every comment once, then copy each entry of the resulting dict into
# its own column (same column order as before: neg, neu, pos, compound).
df['sentiment_dict'] = df['comment'].apply(sentiment_scores)
for score_key in ('neg', 'neu', 'pos', 'compound'):
    # Bind the key as a default argument so each lambda reads its own key.
    df[score_key] = df['sentiment_dict'].apply(lambda scores, key=score_key: scores[key])

# The intermediate dict column is no longer needed once it has been expanded.
df = df.drop(columns=['sentiment_dict'])


def overall_sentiment(compound):
    """Label a compound score.

    Returns 'Positive' at or above 0.05, 'Negative' at or below -0.05, and
    'Neutral' for anything in between.
    """
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

# Derive the categorical label from the compound score.
compound_labels = df['compound'].apply(overall_sentiment)
df['sentiment'] = compound_labels

# Write the scored frame (no index column) as a semicolon-separated CSV.
df.to_csv('VaderLexicon.csv', sep=';', index=False)
display(df)

## Comparison of performance metrics

In [None]:
import pandas as pd

# Load the labelled comparison table (semicolon-separated).
# The code below reads a `personal_sentiment` reference column and one
# prediction column per lexicon from this frame.
df = pd.read_csv("lexicon_sentiments_labeld.csv", delimiter=";")

def calculate_accuracy(lexicon_column, data=None, truth_column='personal_sentiment'):
    """Return the fraction of rows where a lexicon's label equals the reference label.

    Args:
        lexicon_column: Name of the column holding the lexicon's predicted labels.
        data: DataFrame to evaluate. Defaults to the module-level ``df``, which
            is what the original one-argument call used. Passing it explicitly
            avoids depending on whichever frame ``df`` currently points to,
            since the notebook rebinds ``df`` several times.
        truth_column: Name of the column holding the reference labels.

    Returns:
        The number of matching rows divided by the total number of rows, or
        NaN when the frame has no rows. Rows with a missing value on either
        side never compare equal, so they count as incorrect.
    """
    frame = df if data is None else data
    total_predictions = len(frame)
    if total_predictions == 0:
        # No rows: accuracy is undefined, so avoid dividing zero by zero.
        return float('nan')
    correct_predictions = (frame[truth_column] == frame[lexicon_column]).sum()
    return correct_predictions / total_predictions


# Prediction columns to compare against the reference labels.
lexicon_columns = ['TextBlob_sentiment', 'Vader_sentiment', 'Afinin_Sentiment', 'NRCLex_sentiment']

# One accuracy value per lexicon column, in the order listed above.
accuracy_scores = {column: calculate_accuracy(column) for column in lexicon_columns}

print("Accuracy scores:")
for column, score in accuracy_scores.items():
    print(f"{column}: {score:.2%}")

# The lexicon with the highest accuracy; on ties, max() keeps the first one.
best_lexicon = max(accuracy_scores, key=accuracy_scores.get)
print(f"\nThe best performing lexicon is: {best_lexicon}")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One pastel colour per lexicon bar.
pastel_palette = sns.color_palette("pastel", len(lexicon_columns))

plt.figure(figsize=(10, 6))
lexicon_names = list(accuracy_scores)
accuracy_values = list(accuracy_scores.values())
bars = plt.bar(lexicon_names, accuracy_values, color=pastel_palette)
plt.title('Accuracy of Lexicon Sentiment Analysis')
plt.xlabel('Lexicon')
plt.ylabel('Accuracy')
plt.ylim(0, 1)
plt.xticks(rotation=0)

# Write each bar's value as a percentage just above the bar, centred on it.
for bar in bars:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2
    plt.text(label_x, height + 0.01, f"{height:.1%}", ha='center', va='bottom')

plt.tight_layout()
plt.show()

best_lexicon = max(accuracy_scores, key=accuracy_scores.get)


In [None]:
from sklearn.metrics import f1_score

# Define classes
classes = ['Positive', 'Negative', 'Neutral']

# Calculate F1-score for each lexicon and sentiment class.
# Each class is scored one-vs-rest: both label columns are reduced to a
# boolean "is this class" vector before calling f1_score.
f1_scores = {}
for lexicon_column in lexicon_columns:
    lexicon_predictions = df[lexicon_column]
    f1_scores[lexicon_column] = {}
    for sentiment_class in classes:
        true_labels = df['personal_sentiment'] == sentiment_class
        predicted_labels = lexicon_predictions == sentiment_class
        f1_scores[lexicon_column][sentiment_class] = f1_score(true_labels, predicted_labels)

# Print F1-scores
print("F1-scores:")
for lexicon_column, scores in f1_scores.items():
    print(f"{lexicon_column}:")
    for sentiment_class, f1 in scores.items():
        print(f"\t{sentiment_class}: {f1:.2f}")

# Optionally, calculate weighted average F1-score.
# Each class F1 is weighted by its support, i.e. the number of rows whose
# reference label is that class. A plain sum/len would be the unweighted
# macro average, not the weighted average named here.
class_support = {
    sentiment_class: int((df['personal_sentiment'] == sentiment_class).sum())
    for sentiment_class in classes
}
total_support = sum(class_support.values())

weighted_f1_scores = {}
for lexicon_column, scores in f1_scores.items():
    if total_support:
        weighted_sum = sum(
            scores[sentiment_class] * class_support[sentiment_class]
            for sentiment_class in classes
        )
        weighted_f1_scores[lexicon_column] = weighted_sum / total_support
    else:
        # No row carries any of the listed classes; report 0 instead of dividing by zero.
        weighted_f1_scores[lexicon_column] = 0.0

print("\nWeighted average F1-scores:")
for lexicon_column, f1 in weighted_f1_scores.items():
    print(f"{lexicon_column}: {f1:.2f}")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Load your dataset
# NOTE(review): the earlier comparison cells read these same columns from
# "lexicon_sentiments_labeld.csv", and "new_df.csv" is loaded with
# encoding="latin1" at the top of the notebook - confirm this is the intended
# file and encoding.
df = pd.read_csv("new_df.csv", delimiter=";")

# Define lexicon columns and true sentiment labels
lexicon_columns = ['TextBlob_sentiment', 'Vader_sentiment', 'Afinin_Sentiment', 'NRCLex_sentiment']
true_labels = df['personal_sentiment']

# Fixed class order shared by the matrix rows/columns and both heatmap axes.
class_labels = ["Positive", "Negative", "Neutral"]

# Create confusion matrix for each lexicon
for lexicon_column in lexicon_columns:
    predicted_labels = df[lexicon_column]
    cm = confusion_matrix(true_labels, predicted_labels, labels=class_labels)

    # Visualize confusion matrix (rows are true labels, columns are predictions)
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
    )
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title(f'Confusion Matrix for {lexicon_column}')
    plt.show()


# MACHINE LEARNING BAYES