# Project 

## Project Description



## Part 1: Data Retrieval & Preprocessing


In [42]:
# Import Libraries
import os
import json
import spacy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import emoji

import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer


from datasets import load_dataset
import transformers
import simpletransformers
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModel,
)
from sklearn.metrics import classification_report


### Once you have downloaded the VADER lexicon successfully, you do not need to do this again.
### You can comment it out in your personal copy, as I did below, to skip this step.

# nltk.download('vader_lexicon', quiet=False)

# Define file paths (relative paths; resolved against the working directory)
# Training data CSV: read by the preprocessing cell, which also writes its result back to this path.
sentiment_train_file = "Datasets/Sentiment_Analysis/sentiment_analysis.csv"
# Test data TSV: read by the test preprocessing cell, which also writes its result back to this path.
sentiment_test_file = "Datasets/Test Sets/sentiment-topic-test.tsv"

### Part 1.2 Sentiment Analysis Preprocessing

In [43]:
# Short label codes used for the sentiment classes in this notebook
SENTIMENT_LABELS = {
    'positive': 'POS',
    'neutral': 'NEU',
    'negative': 'NEG'
}

# Rows whose text is longer than this many characters are dropped
MAX_TEXT_CHARS = 128

# Load the dataset, standardise the column names, keep only the two columns
# used downstream, and recode the labels. Everything is addressed by column
# NAME, so the result does not depend on the column order of the CSV.
sentiment_train_df = (
    pd.read_csv(sentiment_train_file)
    .rename(columns={'sentiment': 'Sentiment', 'text': 'Text'})
    .loc[:, ['Text', 'Sentiment']]
    .assign(Sentiment=lambda frame: frame['Sentiment'].replace(SENTIMENT_LABELS))
)

# Remove rows where the 'Text' column has more than MAX_TEXT_CHARS characters.
# Rows with a missing text compare as False here and are dropped as well.
is_short_enough = sentiment_train_df['Text'].str.len() <= MAX_TEXT_CHARS
sentiment_train_df = sentiment_train_df.loc[is_short_enough]

# Save the updated training data back to the file.
# NOTE(review): this overwrites the input file with the filtered data, so the
# rows removed above cannot be recovered from it afterwards.
sentiment_train_df.to_csv(sentiment_train_file, index=False)


In [44]:
# Short label codes used for the sentiment classes in this notebook
SENTIMENT_LABELS = {
    'positive': 'POS',
    'neutral': 'NEU',
    'negative': 'NEG'
}

# Load the tab-separated sentiment test file, rename and reorder its columns,
# then recode the values of the 'Sentiment' column to the short label codes.
sentiment_test_df = (
    pd.read_csv(sentiment_test_file, sep='\t')
    .rename(columns={
        'sentence_id': 'ID',
        'topic': 'Relevant Topic',
        'sentiment': 'Sentiment',
        'sentence': 'Text'
    })
    .loc[:, ['Text', 'Sentiment', 'Relevant Topic', 'ID']]
    .assign(Sentiment=lambda frame: frame['Sentiment'].replace(SENTIMENT_LABELS))
)

# Write the reorganised test data back to the same tab-separated file
sentiment_test_df.to_csv(sentiment_test_file, sep='\t', index=False)


## Part 2: Processing & Modeling

### 2.1: Sentiment Analysis

#### 2.1.1: Training and Testing using VADER

In [51]:

# spaCy pipeline ('en_core_web_sm'); run_vader() calls it on each text to get
# sentences, tokens, lemmas and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')
# VADER analyzer; run_vader() calls its polarity_scores() on the prepared text.
sia = SentimentIntensityAnalyzer()

def get_sentiment(vader_output, threshold=0.05):
    """
    Turn a VADER score dict into one of the labels 'POS', 'NEG' or 'NEU'.

    :param dict vader_output: VADER output; only its 'compound' entry is read
    :param float threshold: a compound score of at least ``threshold`` is
    labelled 'POS', a score of at most ``-threshold`` is labelled 'NEG', and
    anything in between is labelled 'NEU' (default 0.05)

    :rtype: str
    :return: 'POS', 'NEG' or 'NEU'
    :raises KeyError: if ``vader_output`` has no 'compound' entry
    """
    compound_score = vader_output['compound']

    if compound_score >= threshold:
        return 'POS'
    if compound_score <= -threshold:
        return 'NEG'
    return 'NEU'

def run_vader(textual_unit,
              lemmatize=False,
              parts_of_speech_to_consider=None,
              verbose=0):
    """
    Run VADER on a textual unit after processing it with spaCy.

    The text is parsed with the module-level ``nlp`` pipeline, its tokens are
    collected sentence by sentence (by looping over doc.sents), joined with
    single spaces and scored with the module-level ``sia`` analyzer.

    :param str textual_unit: a textual unit, e.g., sentence, sentences (one string)
    :param bool lemmatize: If True, provide lemmas to VADER instead of words
    :param set parts_of_speech_to_consider:
    -None or empty set: all parts of speech are provided
    -non-empty set: only these parts of speech are considered.
    :param int verbose: if 1 or higher, information is printed
    about input and output

    :rtype: dict
    :return: vader output dict
    """
    doc = nlp(textual_unit)

    input_to_vader = []

    for sent in doc.sents:
        for token in sent:

            # Optional part-of-speech filter; None or an empty set keeps every token.
            if parts_of_speech_to_consider and token.pos_ not in parts_of_speech_to_consider:
                continue

            to_add = token.text

            if lemmatize:
                to_add = token.lemma_

                # Fall back to the surface form when the lemma is the
                # '-PRON-' marker (presumably a pronoun placeholder emitted
                # by some spaCy versions; TODO confirm).
                if to_add == '-PRON-':
                    to_add = token.text

            input_to_vader.append(to_add)

    scores = sia.polarity_scores(' '.join(input_to_vader))

    if verbose >= 1:
        print()
        # Print the full input rather than the sentence loop variable: that
        # variable only holds the last sentence and is undefined when the
        # document yields no sentences.
        print('INPUT SENTENCE', textual_unit)
        print('INPUT TO VADER', input_to_vader)
        print('VADER OUTPUT', scores)

    return scores


# Settings for this VADER run
to_lemmatize = False
pos = set()  # empty set: no part-of-speech filtering, every token is passed to VADER

# Gold labels, in row order
true_vader_sentiments = sentiment_train_df['Sentiment'].tolist()

# VADER label for every text, in the same row order as the gold labels.
# `pos` is now actually handed to run_vader, so changing it above takes effect.
predicted_vader_sentiments = [
    get_sentiment(
        run_vader(
            tweet_text,
            lemmatize=to_lemmatize,
            parts_of_speech_to_consider=pos,
            verbose=0,
        )
    )
    for tweet_text in sentiment_train_df['Text']
]


#### 2.1.2: Testing using Hugging Face Model

In [50]:
# Initialize the pre-trained Hugging Face sentiment analysis pipeline
# (the same checkpoint is used for both the model and the tokenizer)
sentiment_model_name = "finiteautomata/bertweet-base-sentiment-analysis"
sentiment_pipeline = pipeline("sentiment-analysis",
                              model=sentiment_model_name,
                              tokenizer=sentiment_model_name)

# NOTE(review): the classification report saved in this notebook for this
# model shows support=18, while the VADER report shows support=467. Confirm
# that this cell is scoring the intended DataFrame before comparing the two.

# True sentiments, in row order
true_model_sentiments = sentiment_train_df['Sentiment'].tolist()

# Score all texts with a single pipeline call instead of one call per row;
# the pipeline returns one result dict per input text, in the same order.
model_outputs = sentiment_pipeline(sentiment_train_df['Text'].tolist())
predicted_model_sentiments = [output['label'] for output in model_outputs]


## Part 3: Analysis of Results

### 3.1: Sentiment Analysis

In [52]:
# Per-class precision / recall / F1 of the VADER labels against the true labels
VADER_Classification_Report = classification_report(
    y_true=true_vader_sentiments,
    y_pred=predicted_vader_sentiments,
)
print(
    "VADER Model Classification Report",
    "========================================",
    VADER_Classification_Report,
    sep="\n",
)

VADER Model Classification Report
              precision    recall  f1-score   support

         NEG       0.69      0.57      0.62       122
         NEU       0.75      0.54      0.63       187
         POS       0.61      0.89      0.72       158

    accuracy                           0.67       467
   macro avg       0.68      0.67      0.66       467
weighted avg       0.68      0.67      0.66       467



In [48]:
# Per-class precision / recall / F1 of the Hugging Face model labels against the true labels
Huggingface_Classification_Report = classification_report(
    y_true=true_model_sentiments,
    y_pred=predicted_model_sentiments,
)
print(
    "Huggingface Model Classification Report",
    "========================================",
    Huggingface_Classification_Report,
    sep="\n",
)

Huggingface Model Classification Report
              precision    recall  f1-score   support

         NEG       0.83      0.83      0.83         6
         NEU       0.80      0.67      0.73         6
         POS       0.86      1.00      0.92         6

    accuracy                           0.83        18
   macro avg       0.83      0.83      0.83        18
weighted avg       0.83      0.83      0.83        18

