# Project 

## Project Description



## Part 1: Data Retrieval & Preprocessing


In [18]:
# Import Libraries
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# import emoji

import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer


from datasets import load_dataset
import transformers
import simpletransformers
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModel,
)
from sklearn.metrics import classification_report


### once you downloaded vader successfully you do not need to do this again.
### You can command it out in your personal copy as I did below to skip this.

# nltk.download('vader_lexicon', quiet=False)

# Define file paths
sentiment_train_file = "Datasets/Sentiment_Analysis/sentiment_analysis.csv"         
sentiment_test_file = "Datasets/Test Sets/sentiment-topic-test.tsv"

### Part 1.2 Sentiment Analysis Preprocessing

In [19]:
# Load the dataset
sentiment_train_df = pd.read_csv(sentiment_train_file)

# Rename the second column entries
sentiment_train_df.iloc[:, 1] = sentiment_train_df.iloc[:, 1].replace({
    'positive': 'POS',
    'neutral': 'NEU',
    'negative': 'NEG'
})
# Remove rows where the 'text' column has more than 128 characters
sentiment_train_df = sentiment_train_df[sentiment_train_df.iloc[:, 0].str.len() <= 128]

# Save the updated training data back to the file
sentiment_train_df.to_csv(sentiment_train_file, index=False)


(467, 2)

In [142]:
# Load the sentiment test file into a DataFrame
sentiment_test_df = pd.read_csv(sentiment_test_file, sep='\t')



# Rename and reorganize the columns
sentiment_test_df = sentiment_test_df.rename(columns={
    'sentence_id': 'ID',
    'topic': 'Relevant Topic',
    'sentiment': 'Sentiment',
    'sentence': 'Text'
})[['Text', 'Sentiment', 'Relevant Topic', 'ID']]

# Rename the third column entries
sentiment_test_df.iloc[:, 1] = sentiment_test_df.iloc[:, 1].replace({
    'positive': 'POS',
    'neutral': 'NEU',
    'negative': 'NEG'
})


## Part 2 Processing & Modeling

### Part 2.1: Sentiment Analysis using transformers

In [121]:
# Load the model
sentiment_analysis = pipeline("sentiment-analysis", 
                            model="finiteautomata/bertweet-base-sentiment-analysis", 
                            tokenizer="finiteautomata/bertweet-base-sentiment-analysis")

Device set to use cpu


In [None]:
# Initialize lists to store predicted sentiments and true sentiments
predicted_sentiments = []
true_sentiments = []

# Iterate over each row in the training set
for _, row in sentiment_test_df.iterrows():
    # Perform sentiment analysis on the text
    result = sentiment_analysis(row['Text'])[0]
    predicted_sentiments.append(result['label'])
    
    # Save the true sentiment
    true_sentiments.append(row['Sentiment'])

In [123]:
# Generate the classification report
Sentiment_Analysis_Report = classification_report(true_sentiments, predicted_sentiments)
print(Sentiment_Analysis_Report)

              precision    recall  f1-score   support

         NEG       0.83      0.83      0.83         6
         NEU       0.80      0.67      0.73         6
         POS       0.86      1.00      0.92         6

    accuracy                           0.83        18
   macro avg       0.83      0.83      0.83        18
weighted avg       0.83      0.83      0.83        18



### Part 1.3 Training and testing using VADER

In [None]:
sentiment_train_df.columns = sentiment_train_df.columns.str.strip()
print(sentiment_train_df.columns)

sia = SentimentIntensityAnalyzer()

def get_sentiment(text):
    scores = sia.polarity_scores(str(text))  
    compound_score = scores['compound']

    if compound_score >= 0.05:
        return 'POS'
    elif compound_score <= -0.05:
        return 'NEG'
    else:
        return 'NEU'

sentiment_train_df['predicted_sentiment'] = sentiment_train_df['Text'].apply(get_sentiment)

# sentiment_train_df.to_csv("Datasets/Sentiment_Analysis/vader_predicted.csv", index=False)

for sent in sentiment_train_df['Text']:
    scores = sia.polarity_scores(sent)  
    print()  
    print('INPUT SENTENCE:', sent)  
    print('VADER OUTPUT:', scores)  

Index(['ID', 'Relevant Topic', 'Sentiment', 'Text'], dtype='object')

INPUT SENTENCE: im getting on borderlands and i will murder you all ,
VADER OUTPUT: {'neg': 0.37, 'neu': 0.63, 'pos': 0.0, 'compound': -0.6908}

INPUT SENTENCE: So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg
VADER OUTPUT: {'neg': 0.042, 'neu': 0.627, 'pos': 0.33, 'compound': 0.9431}

INPUT SENTENCE: Rock-Hard La Varlope, RARE & POWERFUL, HANDSOME JACKPOT, Borderlands 3 (Xbox) dlvr.it/RMTrgF  
VADER OUTPUT: {'neg': 0.0, 'neu': 0.517, 'pos': 0.483, 'compound': 0.8159}

INPUT SENTENCE: that was the first borderlands session in a long time where i actually had a really satisfying combat experience. i got some really good kills
VADER OUTPUT: {'neg': 0.216, 'neu': 0.568, 'pos': 0.217, 'c

### Part 1.4 Testing using Huggingface

In [25]:
# Load the training and test data
sentiment_train_df = pd.read_csv(sentiment_train_file)
sentiment_test_df = pd.read_csv(sentiment_test_file, sep='\t')

# Strip any leading/trailing spaces from column names
sentiment_train_df.columns = sentiment_train_df.columns.str.strip()
sentiment_test_df.columns = sentiment_test_df.columns.str.strip()

# Check the column names to ensure the 'Text' column exists
print(sentiment_train_df.columns)
print(sentiment_test_df.columns)

# Initialize the pre-trained Hugging Face sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", 
                              model="finiteautomata/bertweet-base-sentiment-analysis", 
                              tokenizer="finiteautomata/bertweet-base-sentiment-analysis")

# Apply sentiment analysis to the training dataset
sentiment_train_df['predicted_sentiment'] = sentiment_train_df['Text'].apply(
    lambda x: sentiment_pipeline(x)[0]['label']
)

# Apply sentiment analysis to the test dataset
sentiment_test_df['predicted_sentiment'] = sentiment_test_df['Text'].apply(
    lambda x: sentiment_pipeline(x)[0]['label']
)

# Save the results with predictions
# sentiment_train_df.to_csv("sentiment_train_predictions.csv", index=False)
# sentiment_test_df.to_csv("sentiment_test_predictions.csv", index=False)

# Print some example predictions from the training data
print(sentiment_train_df[['Text', 'predicted_sentiment']].head())

# Print some example predictions from the test data
print(sentiment_test_df[['Text', 'predicted_sentiment']].head())

Index(['ID', 'Relevant Topic', 'Sentiment', 'Text'], dtype='object')
Index(['sentence_id', 'sentence', 'sentiment', 'topic'], dtype='object')


Device set to use cpu


KeyboardInterrupt: 