In [1]:
# Import necessary libraries
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [19]:
import re  # Import the regex module for text cleaning

In [2]:
# Download the VADER lexicon via NLTK's downloader.
# The package is stored locally (the cell output shows the nltk_data
# directory it is written to), so this only needs to succeed once.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/Shana/nltk_data...


True

In [3]:
# Define the TextBlob sentiment function
def textblob_sentiment(text, pos_threshold=0.1, neg_threshold=-0.1):
    """
    Classify the sentiment of a piece of text using TextBlob's polarity score.

    Parameters
    ----------
    text : str
        The text to analyze. Any non-string value (for example ``None`` or
        the ``NaN`` pandas produces for an empty CSV cell) is classified as
        "neutral" instead of being passed to TextBlob.
    pos_threshold : float, default 0.1
        Polarity strictly above this value is labeled "positive".
    neg_threshold : float, default -0.1
        Polarity strictly below this value is labeled "negative".

    Returns
    -------
    str
        One of "positive", "negative", or "neutral".
    """
    # Guard against missing values: TextBlob only accepts strings, so a
    # NaN/None row would otherwise abort a whole DataFrame .apply() run.
    if not isinstance(text, str):
        return "neutral"

    polarity = TextBlob(text).sentiment.polarity

    # Strict comparisons: a polarity exactly on a threshold stays neutral.
    if polarity > pos_threshold:
        return "positive"
    if polarity < neg_threshold:
        return "negative"
    return "neutral"

# Initialize VADER's SentimentIntensityAnalyzer once at module level;
# vader_sentiment() below reuses this shared instance on every call.
# NOTE(review): presumably relies on the 'vader_lexicon' resource downloaded
# in the earlier cell being available — confirm on a fresh environment.
sia = SentimentIntensityAnalyzer()

# Define the VADER sentiment function
def vader_sentiment(text, pos_threshold=0.05, neg_threshold=-0.05):
    """
    Classify the sentiment of a piece of text using VADER's compound score.

    Uses the module-level ``sia`` analyzer instance.

    Parameters
    ----------
    text : str
        The text to analyze. Any non-string value (for example ``None`` or
        the ``NaN`` pandas produces for an empty CSV cell) is classified as
        "neutral" instead of being passed to the analyzer.
    pos_threshold : float, default 0.05
        Compound score at or above this value is labeled "positive".
    neg_threshold : float, default -0.05
        Compound score at or below this value is labeled "negative".

    Returns
    -------
    str
        One of "positive", "negative", or "neutral".
    """
    # Guard against missing values so one NaN/None row cannot abort a whole
    # DataFrame .apply() run.
    if not isinstance(text, str):
        return "neutral"

    compound = sia.polarity_scores(text)['compound']

    # Inclusive comparisons: a score exactly on a threshold is NOT neutral.
    if compound >= pos_threshold:
        return "positive"
    if compound <= neg_threshold:
        return "negative"
    return "neutral"


In [14]:
# --------------------------------------------------------------
# 1. Load Training and Test Data
# --------------------------------------------------------------

# Training set: read from ../data/training.csv, then show the first rows.
training_data = pd.read_csv('../data/training.csv')
print("Training Data Preview:", training_data.head(), sep="\n")

# Test set: read from ../data/test.csv, then show the first rows.
test_data = pd.read_csv('../data/test.csv')
print("\nTest Data Preview:", test_data.head(), sep="\n")

Training Data Preview:
   id                                           sentence     label
0   0  Those 2 drinks are part of the HK culture and ...  negative
1   1  I was told by the repair company that was doin...  negative
2   2             It is there to give them a good time .   neutral
3   3  Like leafing through an album of photos accomp...  negative
4   4         Johnny was a talker and liked to have fun.  positive

Test Data Preview:
   id                                           sentence
0   0  Found Thai Spoon on the Vegan Pittsburgh website.
1   1  Our bill came out to around $27 and we ate lik...
2   2  State Farm broke down the costs for me of the ...
3   3  The only con for this resto is the wait to get...
4   4  We could hear the people above us stomping aro...


In [22]:
# --------------------------------------------------------------
# 2. Apply Sentiment Analysis on the Training Data
# --------------------------------------------------------------
# The text lives in the "sentence" column; each classifier is mapped over it
# element-wise and its labels are stored in a dedicated column.
training_data['textblob_sentiment'] = training_data['sentence'].map(textblob_sentiment)
training_data['vader_sentiment'] = training_data['sentence'].map(vader_sentiment)

print("\nTraining Data with Sentiment Analysis:")
preview_columns = ['sentence', 'textblob_sentiment', 'vader_sentiment']
print(training_data[preview_columns].head())


Training Data with Sentiment Analysis:
                                            sentence textblob_sentiment  \
0  those 2 drinks are part of the hk culture and ...           negative   
1  i was told by the repair company that was doin...           negative   
2             it is there to give them a good time .           positive   
3  like leafing through an album of photos accomp...            neutral   
4         johnny was a talker and liked to have fun.           positive   

  vader_sentiment  
0        negative  
1         neutral  
2        positive  
3        positive  
4        positive  


In [20]:
# Normalize every training sentence with clean_text, overwriting the column.
# NOTE(review): clean_text is defined in a later cell of this notebook, so on
# a fresh kernel that cell must run before this one.
# NOTE(review): lowercasing may weaken capitalization cues that VADER can use
# for intensity — confirm this is intended.
training_data['sentence'] = training_data['sentence'].map(clean_text)

In [23]:
# ---------------------------------------------------------------------
# 3. Calculate the Error (Accuracy) on the Training Data
# ---------------------------------------------------------------------
# A prediction counts as correct when it equals the ground-truth 'label'.
total_training = len(training_data)

textblob_correct = training_data['textblob_sentiment'].eq(training_data['label']).sum()
vader_correct = training_data['vader_sentiment'].eq(training_data['label']).sum()

# Accuracy = fraction of rows whose predicted label matches the true label.
textblob_accuracy = textblob_correct / total_training
vader_accuracy = vader_correct / total_training

print(f"\nTextBlob: {textblob_correct} out of {total_training} predictions are correct, accuracy: {textblob_accuracy:.2f}")
print(f"VADER: {vader_correct} out of {total_training} predictions are correct, accuracy: {vader_accuracy:.2f}")


TextBlob: 56896 out of 102097 predictions are correct, accuracy: 0.56
VADER: 59841 out of 102097 predictions are correct, accuracy: 0.59


In [9]:
# ------------------------------------------------------------------------------
# 4. Apply Sentiment Analysis on the Test Data and Create Submissions
# ------------------------------------------------------------------------------
# Run both classifiers over the test "sentence" column, one prediction column each.
test_data['textblob_pred'] = test_data['sentence'].map(textblob_sentiment)
test_data['vader_pred'] = test_data['sentence'].map(vader_sentiment)

print("\nTest Data with Both Predicted Labels:")
prediction_columns = ['id', 'textblob_pred', 'vader_pred']
print(test_data[prediction_columns].head())


Test Data with Both Predicted Labels:
   id textblob_pred vader_pred
0   0       neutral    neutral
1   1      positive   positive
2   2      negative   negative
3   3       neutral    neutral
4   4       neutral    neutral


In [10]:
# ------------------------------------------------------------------------------
# 5. Create and Save the Submission Files
# ------------------------------------------------------------------------------
# Each submission is a two-column frame: the test 'id' plus the predicted 'label'.

# TextBlob submission
submission_textblob = test_data[['id']].assign(label=test_data['textblob_pred'])
submission_textblob.to_csv("submission_textblob.csv", index=False)
print("\nSubmission file 'submission_textblob.csv' has been created:")
print(submission_textblob.head())

# VADER submission
submission_vader = test_data[['id']].assign(label=test_data['vader_pred'])
submission_vader.to_csv("submission_vader.csv", index=False)
print("\nSubmission file 'submission_vader.csv' has been created:")
print(submission_vader.head())


Submission file 'submission_textblob.csv' has been created:
   id     label
0   0   neutral
1   1  positive
2   2  negative
3   3   neutral
4   4   neutral

Submission file 'submission_vader.csv' has been created:
   id     label
0   0   neutral
1   1  positive
2   2  negative
3   3   neutral
4   4   neutral


In [17]:
def clean_text(text):
    """Return `text` lowercased, with every whitespace run collapsed to one space and the ends trimmed."""
    lowered = text.lower()
    # Any run of whitespace (spaces, tabs, newlines) becomes a single space.
    collapsed = re.sub(r'\s+', ' ', lowered)
    return collapsed.strip()