# 1. Initial Setup and Data Loading

In [55]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import re
import nltk

# Load the Twitter dataset

In [58]:
data = pd.read_csv("/Users/aadityashewale/Downloads/twitter_data.csv")


# Text Preprocessing Setup

In [61]:

nltk.download('stopwords') 
stemmer = nltk.SnowballStemmer("english")  
from nltk.corpus import stopwords
import string
stopword = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aadityashewale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Text Cleaning Function

In [64]:
def clean(text):
    # Convert to lowercase
    text = str(text).lower()
    
    # Remove square brackets and their contents
    text = re.sub('\\[.*?\\]', '', text)
    
    # Remove URLs
    text = re.sub('https?://\\S+|www\\.\\S+', '', text)
    
    # Remove HTML tags
    text = re.sub('<.*?>+', '', text)
    
    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    # Remove newlines
    text = re.sub('\\n', '', text)
    
    # Remove words containing numbers
    text = re.sub('\\w*\\d\\w*', '', text)
    
    # Remove stopwords
    text = [word for word in text.split(' ') if word not in stopword]
    text = " ".join(text)
    
    # Apply stemming
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    
    return text


# Apply cleaning function to tweets

In [67]:
data["tweet"] = data["tweet"].apply(clean)

# Sentiment Analysis

In [69]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')  # Download VADER lexicon
sentiments = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/aadityashewale/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Calculate sentiment scores for each tweet

In [71]:
data["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data["tweet"]]
data["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data["tweet"]]
data["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data["tweet"]]


# Sentiment Scoring Function

In [73]:
def sentiment_score(a, b, c):
    """
    Determine overall sentiment based on cumulative scores
    a: positive score
    b: negative score
    c: neutral score
    """
    if (a > b) and (a > c):
        print("Positive ðŸ˜Š ")
    elif (b > a) and (b > c):
        print("Negative ðŸ˜  ")
    else:
        print("Neutral ðŸ™‚ ")

# Calculate total sentiment scores

In [75]:
x = sum(data["Positive"])  # Total positive score
y = sum(data["Negative"])  # Total negative score
z = sum(data["Neutral"])   # Total neutral score

# Print final results

In [77]:
print("Positive: ", x)
print("Negative: ", y)
print("Neutral: ", z)

Positive:  2880.086
Negative:  7201.021
Neutral:  14696.888
