In [6]:
import html
import re

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [46]:
def clean_tweet(tweet):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: decode HTML entities, drop URLs, @mentions and #hashtags,
    remove every remaining character that is not an ASCII letter or whitespace,
    collapse whitespace runs to a single space, then lowercase and trim.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned tweet; may be the empty string.
    """
    if not isinstance(tweet, str):
        return ''
    # Decode entities first: otherwise '&amp;' / '&quot;' lose their punctuation in the
    # letters-only step and survive as the junk tokens 'amp' / 'quot'.
    tweet = html.unescape(tweet)
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)  # Remove @ mentions
    tweet = re.sub(r'#[A-Za-z0-9_]+', '', tweet)  # Remove hashtags
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)  # Remove non-alphabetic characters
    # Removing tokens leaves gaps (and tweets may contain newlines/tabs): squeeze them.
    tweet = re.sub(r'\s+', ' ', tweet)
    return tweet.lower().strip()  # Lowercase and remove leading/trailing spaces

def clean_reddit_comment(comment):
    """Normalise a Reddit comment into lowercase, letters-only text.

    Steps, in order: drop URLs, u/ and r/ mentions and #hashtags, remove every
    remaining character that is not an ASCII letter or whitespace, collapse
    whitespace runs to a single space, then lowercase and trim.

    Parameters
    ----------
    comment : str
        Raw comment text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned comment; may be the empty string.
    """
    if not isinstance(comment, str):
        return ''
    comment = re.sub(r'http\S+', '', comment)  # Remove URLs
    # The word boundary keeps ordinary words intact: without it 'you/them' matched
    # 'u/them' and was cut down to 'yo'. '/u/name' still matches ('/' is a boundary).
    comment = re.sub(r'\b[ur]/[A-Za-z0-9_]+', '', comment)  # Remove Reddit mentions
    comment = re.sub(r'#[A-Za-z0-9_]+', '', comment)  # Remove hashtags
    comment = re.sub(r'[^A-Za-z\s]', '', comment)  # Remove non-alphabetic characters
    # Removing tokens leaves gaps (and comments may contain newlines/tabs): squeeze them.
    comment = re.sub(r'\s+', ' ', comment)
    return comment.lower().strip()  # Lowercase and remove leading/trailing spaces

def clean_instagram_comment(comment):
    """Normalise an Instagram caption/comment into lowercase, letters-only text.

    Steps, in order: drop URLs, @mentions and #hashtags, remove every remaining
    character that is not an ASCII letter or whitespace, collapse whitespace
    runs (including newlines) to a single space, then lowercase and trim.

    TODO: emojis are currently removed by the letters-only step; this is
    planned to change later.

    Parameters
    ----------
    comment : str
        Raw text. Any non-string value (for example the NaN pandas uses for a
        missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text; may be the empty string.
    """
    if not isinstance(comment, str):
        return ''
    comment = re.sub(r'http\S+', '', comment)  # Remove URLs
    # Instagram handles may contain periods between name characters; matching them
    # whole stops '@jane.doe' from leaving 'doe' behind. A trailing sentence period
    # is not part of the match.
    comment = re.sub(r'@[A-Za-z0-9_]+(?:\.[A-Za-z0-9_]+)*', '', comment)  # Remove @ mentions
    comment = re.sub(r'#[A-Za-z0-9_]+', '', comment)  # Remove hashtags
    comment = re.sub(r'[^A-Za-z\s]', '', comment)  # Remove non-alphabetic characters (emojis included)
    # Newlines, tabs and the gaps left by removed tokens all become one space.
    comment = re.sub(r'\s+', ' ', comment)
    return comment.lower().strip()  # Lowercase and remove leading/trailing spaces

# Sentiment140 dataset

Source: https://www.kaggle.com/datasets/kazanova/sentiment140

The dataset was created automatically by labelling any tweet containing a positive emoji/emoticon as positive and any tweet containing a negative emoji/emoticon as negative.  
It contains 1.6 million tweets, each labelled with a positive or negative sentiment: positive = 4 and negative = 0.

In [8]:
# The CSV is read with header=None, so the first row is treated as data and the six
# column names are supplied here.
# NOTE(review): the upstream Kaggle file is reportedly Latin-1 encoded; if this copy was
# not re-saved as UTF-8, read_csv will need encoding='latin-1' — confirm.
sentiment_140_data = pd.read_csv(
    'Datasets/sentiment140.csv',
    names=['polarity', 'id', 'date', 'query', 'user', 'text'],
    header=None,
)
print(f"Shape of sentiment140 dataset: {sentiment_140_data.shape}")
sentiment_140_data.head()


Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [9]:
# Keep only the label and the tweet body, recode the label to binary
# (0 = negative stays 0, 4 = positive becomes 1) and tag the rows with their source.
sentiment_140_data = (
    sentiment_140_data
    .drop(columns=['id', 'date', 'query', 'user'])
    .assign(
        polarity=lambda frame: frame['polarity'].map({0: 0, 4: 1}),
        source='Twitter',
    )
)
sentiment_140_data.head()

Unnamed: 0,polarity,text,source
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",Twitter
1,0,is upset that he can't update his Facebook by ...,Twitter
2,0,@Kenichan I dived many times for the ball. Man...,Twitter
3,0,my whole body feels itchy and like its on fire,Twitter
4,0,"@nationwideclass no, it's not behaving at all....",Twitter


In [10]:
# Run every tweet through clean_tweet, then put the text column first.
sentiment_140_data = sentiment_140_data.assign(
    text=lambda frame: frame['text'].apply(clean_tweet)
)[['text', 'polarity', 'source']]
sentiment_140_data.head()

Unnamed: 0,text,polarity,source
0,awww thats a bummer you shoulda got david car...,0,Twitter
1,is upset that he cant update his facebook by t...,0,Twitter
2,i dived many times for the ball managed to sav...,0,Twitter
3,my whole body feels itchy and like its on fire,0,Twitter
4,no its not behaving at all im mad why am i her...,0,Twitter


# Reddit Dataset

Source: https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset

Reddit dataset consisting of comments about the general elections held in India in 2019.  
The dataset is described as containing 36801 entries (the file loaded below has 37249 rows), each labelled with a sentiment: positive = 1, negative = -1 and neutral = 0.

In [22]:
# Load the Reddit comments and rename the columns to the shared text/polarity schema.
reddit_data = (
    pd.read_csv('Datasets/reddit.csv')
    .rename(columns={'clean_comment': 'text', 'category': 'polarity'})
)
print(f"Shape of reddit dataset: {reddit_data.shape}")
reddit_data.head()

Shape of reddit dataset: (37249, 2)


Unnamed: 0,text,polarity
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [23]:
# Binary task: discard the neutral (polarity == 0) comments.
# The explicit .copy() makes the result an independent frame, so the column assignments
# performed on reddit_data in the following cells do not trigger pandas'
# SettingWithCopyWarning.
reddit_data = reddit_data[reddit_data['polarity'] != 0].copy()
print(f"Shape of reddit dataset after removing neutral entries: {reddit_data.shape}")
reddit_data.head()

Shape of reddit dataset after removing neutral entries: (24107, 2)


Unnamed: 0,text,polarity
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
4,for your own benefit you may want read living ...,1
5,you should all sit down together and watch the...,-1


In [24]:
# Tag the rows with their source, then recode the labels to binary:
# positive (1) stays 1 and negative (-1) becomes 0.
reddit_data['source'] = 'Reddit'
reddit_data['polarity'] = reddit_data['polarity'].map({1: 1, -1: 0})
reddit_data.head()

Unnamed: 0,text,polarity,source
0,family mormon have never tried explain them t...,1,Reddit
1,buddhism has very much lot compatible with chr...,1,Reddit
2,seriously don say thing first all they won get...,0,Reddit
4,for your own benefit you may want read living ...,1,Reddit
5,you should all sit down together and watch the...,0,Reddit


In [25]:
# The dataset author states the comments were cleaned beforehand; the cleaner is applied
# anyway so the Reddit text is processed consistently.
reddit_data['text'] = reddit_data['text'].map(clean_reddit_comment)
reddit_data.head()

Unnamed: 0,text,polarity,source
0,family mormon have never tried explain them th...,1,Reddit
1,buddhism has very much lot compatible with chr...,1,Reddit
2,seriously don say thing first all they won get...,0,Reddit
4,for your own benefit you may want read living ...,1,Reddit
5,you should all sit down together and watch the...,0,Reddit


# Instagram dataset

Source: https://zenodo.org/records/13896353

Instagram dataset consisting of comments related to the COVID-19 outbreak from Jan 2020 to Sep 2024. The sentiment labels were generated with the help of VADER and twitter-xlm-roberta-base-sentiment, classifying each entry as positive, negative or neutral.

In [65]:
# Load the Instagram workbook (default sheet) and report its size.
instagram_data = pd.read_excel(io='Datasets/instagram.xlsx')
print(f"Shape of Instagram dataset: {instagram_data.shape}")
instagram_data.head()

Shape of Instagram dataset: (500153, 6)


Unnamed: 0,PostID,Post Description,Date,Language Code,Full Language,Sentiment
0,B7mbLCVhYIf,QUESTIONS AND ANSWERS ON CORONAVIRUS PT. 2\n\n...,01/21/2020,en,English,positive
1,B7o2ozppOnS,#Viral China no deja salir a #11millones de ci...,01/22/2020,es,Spanish,negative
2,B7nMy1Xl7st,Önce tedbir sonra tevekkül inşallah. Wuhanda o...,01/22/2020,tr,Turkish,negative
3,B7m7M3SgvI1,Using humor to bring attention to a serious ma...,01/22/2020,en,English,negative
4,B7oK_DMhtxr,Stay safe\nCover your face\n#typ262 #40mm #40m...,01/22/2020,en,English,positive


In [67]:
# Keep English, non-neutral rows only, then discard the identifier/date/language columns.
instagram_data = (
    instagram_data
    .loc[lambda frame: frame['Full Language'] == 'English']
    .loc[lambda frame: frame['Sentiment'] != 'neutral']
    .drop(columns=['PostID', 'Date', 'Language Code', 'Full Language'])
)
print(f"Shape of Instagram dataset after adjustments: {instagram_data.shape}")
instagram_data.head()

Shape of Instagram dataset after adjustments: (221346, 3)


Unnamed: 0,Post Description,Sentiment,has_emoji
0,QUESTIONS AND ANSWERS ON CORONAVIRUS PT. 2\n\n...,positive,False
3,Using humor to bring attention to a serious ma...,negative,True
4,Stay safe\nCover your face\n#typ262 #40mm #40m...,positive,False
5,Did you know an outbreak of a new coronavirus ...,negative,False
7,It’s so sad and scary that the coronavirus has...,negative,True


In [68]:
# Align with the shared schema: rename the columns, recode the labels to binary
# (negative -> 0, positive -> 1) and tag the rows with their source.
instagram_data = (
    instagram_data
    .rename(columns={'Sentiment': 'polarity', 'Post Description': 'text'})
    .assign(
        polarity=lambda frame: frame['polarity'].map({'negative': 0, 'positive': 1}),
        source='Instagram',
    )
)
instagram_data.head()

Unnamed: 0,text,polarity,has_emoji,source
0,QUESTIONS AND ANSWERS ON CORONAVIRUS PT. 2\n\n...,1,False,Instagram
3,Using humor to bring attention to a serious ma...,0,True,Instagram
4,Stay safe\nCover your face\n#typ262 #40mm #40m...,1,False,Instagram
5,Did you know an outbreak of a new coronavirus ...,0,False,Instagram
7,It’s so sad and scary that the coronavirus has...,0,True,Instagram


In [69]:
# Run every Instagram text through its platform-specific cleaner.
instagram_data['text'] = instagram_data['text'].map(clean_instagram_comment)
instagram_data.head()

Unnamed: 0,text,polarity,has_emoji,source
0,questions and answers on coronavirus pt is t...,1,False,Instagram
3,using humor to bring attention to a serious ma...,0,True,Instagram
4,stay safe cover your face,1,False,Instagram
5,did you know an outbreak of a new coronavirus ...,0,False,Instagram
7,its so sad and scary that the coronavirus has ...,0,True,Instagram


In [48]:
# Stack the three sources into one frame with a fresh 0..n-1 index.
# Each frame is restricted to the shared text/polarity/source columns first: concat
# aligns on column names, so any extra helper column present on one frame (e.g. a
# has_emoji flag on instagram_data) would otherwise leak into the result as a
# mostly-NaN column.
data_combined = pd.concat(
    [
        frame[['text', 'polarity', 'source']]
        for frame in (sentiment_140_data, reddit_data, instagram_data)
    ],
    ignore_index=True,
)
print(f"Shape of combined dataset: {data_combined.shape}")
data_combined

Shape of combined dataset: (1845453, 3)


Unnamed: 0,text,polarity,source
0,awww thats a bummer you shoulda got david car...,0,Twitter
1,is upset that he cant update his facebook by t...,0,Twitter
2,i dived many times for the ball managed to sav...,0,Twitter
3,my whole body feels itchy and like its on fire,0,Twitter
4,no its not behaving at all im mad why am i her...,0,Twitter
...,...,...,...
1845448,joanna loeber and toms yan for representing ou...,1,Instagram
1845449,joanna loeber and toms yan for representing ou...,1,Instagram
1845450,joanna loeber and toms yan representing our te...,1,Instagram
1845451,hang in there i know that its hard to even i...,1,Instagram


In [52]:
# Persist the combined dataset as UTF-8 CSV without the index column.
# NOTE: rows whose cleaned text is an empty string are written as empty fields, which
# pandas.read_csv loads back as NaN by default.
data_combined.to_csv(
    'Datasets/combined_dataset.csv',
    index=False,
    encoding='utf-8',
)

# Data Analysis

In [55]:
def contains_emoji(text):
    """Return True if at least one character of ``text`` is a key of ``emoji.EMOJI_DATA``.

    The check walks ``text`` one character at a time and stops at the first hit;
    an empty ``text`` yields False.

    NOTE(review): because membership is tested per single character, emoji that
    exist only as multi-code-point sequences may not be detected — confirm
    against the emoji package's data before relying on this for such cases.
    """
    for char in text:
        if char in emoji.EMOJI_DATA:
            return True
    return False