# Sentiment140 dataset

Dataset of 1.6 million tweets, each labelled with a positive or negative sentiment. The raw file encodes negative = 0 and positive = 4; this notebook remaps the labels to negative = 0 and positive = 1.

In [25]:
# Imports

import html
import re

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [26]:
# The CSV is read with header=None, so the column names are supplied explicitly.
column_names = ['polarity', 'id', 'date', 'query', 'user', 'text']
data = pd.read_csv(
    'Datasets/sentiment140.csv',
    header=None,
    names=column_names,
)
data.head()


Unnamed: 0,polarity,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [27]:
# Keep only the label and the tweet text.
# errors='ignore' makes this cell safe to re-run: once the metadata columns are
# gone, a second plain drop would raise KeyError.
data = data.drop(columns=['id', 'date', 'query', 'user'], errors='ignore')
data.head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [28]:
# Check for the existence of emojis in the text

def contains_emoji(text):
    """Return True if at least one character of ``text`` is a key of ``emoji.EMOJI_DATA``.

    The check is done one character at a time and stops at the first hit.
    An empty ``text`` yields False.
    """
    for symbol in text:
        if symbol in emoji.EMOJI_DATA:
            return True
    return False

# Boolean mask: True for rows whose text has at least one emoji character.
has_emoji = data['text'].apply(contains_emoji)
data_with_emojis = data[has_emoji]
data_with_emojis

Unnamed: 0,polarity,text
242372,0,"@AndrewDearling haha, well they are a mjor par..."
244527,0,Le planetarium de montreal: le show est nul et...
245409,0,"okeei, whay all the men has to be like this?? ..."
245432,0,@Kulnyte its ok. wish you were here though// ♥
246784,0,*♥* Twitter is so confuddlin me *♥*
...,...,...
1596602,4,@andymcnally so far so good &lt;&lt;hugs&gt;...
1596991,4,http://www.twitpic.com/7jka4 ♥Italian Stuffed ...
1597694,4,LVATT ♥♥♥ out in stores today! GO BUY IT
1598666,4,Enjoyin' a lazy day off and making/finishing p...


In [29]:
# Remap the raw labels (0 = negative, 4 = positive) to 0/1.
# The identity entry 1 -> 1 keeps the cell re-runnable: without it, a second
# execution would send the already-converted 1s to NaN, because Series.map
# returns NaN for any value missing from the dict.
data['polarity'] = data['polarity'].map({0: 0, 4: 1, 1: 1})
data.head()

Unnamed: 0,polarity,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [31]:
# Function to clean tweets
def clean_tweet(tweet):
    """Normalize a raw tweet into lowercase, letters-only text.

    Steps, in order:
      1. Decode HTML entities (``&lt;``, ``&amp;``, ``&quot;`` ...) so they are
         handled as the symbols they stand for instead of leaking their names
         into the text (``&lt;hugs&gt;`` previously became ``lthugsgt``).
      2. Remove URLs (``http`` up to the next whitespace).
      3. Remove @mentions and #hashtags (the whole token).
      4. Remove every character that is not an ASCII letter or whitespace.
      5. Collapse the runs of whitespace left behind by the removals.
      6. Lowercase and strip leading/trailing whitespace.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet. May be an empty string when the tweet contained
        nothing but URLs, mentions, hashtags or non-letter characters.
    """
    tweet = html.unescape(tweet)  # Decode entities before symbols are stripped
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)  # Remove @ mentions
    tweet = re.sub(r'#[A-Za-z0-9_]+', '', tweet)  # Remove hashtags
    tweet = re.sub(r'[^A-Za-z\s]', '', tweet)  # Remove non-alphabetic characters
    tweet = re.sub(r'\s+', ' ', tweet)  # Collapse gaps left by removed tokens
    return tweet.lower().strip()  # Lowercase and remove leading/trailing spaces

# Run every tweet through clean_tweet and store the result back in 'text'.
cleaned_text = data['text'].apply(clean_tweet)
data['text'] = cleaned_text

Unnamed: 0,polarity,text
0,0,awww thats a bummer you shoulda got david car...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i her...
...,...,...
1599995,1,just woke up having no school is the best feel...
1599996,1,thewdbcom very cool to hear old walt interviews
1599997,1,are you ready for your mojo makeover ask me fo...
1599998,1,happy th birthday to my boo of alll time tupac...


In [32]:
# Display the DataFrame; pandas abbreviates long frames in the rendered output.
data

Unnamed: 0,polarity,text
0,0,awww thats a bummer you shoulda got david car...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i her...
...,...,...
1599995,1,just woke up having no school is the best feel...
1599996,1,thewdbcom very cool to hear old walt interviews
1599997,1,are you ready for your mojo makeover ask me fo...
1599998,1,happy th birthday to my boo of alll time tupac...


In [33]:
# Write the frame to disk; index=False leaves the row index out of the file.
output_path = 'cleaned_sentiment140.csv'
data.to_csv(output_path, index=False)