# Final Project: Tweet Sentiment Analysis

In [1]:
import warnings
import pandas as pd

warnings.filterwarnings("ignore")

tweets_df = pd.read_csv("training.1600000.processed.noemoticon.csv", names= ['sentiment', 'id', 'time', 'flag', 'user', 'tweet'])
print(tweets_df.columns)
display(tweets_df.head())

Index(['sentiment', 'id', 'time', 'flag', 'user', 'tweet'], dtype='object')


Unnamed: 0,sentiment,id,time,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
tweets_df = tweets_df[['tweet', 'sentiment']]
MAP = {0: "NEG", 4: "POS"}
tweets_df["sentiment"] = tweets_df.sentiment.map(MAP)

print('The dataset consists of {} features & {} samples.'.format(tweets_df.shape[1], tweets_df.shape[0]))
print('\nThe dataset has {} missing values.\n'.format(tweets_df.isnull().any(axis = 1).sum()))
display(tweets_df.head())

The dataset consists of 2 features & 1600000 samples.

The dataset has 0 missing values.



Unnamed: 0,tweet,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",NEG
1,is upset that he can't update his Facebook by ...,NEG
2,@Kenichan I dived many times for the ball. Man...,NEG
3,my whole body feels itchy and like its on fire,NEG
4,"@nationwideclass no, it's not behaving at all....",NEG


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [4]:
import nltk
from nltk.corpus import stopwords
import regex as re
from tqdm import tqdm

nltk.download("stopwords")

def text_preprocessing(s):
    s = s.lower()
    s = re.sub(r"\'t", " not", s)
    s = re.sub(r'(@.*?)[\s]', ' ', s)
    s = re.sub(r'([\'\"\.\(\)\!\?\\\/\,])', r' \1 ', s)
    s = re.sub(r'[^\w\s\?]', ' ', s)
    s = re.sub(r'([\;\:\|•«\n])', ' ', s)
    s = " ".join([word for word in s.split()
                  if word not in stopwords.words('english')
                  or word in ['not', 'can']])
    s = re.sub(r'\s+', ' ', s).strip()

    return s

for index, tweet in enumerate(tqdm(tweets_df.iterrows(), total= 1600000)):
    tweets_df.at[index, 'tweet'] = text_preprocessing(tweets_df.at[index, 'tweet'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 1600000/1600000 [59:44<00:00, 446.32it/s] 


In [5]:
tweets_df['tweet_vector'] = tweets_df['tweet']

for index, tweet in enumerate(tqdm(tweets_df.iterrows(), total= 1600000)):
    tweets_df.at[index, 'tweet_vector'] = model.encode(tweets_df.at[index, 'tweet'])

display(tweets_df.head())

  0%|          | 637/1600000 [00:47<33:08:11, 13.41it/s]


KeyboardInterrupt: 

In [8]:
print(len(tweets_df.at[0, 'tweet_vector']))

768
