In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import re
import nltk

# Read data from Excel Sheet

In [3]:
df_obama = pd.read_excel('training-Obama-Romney-tweets.xlsx', sheet_name='Obama')
df_romney = pd.read_excel('training-Obama-Romney-tweets.xlsx', sheet_name='Romney')

In [4]:
df_obama.head()

Unnamed: 0.1,Unnamed: 0,date,time,Anootated tweet,Unnamed: 4,Unnamed: 5
0,,,,"1: positive, -1: negative, 0: neutral, 2: mixed",Class,Your class
1,,2012-10-16 00:00:00,10:28:53-05:00,"Kirkpatrick, who wore a baseball cap embroider...",0,
2,,2016-12-10 00:00:00,10:09:00-05:00,Question: If <e>Romney</e> and <e>Obama</e> ha...,2,
3,,2012-10-16 00:00:00,10:04:30-05:00,#<e>obama</e> debates that Cracker Ass Cracker...,1,
4,,2012-10-16 00:00:00,10:00:36-05:00,RT @davewiner Slate: Blame <e>Obama</e> for fo...,2,


In [74]:
df_romney.head()

Unnamed: 0.1,Unnamed: 0,date,time,Anootated tweet,Unnamed: 4,Unnamed: 5
0,,,,"1: positive, -1: negative, 0: neutral, 2: mixed",Class,Your class label
1,,2012-10-16 00:00:00,09:38:08-05:00,Insidious!<e>Mitt Romney</e>'s Bain Helped Phi...,-1,
2,,2012-10-16 00:00:00,10:22:34-05:00,Senior <e>Romney</e> Advisor Claims <e>Obama</...,2,
3,,2012-10-16 00:00:00,10:14:18-05:00,.@WardBrenda @shortwave8669 @allanbourdius you...,-1,
4,,2012-10-16 00:00:00,09:27:16-05:00,<e>Mitt Romney</e> still doesn't <a>believe</a...,-1,


# Data Cleaning

In [5]:
def clean_data(df):
    df = df.drop(['Unnamed: 0', 'Unnamed: 5'], axis=1)
    df = df.rename(columns = {'Unnamed: 4': 'class', 'Anootated tweet': 'tweets'})
    df = df.dropna()
    df = df[~df['class'].isin(['irrevelant', 'irrelevant', 'IR', 'Class', '!!!!', 2])]
    df['class'] = df['class'].astype(int)
    df = df.drop(df[df['class'] == 2].index)
    df = df.reset_index(drop=True)

    return df

df_obama = clean_data(df_obama)
df_romney = clean_data(df_romney)

def clean_tweets(text):
    text = text.lower()
    text = re.sub('<[^>]+>', '', text) # remove HTML tags
    text = re.sub('@[^\s]+','',text) # remove usernames
    text = re.sub('[<>!@$#:.,%\?-]+', '', text) # remove punctuation and special characters
    text = re.sub('http\S+', '', text) # remove URLs
    # text = re.sub('#\w+', '', text) # remove hashtags

    return text

df_obama['tweets'] = df_obama['tweets'].apply(clean_tweets)
df_romney['tweets'] = df_romney['tweets'].apply(clean_tweets)

In [81]:
df_obama.head()

Unnamed: 0,date,time,tweets,class
0,2012-10-16 00:00:00,10:28:53-05:00,kirkpatrick who wore a baseball cap embroidere...,0
1,2012-10-16 00:00:00,10:04:30-05:00,obama debates that cracker ass cracker tonight...,1
2,2012-10-16 00:00:00,09:50:08-05:00,youre missing the point im afraid you do n...,0
3,2012-10-16 00:00:00,10:00:16-05:00,i was raised as a democrat left the party yea...,-1
4,2012-10-16 00:00:00,09:48:07-05:00,the obama camp can't afford to lower expectati...,0


In [82]:
df_romney.head()

Unnamed: 0,date,time,tweets,class
0,2012-10-16 00:00:00,09:38:08-05:00,insidiousmitt romney's bain helped philip morr...,-1
1,2012-10-16 00:00:00,10:14:18-05:00,you mean like romney cheated in primary,-1
2,2012-10-16 00:00:00,09:27:16-05:00,mitt romney still doesn't believe that we have...,-1
3,2012-10-16 00:00:00,10:11:43-05:00,romney's tax plan deserves a 2nd look because ...,-1
4,2012-10-16 00:00:00,10:13:17-05:00,hope romney debate prepped w/ the same people ...,1


In [13]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
temp = df_obama.head(100)

In [15]:
mapper = {'negative': -1, 'neutral': 0, 'positive': 1}

In [16]:
df_obama['sentiment'] = df_obama['tweets'].apply(lambda x: mapper[classifier(x)[0]['label']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['sentiment'] = temp['tweets'].apply(lambda x: mapper[classifier(x)[0]['label']])


In [17]:
temp.head(20)

Unnamed: 0,date,time,tweets,class,sentiment
0,2012-10-16 00:00:00,10:28:53-05:00,kirkpatrick who wore a baseball cap embroidere...,0,0
1,2012-10-16 00:00:00,10:04:30-05:00,obama debates that cracker ass cracker tonight...,1,0
2,2012-10-16 00:00:00,09:50:08-05:00,youre missing the point im afraid you do n...,0,-1
3,2012-10-16 00:00:00,10:00:16-05:00,i was raised as a democrat left the party yea...,-1,-1
4,2012-10-16 00:00:00,09:48:07-05:00,the obama camp can't afford to lower expectati...,0,0
5,2012-10-16 00:00:00,10:12:50-05:00,obama pot policy disappointing to say the lea...,-1,-1
6,2012-10-16 00:00:00,10:12:11-05:00,not all of hollywood has his back rt gene sim...,-1,-1
7,2012-10-16 00:00:00,09:22:47-05:00,obama's expedient speak fair in order to slend...,0,0
8,2012-10-16 00:00:00,10:02:09-05:00,i had a dream that i was smoking with obama oo,0,0
9,2012-10-16 00:00:00,10:02:57-05:00,the washington times the president’s popularit...,-1,0


In [18]:
temp[temp['class'] == temp['sentiment']].shape

(64, 5)

In [69]:
classifier('bad')[0]['label']

'negative'