In [1]:
import pandas as pd
import re

In [2]:
# Base sentiment dataset: read only the id, text and label columns.
tsad_file_path = "train.csv"
tsad_df = pd.read_csv(
    tsad_file_path,
    encoding="latin1",
    usecols=['textID', 'text', 'sentiment'],
)

In [3]:
# Keep only the rows that actually have a text value.
tsad_df = tsad_df[tsad_df['text'].notna()]

In [4]:
# Show how many rows carry each sentiment label.
tsad_df['sentiment'].value_counts()

sentiment
neutral     11117
positive     8582
negative     7781
Name: count, dtype: int64

In [5]:
# Extra rows needed per label so that each one reaches 50001/3 rows.
pos_n, neg_n, neu_n = (
    int(50001/3 - (tsad_df['sentiment'] == label).sum())
    for label in ("positive", "negative", "neutral")
)

In [6]:
# Number of additional rows to collect, in the order positive, negative, neutral.
print(pos_n, neg_n, neu_n)

8085 8886 5550


In [7]:
# Secondary tweet corpus used as the source of extra positive/negative rows.
additional_path = "training.1600000.processed.noemoticon.csv"
additional_df = pd.read_csv(
    additional_path,
    encoding="latin1",
)

In [8]:
# Remove leading/trailing whitespace from the column labels so later lookups
# such as 'text of the tweet' match exactly.
additional_df.columns = additional_df.columns.str.strip()

In [9]:
# Preview the first rows of the secondary corpus.
additional_df.head()

Unnamed: 0,polarity of tweet,id of the tweet,date of the tweet,query,user,text of the tweet
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [10]:
# Flag every tweet whose text already appeared earlier in the frame, report how
# many there are, then keep only the first occurrence of each text.
repeated_text_mask = additional_df.duplicated(subset=['text of the tweet'], keep='first')
duplicate_text_count = repeated_text_mask.sum()
print(f"Number of duplicate text values: {duplicate_text_count}")

additional_df = additional_df[~repeated_text_mask]

Number of duplicate text values: 12440


In [11]:
# Preview the first rows of the base dataset.
tsad_df.head()

Unnamed: 0,textID,text,sentiment
0,cb774db0d1,"I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,negative
2,088c60f138,my boss is bullying me...,negative
3,9642c003ef,what interview! leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",negative


In [12]:
# Empty frame that accumulates the newly collected (text, sentiment) rows.
df = pd.DataFrame(columns=['text', 'sentiment'])

In [13]:
# Texts of the base dataset; the collection loops test candidates against this
# so that no text already present in the base dataset is added again.
all_text = tsad_df['text'].values

In [14]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import torch
import numpy as np
from scipy.special import softmax

# Hugging Face checkpoint used to double-check the sentiment of candidate texts.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Tokenizer, classifier and config all come from the same checkpoint;
# `config.id2label` maps a class index to its label name in get_sentiment.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)

def get_sentiment(text):
    """Classify ``text`` with the module-level sentiment model.

    Parameters
    ----------
    text : str
        Raw text to score.

    Returns
    -------
    str
        Lower-cased ``config.id2label`` entry of the highest-scoring class.
    """
    # Truncate explicitly: without it an over-long text is passed to the model
    # in full and the forward pass fails. 512 is the usual RoBERTa-base input
    # limit -- NOTE(review): confirm against this checkpoint's config.
    encoded_input = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )

    with torch.no_grad():
        output = model(**encoded_input)

    # Softmax is monotonic, so the arg-max of the raw logits is already the
    # predicted class; no need to normalise and sort all scores.
    top_class = int(output[0][0].argmax())
    return config.id2label[top_class].lower()

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Get more positive data

In [15]:
# Tweets whose polarity column equals 4, re-indexed from 0.
pos_df = additional_df.loc[additional_df['polarity of tweet'].eq(4)].reset_index(drop=True)

In [16]:
# Collect `pos_n` extra positive tweets from `pos_df`. A tweet is accepted only
# when it is non-blank, not already in the base dataset, and the classifier
# also labels it 'positive'.
known_texts = set(all_text)  # hashed lookup instead of scanning every base text per candidate
idx = 0    # next row of pos_df to examine
added = 0  # rows accepted so far
while added < pos_n and idx < len(pos_df):  # stop cleanly if pos_df is exhausted
    text = pos_df.iloc[idx]['text of the tweet']
    idx += 1
    # Skip non-string cells (e.g. missing values), blanks and known texts.
    if not isinstance(text, str) or text.strip() == "" or text in known_texts:
        continue
    if get_sentiment(text) != 'positive':
        continue
    # Append immediately so an interrupted run keeps what was gathered so far.
    df.loc[len(df)] = [text, 'positive']
    added += 1
    if added % 100 == 0:
        print("Working on row", added)

if added < pos_n:
    print(f"Only collected {added} of {pos_n} positive rows before running out of candidates")

Working on row 100
Working on row 200
Working on row 300
Working on row 400
Working on row 500
Working on row 600
Working on row 700
Working on row 800
Working on row 900
Working on row 1000
Working on row 1100
Working on row 1200
Working on row 1300
Working on row 1400
Working on row 1500
Working on row 1600
Working on row 1700
Working on row 1800
Working on row 1900
Working on row 2000
Working on row 2100
Working on row 2200
Working on row 2300
Working on row 2400
Working on row 2500
Working on row 2600
Working on row 2700
Working on row 2800
Working on row 2900
Working on row 3000
Working on row 3100
Working on row 3200
Working on row 3300
Working on row 3400
Working on row 3500
Working on row 3600
Working on row 3700
Working on row 3800
Working on row 3900
Working on row 4000
Working on row 4100
Working on row 4200
Working on row 4300
Working on row 4400
Working on row 4500
Working on row 4600
Working on row 4700
Working on row 4800
Working on row 4900
Working on row 5000
Working o

In [17]:
# Checkpoint the rows collected so far (index column omitted).
df.to_csv("additional_ds.csv", encoding="latin1", index=False)

# Get more negative data

In [18]:
# Tweets whose polarity column equals 0, re-indexed from 0.
neg_df = additional_df.loc[additional_df['polarity of tweet'].eq(0)].reset_index(drop=True)

In [19]:
# Collect `neg_n` extra negative tweets from `neg_df`. A tweet is accepted only
# when it is non-blank, not already in the base dataset, and the classifier
# also labels it 'negative'.
known_texts = set(all_text)  # hashed lookup instead of scanning every base text per candidate
idx = 0    # next row of neg_df to examine
added = 0  # rows accepted so far
while added < neg_n and idx < len(neg_df):  # stop cleanly if neg_df is exhausted
    text = neg_df.iloc[idx]['text of the tweet']
    idx += 1
    # Skip non-string cells (e.g. missing values), blanks and known texts.
    if not isinstance(text, str) or text.strip() == "" or text in known_texts:
        continue
    if get_sentiment(text) != 'negative':
        continue
    # Append immediately so an interrupted run keeps what was gathered so far.
    df.loc[len(df)] = [text, 'negative']
    added += 1
    if added % 100 == 0:
        print("Working on row", added)

if added < neg_n:
    print(f"Only collected {added} of {neg_n} negative rows before running out of candidates")

Working on row 100
Working on row 200
Working on row 300
Working on row 400
Working on row 500
Working on row 600
Working on row 700
Working on row 800
Working on row 900
Working on row 1000
Working on row 1100
Working on row 1200
Working on row 1300
Working on row 1400
Working on row 1500
Working on row 1600
Working on row 1700
Working on row 1800
Working on row 1900
Working on row 2000
Working on row 2100
Working on row 2200
Working on row 2300
Working on row 2400
Working on row 2500
Working on row 2600
Working on row 2700
Working on row 2800
Working on row 2900
Working on row 3000
Working on row 3100
Working on row 3200
Working on row 3300
Working on row 3400
Working on row 3500
Working on row 3600
Working on row 3700
Working on row 3800
Working on row 3900
Working on row 4000
Working on row 4100
Working on row 4200
Working on row 4300
Working on row 4400
Working on row 4500
Working on row 4600
Working on row 4700
Working on row 4800
Working on row 4900
Working on row 5000
Working o

In [20]:
# Checkpoint again now that the negative rows have been added (index column omitted).
df.to_csv("additional_ds.csv", encoding="latin1", index=False)

# Get more neutral data

In [21]:
import pandas as pd
from deep_translator import GoogleTranslator
import nltk
from nltk.corpus import wordnet
import random
from nltk.corpus import stopwords

## Back translation

In [22]:
def back_translation(text, src_lang="en", mid_lang="fr"):
    """Paraphrase ``text`` by a round trip through another language.

    The text is translated from ``src_lang`` to ``mid_lang`` and the result is
    translated back to ``src_lang``; that final string is returned.
    """
    pivot_text = GoogleTranslator(source=src_lang, target=mid_lang).translate(text)
    return GoogleTranslator(source=mid_lang, target=src_lang).translate(pivot_text)

In [23]:
# Spot-check the round trip on a short sentence from the base dataset.
back_translation('I`d have responded, if I were going')

'I would have answered, if I was going'

In [24]:
# Spot-check the round trip on a text that contains a URL.
back_translation('http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth')

'http://www.dothebouncy.com/smf - a little shameless taking for the forum of the best rangers on earth'

In [51]:
# Neutral rows of the base dataset, re-indexed from 0; these get back-translated.
neu_df = tsad_df.loc[tsad_df['sentiment'].eq('neutral')].reset_index(drop=True)

In [52]:
# Back-translated neutral texts accepted so far; used to reject repeats.
new_neu = []

In [53]:
# Create `neu_n` extra neutral rows by back-translating neutral texts from
# `neu_df`. A paraphrase is accepted only when it is new (neither in the base
# dataset nor already produced) and the classifier labels it 'neutral'.
known_texts = set(all_text)  # hashed lookups instead of linear scans per candidate
seen_new = set(new_neu)
idx = 0            # next row of neu_df to examine
added = 0          # rows accepted so far
failed = 0         # candidates skipped because translation/classification raised
last_error = None  # most recent exception, reported at the end
while added < neu_n and idx < len(neu_df):  # stop cleanly if neu_df is exhausted
    og_text = neu_df.iloc[idx]['text']
    idx += 1
    try:
        text = back_translation(og_text)
        if not isinstance(text, str) or text in known_texts or text in seen_new:
            continue
        if get_sentiment(text) != 'neutral':
            continue
    except Exception as e:
        # Best effort: one failing text must not abort the run, but keep a
        # record so failures are visible instead of silently dropped.
        failed += 1
        last_error = e
        continue
    new_neu.append(text)
    seen_new.add(text)
    # Append immediately so an interrupted run keeps what was gathered so far.
    df.loc[len(df)] = [text, 'neutral']
    added += 1
    if added % 100 == 0:
        print("Working on row", added)

if failed:
    print(f"Skipped {failed} texts because of errors; last error: {last_error!r}")
if added < neu_n:
    print(f"Only collected {added} of {neu_n} neutral rows before running out of candidates")

Working on row 100
Working on row 200
Working on row 300
Working on row 400
Working on row 500
Working on row 600
Working on row 700
Working on row 800
Working on row 900
Working on row 1000
Working on row 1100
Working on row 1200
Working on row 1300
Working on row 1400
Working on row 1500
Working on row 1600
Working on row 1700
Working on row 1800
Working on row 1900
Working on row 2000
Working on row 2100
Working on row 2200
Working on row 2300
Working on row 2400
Working on row 2500
Working on row 2600
Working on row 2700
Working on row 2800
Working on row 2900
Working on row 3000
Working on row 3100
Working on row 3200
Working on row 3300
Working on row 3400
Working on row 3500
Working on row 3600
Working on row 3700
Working on row 3800
Working on row 3900
Working on row 4000
Working on row 4100
Working on row 4200
Working on row 4300
Working on row 4400
Working on row 4500
Working on row 4600
Working on row 4700
Working on row 4800
Working on row 4900
Working on row 5000
Working o

In [None]:
# Remove exact duplicate (text, sentiment) rows, then renumber the index.
# Rows are appended with `df.loc[len(df)] = ...`; if the surviving labels had
# gaps, that label could already exist and the append would overwrite a row.
df = df.drop_duplicates().reset_index(drop=True)

In [93]:
# Write the de-duplicated augmentation set (index column omitted).
# NOTE(review): latin1 cannot represent every Unicode character, and the
# back-translated texts come from an external service -- confirm this write
# cannot raise UnicodeEncodeError on them.
df.to_csv("additional_ds.csv", encoding="latin1", index=False)

## Synonym replacement

In [None]:
# Fetch the NLTK resources this section reads: WordNet (and its multilingual
# companion) for synonym lookup, plus the stop-word corpus that
# `stopwords.words('english')` loads.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

In [None]:
# English stop words as a plain list; these are never picked for replacement.
stop_words = list(stopwords.words('english'))

In [None]:
def get_synonyms(word):
    """Return WordNet synonyms of ``word`` as a sorted list of strings.

    Every lemma name from every synset of ``word`` is lower-cased, has ``_``
    and ``-`` turned into spaces, and is stripped of any character other than
    ``a``-``z`` and space. Candidates that end up empty, or that equal
    ``word`` ignoring case, are dropped.

    Parameters
    ----------
    word : str
        Word to look up.

    Returns
    -------
    list of str
        Unique cleaned synonyms in alphabetical order (empty when none).
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    # Candidates are lower-cased, so compare against the lower-cased word;
    # otherwise a capitalised input gets its own lower-case form back.
    target = word.lower()

    synonyms = set()
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(ch for ch in candidate if ch in allowed_chars)
            # Collapse whitespace left behind by removed characters.
            candidate = " ".join(candidate.split())
            # A lemma made only of filtered characters becomes "" -- not a usable synonym.
            if candidate and candidate != target:
                synonyms.add(candidate)

    # Sorted so the order does not depend on set iteration order, which keeps
    # a seeded `random.choice` over the result reproducible.
    return sorted(synonyms)

In [None]:
def synonym_replacement(words):
    """Return ``words`` with some non-stop-words swapped for synonyms.

    The sentence is split on whitespace. Between 1 and all of its distinct
    non-stop-word tokens are chosen at random, and every occurrence of a
    chosen token that has at least one synonym is replaced by one randomly
    picked synonym.

    Parameters
    ----------
    words : str
        Sentence to augment.

    Returns
    -------
    str
        The tokens joined by single spaces. When the sentence has no
        replaceable token (empty, blank, or stop words only) it is returned
        unchanged apart from whitespace normalisation.
    """
    tokens = words.split()

    # dict.fromkeys de-duplicates while keeping first-seen order, so a seeded
    # RNG gives the same shuffle every run (a set would not). The stop-word
    # list is lower-case, hence the case-insensitive comparison.
    candidates = list(dict.fromkeys(
        token for token in tokens if token.lower() not in stop_words
    ))
    if not candidates:
        # random.randint(1, 0) would raise ValueError; nothing can be replaced.
        return ' '.join(tokens)

    n = random.randint(1, len(candidates))
    random.shuffle(candidates)

    new_words = tokens.copy()
    num_replaced = 0
    for random_word in candidates:
        synonyms = get_synonyms(random_word)

        if synonyms:
            synonym = random.choice(synonyms)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1

        if num_replaced >= n:  # only replace up to n words
            break

    return ' '.join(new_words)

In [None]:
# Try synonym replacement on a sample text that contains a URL.
synonym_replacement('http://www.dothebouncy.com/smf - some shameless plugging for the best Rangers forum on earth')