In [2]:
import pandas as pd

In [3]:
def load_raw_data(path: str) -> pd.DataFrame:
    """Create a Dataframe containing each tweet

    Args:
        path (str): The path of the file (.txt) to load tweets from

    Returns:
        DataFrame: a Dataframe with one row per tweet
    """
    data = []
    with open(path) as file:
        for line in file:
            data.append(line)
    data_df = pd.DataFrame(data, columns = {'tweet'})
    return data_df

In [5]:
df = load_raw_data('/Volumes/MarcWatineHD/ETH/CIL/project_data/twitter-datasets/train_pos.txt')

In [7]:
a = pd.concat([df, df], ignore_index=True, sort=False)


In [8]:
a

Unnamed: 0,tweet
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just put casper in a box ! "" looved t..."
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...
...,...
199995,<user> hey gina what's up ?\n
199996,"<user> sas 9.1 . 3 and 9.2 , east 5 , s-plus 8..."
199997,<user> <user> um gord ... i just read your pro...
199998,<user> i'm so excited for tomorrow ! look out ...


In [9]:
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
			'ours', 'ourselves', 'you', 'your', 'yours', 
			'yourself', 'yourselves', 'he', 'him', 'his', 
			'himself', 'she', 'her', 'hers', 'herself', 
			'it', 'its', 'itself', 'they', 'them', 'their', 
			'theirs', 'themselves', 'what', 'which', 'who', 
			'whom', 'this', 'that', 'these', 'those', 'am', 
			'is', 'are', 'was', 'were', 'be', 'been', 'being', 
			'have', 'has', 'had', 'having', 'do', 'does', 'did',
			'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
			'because', 'as', 'until', 'while', 'of', 'at', 
			'by', 'for', 'with', 'about', 'against', 'between',
			'into', 'through', 'during', 'before', 'after', 
			'above', 'below', 'to', 'from', 'up', 'down', 'in',
			'out', 'on', 'off', 'over', 'under', 'again', 
			'further', 'then', 'once', 'here', 'there', 'when', 
			'where', 'why', 'how', 'all', 'any', 'both', 'each', 
			'few', 'more', 'most', 'other', 'some', 'such', 'no', 
			'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
			'very', 's', 't', 'can', 'will', 'just', 'don', 
			'should', 'now', '']

In [10]:
def get_synonyms(word):
    """
    Get synonyms of a word
    """
    synonyms = set()
    
    for syn in wordnet.synsets(word): 
        for l in syn.lemmas(): 
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym) 
    
    if word in synonyms:
        synonyms.remove(word)
    
    return list(synonyms)

In [11]:
def synonym_replacement(words, n):
    
    words = words.split()
    
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word not in stop_words]))
    random.shuffle(random_word_list)
    num_replaced = 0
    
    for random_word in random_word_list:
        synonyms = get_synonyms(random_word)
        
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        
        if num_replaced >= n: #only replace up to n words
            break

    sentence = ' '.join(new_words)

    return sentence

In [12]:
import random
from random import shuffle
random.seed(1)

In [13]:
from nltk.corpus import wordnet

In [16]:
def data_augmentation(tweet: str) -> str:
    """ Creates a new tweet replacing a word by its synonym using the Thesaurus-based subsititution

    Args:
        tweet (string): tweet as string

    Returns:
        string: new augmented tweet
    """
    # try with 2 words instead of 3 as well, might be better?
    augmented_tweet = synonym_replacement(tweet, 3)
    return augmented_tweet

In [38]:
augmentation = pd.DataFrame(columns = ['tweet'])

In [39]:
augmentation

Unnamed: 0,tweet


In [40]:
augmentation['tweet'] = df['tweet'].apply(lambda row: data_augmentation((str(row))))

In [42]:
b = pd.concat([df, augmentation], ignore_index=True, sort=False)



In [27]:
augmentation = augmentation.rename(columns={'0': 'tweet'})

In [50]:
df.iloc[10000]

tweet    i love our fans ! there all so beautiful ! x <...
Name: 10000, dtype: object

In [41]:
augmentation

Unnamed: 0,tweet
0,<user> i dunno justin register my mention or n...
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just position casper in a box seat ! ..."
3,<user> <user> thank sir > > don't trip lil mum...
4,chat my chum tmr is the better birthday gift e...
...,...
99995,<user> hey gina what's up ?
99996,"<user> sas 9.1 . 3 and 9.2 , east pentad , s-p..."
99997,<user> <user> um gord ... i just read your vis...
99998,<user> i'm so excited for tomorrow ! take care...


In [34]:
type(augmentation)

pandas.core.series.Series

In [43]:
b

Unnamed: 0,tweet
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just put casper in a box ! "" looved t..."
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...
...,...
199995,<user> hey gina what's up ?
199996,"<user> sas 9.1 . 3 and 9.2 , east pentad , s-p..."
199997,<user> <user> um gord ... i just read your vis...
199998,<user> i'm so excited for tomorrow ! take care...


In [51]:
df

Unnamed: 0,tweet
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just put casper in a box ! "" looved t..."
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...
...,...
99995,<user> hey gina what's up ?\n
99996,"<user> sas 9.1 . 3 and 9.2 , east 5 , s-plus 8..."
99997,<user> <user> um gord ... i just read your pro...
99998,<user> i'm so excited for tomorrow ! look out ...


In [52]:
df['tweet'] = augmentation['tweet']

In [54]:
df = augmentation

In [56]:
df = b

In [57]:
df

Unnamed: 0,tweet
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just put casper in a box ! "" looved t..."
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...
...,...
199995,<user> hey gina what's up ?
199996,"<user> sas 9.1 . 3 and 9.2 , east pentad , s-p..."
199997,<user> <user> um gord ... i just read your vis...
199998,<user> i'm so excited for tomorrow ! take care...
