In [1]:
# Colab setup: mount Google Drive at /content/gdrive so the project files are reachable.
# force_remount=True requests a fresh mount even if the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
!pip install contractions

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting contractions
  Downloading contractions-0.1.72-py2.py3-none-any.whl (8.3 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 14.1 MB/s 
[?25hCollecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
[K     |████████████████████████████████| 287 kB 45.9 MB/s 
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.72 pyahocorasick-1.4.4 textsearch-0.0.24


In [3]:
import pandas as pd
import nltk
import contractions

In [4]:
# Project folder on the mounted Drive, and the raw tweet CSV inside its data/ directory.
root_dir = '/content/gdrive/MyDrive/oc_projet_7'
data_path = f'{root_dir}/data/training.1600000.processed.noemoticon.csv'

In [5]:
# Fetch the NLTK resources needed further down (used by the WordNet lemmatizer).
download_list = ['wordnet', 'omw-1.4']

for asset in download_list:
    nltk.download(asset)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [6]:
# Debug switch: when enabled, only `test_mode_samples` rows are kept to speed things up.
test_mode = False
test_mode_samples = 1000

# The CSV is read without a header row, so the column names are supplied explicitly.
column_names = ['label', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv(
    data_path,
    names=column_names,
    header=None,
    encoding='latin-1',
)

if test_mode:
    # fixed random_state so the reduced sample is identical on every run
    df = df.sample(n=test_mode_samples, random_state=1)
df

Unnamed: 0,label,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...


In [7]:
#TODO more preprocessing

def lower_case(text):
    """Return `text` converted to lower case."""
    return text.lower()


def remove_contractions(text):
    """Return `text` after running it through `contractions.fix` with slang handling on."""
    return contractions.fix(text, slang=True)


def tokenize(text):
    """Split `text` into a list of tokens using NLTK's TweetTokenizer (default settings)."""
    tweet_tokenizer = nltk.tokenize.TweetTokenizer()
    return tweet_tokenizer.tokenize(text)


#TODO transform numbers into 'number_token'
#               tweet mentions into 'mention_token'
#               name into 'name_token'
#               etc...
def various_normalizations(tokens):
    """Replace numbers, user mentions and URLs with placeholder tokens.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A new list of the same length where
        - numeric tokens become '<number>',
        - tokens starting with '@' become '<user>',
        - tokens starting with 'http' or 'www' become '<url>',
        - every other token is kept unchanged.
    """
    new_tokens = []

    for token in tokens:
        if _is_number(token):
            new_tokens.append('<number>')
        elif token.startswith('@'):
            new_tokens.append('<user>')
        elif token.startswith(('http', 'www')):
            new_tokens.append('<url>')
        else:
            new_tokens.append(token)
    return new_tokens


def _is_number(token):
    """Tell whether `token` is numeric text: it parses with float() and contains a digit."""
    try:
        float(token)
    except (TypeError, ValueError):
        # Only parsing failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    # float() also accepts the plain words 'nan', 'inf' and 'infinity'. Requiring
    # at least one digit keeps those tokens from being turned into '<number>'.
    return any(char.isdigit() for char in str(token))


def lemmatize(tokens):
    """Return a new list with every token passed through NLTK's WordNetLemmatizer.

    `lemmatize` is called without a part-of-speech argument, so results can look
    odd for non-nouns (this notebook's sample output shows 'was' becoming 'wa').
    """
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    new_tokens = []
    for token in tokens:
        new_tokens.append(wordnet_lemmatizer.lemmatize(token))
    return new_tokens



In [8]:
def preprocessing(df, func_list, exemple_index=None):
    """Apply each function of `func_list`, in order, to the 'text' column of a copy of `df`.

    After every step the function name is printed, followed by the current value of
    the example row so the effect of the step can be inspected.

    Args:
        df: DataFrame with a 'text' column; it is deep-copied, never modified.
        func_list: callables applied element-wise to the 'text' column, in order.
        exemple_index: index label of the row to print after each step. When it is
            None or not present in the frame (e.g. after sub-sampling), the example
            print is skipped instead of raising KeyError.

    Returns:
        The transformed copy of `df`.
    """
    data = df.copy(deep=True)
    for func in func_list:
        print(func.__name__.upper(), ':')
        data['text'] = data['text'].apply(func)
        # The example row may be missing when only a sample of the data was loaded.
        if exemple_index is not None and exemple_index in data.index:
            print('\n', data['text'].loc[exemple_index], '\n'*2)

    return data

In [9]:
# Build the preprocessed dataset; the row at `exemple_index` is used as the printed example.
exemple_index = 610789

# Steps run in this order: the first two work on raw strings, the rest on token lists.
func_list = [lower_case, remove_contractions, tokenize, various_normalizations, lemmatize]

to_keep = ['text', 'label']
df_processed = preprocessing(df[to_keep], func_list, exemple_index)

LOWER_CASE :

 @misskeribaby wish i was in la right now  


REMOVE_CONTRACTIONS :

 @misskeribaby wish i was in la right now  


TOKENIZE :

 ['@misskeribaby', 'wish', 'i', 'was', 'in', 'la', 'right', 'now'] 


VARIOUS_NORMALIZATIONS :

 ['<user>', 'wish', 'i', 'was', 'in', 'la', 'right', 'now'] 


LEMMATIZE :

 ['<user>', 'wish', 'i', 'wa', 'in', 'la', 'right', 'now'] 




In [10]:
save = True
save_path = f'{root_dir}/data/df_preprocessed_2.pkl'

if save:
    # Round-trip through the pickle file to confirm that the save worked.
    df_processed.to_pickle(save_path)
    df_processed = pd.read_pickle(save_path)
df_processed

Unnamed: 0,text,label
0,"[<user>, <url>, -, awww, ,, that, is, a, bumme...",0
1,"[is, upset, that, he, cannot, update, his, fac...",0
2,"[<user>, i, dived, many, time, for, the, ball,...",0
3,"[my, whole, body, feel, itchy, and, like, it, ...",0
4,"[<user>, no, ,, it, is, not, behaving, at, all...",0
...,...,...
1599995,"[just, woke, up, ., having, no, school, is, th...",4
1599996,"[thewdb.com, -, very, cool, to, hear, old, wal...",4
1599997,"[are, you, ready, for, your, mojo, makeover, ?...",4
1599998,"[happy, 38th, birthday, to, my, boo, of, alll,...",4


In [11]:
# Class balance check: the label column only contains 0 (negative) and 4 (positive) texts.
df_processed['label'].value_counts()

0    800000
4    800000
Name: label, dtype: int64

In [12]:
# Inspect the token list stored in the 'text' column for the first row.
df_processed['text'].iloc[0]

['<user>',
 '<url>',
 '-',
 'awww',
 ',',
 'that',
 'is',
 'a',
 'bummer',
 '.',
 'you',
 'shoulda',
 'got',
 'david',
 'carr',
 'of',
 'third',
 'day',
 'to',
 'do',
 'it',
 '.',
 ';d']