In [14]:
import pandas as pd
import pathlib
import numpy as np
from fastai.text import *

In [15]:
# Working directory for this notebook: the raw TSV is read from here and the
# generated CSV files are written here.
CLAS_PATH = pathlib.Path("test/")
# exist_ok=True keeps this cell re-runnable when the directory already exists.
CLAS_PATH.mkdir(exist_ok=True)

In [16]:
# Marker tokens that get_texts() inserts in front of each document / text field
# before the text is handed to the tokenizer.
BOS = 'xbos'  # beginning-of-sentence tag
FLD = 'xfld'  # data field tag

In [17]:
# Load the raw tab-separated export (no header row) and keep only the two
# columns used downstream: column 1 (sentiment label) and column 4 (tweet text).
tweets_raw = pd.read_csv(CLAS_PATH/"downloaded.tsv", sep="\t", header=None)
df_clas_data = tweets_raw[[1, 4]].copy()
df_clas_data.columns = ['sentiment', 'tweet_text']

# Strip surrounding whitespace from every string cell. The stripped frame has to
# be assigned back: the element-wise map returns a new frame and does not modify
# the original (previously the result was discarded, so nothing was stripped).
# Column-wise Series.map is used so the cell works across pandas versions.
df_clas_data = df_clas_data.apply(
    lambda col: col.map(lambda x: x.strip() if type(x) is str else x)
)

# Drop rows whose text contains the "Not Available" placeholder (presumably
# tweets that could not be downloaded -- confirm against the data source).
# na=True makes rows with a missing text count as "not usable" as well, so they
# are dropped too, matching the previous `== False` comparison.
is_unavailable = df_clas_data.tweet_text.str.contains("Not Available", na=True)
df_clas_data = df_clas_data[~is_unavailable]

# Encode the textual sentiment labels numerically.
mapping = {'positive': 1.0, 'negative': -1.0, 'neutral': 0}
df_clas_data = df_clas_data.replace({'sentiment': mapping})

# Fails loudly here if a label outside `mapping` is still present.
df_clas_data['sentiment'] = df_clas_data['sentiment'].astype(np.float32)

In [18]:
import re

import re
import sys


def preprocess_word(word):
    """Normalize a single whitespace-delimited token.

    Steps, in order:
      1. strip leading/trailing punctuation characters (quotes, ?!,.():;),
      2. collapse any run of a repeated character to exactly two
         (e.g. ``funnnnny`` -> ``funny``),
      3. delete every hyphen and apostrophe.

    Args:
        word: the raw token.

    Returns:
        The normalized token (may be the empty string).
    """
    trimmed = word.strip('\'"?!,.():;')
    squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
    return re.sub(r'(-|\')', '', squeezed)


def is_valid_word(word):
    """Return True if `word` looks like a word token worth keeping.

    A valid word starts with a letter and continues with letters, digits,
    underscores or dots only.

    The character classes are Unicode-aware (``\\w`` on a ``str`` pattern), so
    words containing non-ASCII letters such as ä, ö, ü or ß are accepted. The
    previous ASCII-only classes rejected every such word, which removed them
    from the German tweets processed in this notebook. Results for pure-ASCII
    input are unchanged.

    Args:
        word: the (already normalized) token.

    Returns:
        bool: True when the token matches the pattern described above.
    """
    # [^\W\d_] == "a word character that is neither a digit nor underscore",
    # i.e. a letter in any script.
    return re.search(r'^[^\W\d_][\w.]*$', word) is not None


def handle_emojis(tweet):
    """Replace ASCII emoticons with ' EMO_POS ' / ' EMO_NEG ' placeholders.

    The substitutions are applied in the order listed below.

    Emoticons that end in the letter D (``:D``, ``xD``, ``;D`` ...) are matched
    case-insensitively, so they are also recognized in text that has already
    been lower-cased. They must end at a word boundary (and ``x`` must start at
    one), so ordinary text such as "Achtung: Das" or "TaxDay" is left alone.
    Repeated D's (``:DDD``, ``xDD``) count as a single emoticon.

    Args:
        tweet: the tweet text.

    Returns:
        The text with every recognized emoticon replaced by a placeholder that
        is padded with one space on each side.
    """
    substitutions = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D (any case, D may repeat)
        (r'(:\s?d+\b|:-d+\b|\bx-?d+\b)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?d+\b|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, placeholder in substitutions:
        # IGNORECASE only affects the letter-based emoticons; the remaining
        # patterns contain no letters.
        tweet = re.sub(pattern, placeholder, tweet, flags=re.IGNORECASE)
    return tweet


def preprocess_tweet(tweet):
    """Clean one tweet and return it as a space-separated string of words.

    Pipeline (in order): lower-case the text; replace URLs with ``URL`` and
    @handles with ``USER_MENTION``; unwrap #hashtags; drop the ``rt`` retweet
    marker; turn runs of 2+ dots into a space; strip surrounding spaces and
    quotes; replace emoticons via handle_emojis(); split on whitespace;
    normalize every token with preprocess_word() and keep only those accepted
    by is_valid_word().

    Note that the emoticon replacement runs on the already lower-cased text.

    Args:
        tweet: raw tweet text (must be a ``str``).

    Returns:
        The cleaned tweet; an empty string if no token survives.
    """
    text = tweet.lower()

    # Regex rewrites, applied strictly in this order.
    rewrites = (
        (r'((www\.[\S]+)|(https?://[\S]+))', ' URL '),  # URLs -> URL
        (r'@[\S]+', 'USER_MENTION'),                     # @handle -> USER_MENTION
        (r'#(\S+)', r' \1 '),                            # #hashtag -> hashtag
        (r'\brt\b', ''),                                 # remove RT (retweet)
        (r'\.{2,}', ' '),                                # 2+ dots -> space
    )
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    # Strip space, " and ' from both ends, then map emoticons to EMO_POS / EMO_NEG.
    text = handle_emojis(text.strip(' "\''))
    # Collapse whitespace runs to a single space before tokenizing.
    text = re.sub(r'\s+', ' ', text)

    normalized = (preprocess_word(token) for token in text.split())
    return ' '.join(token for token in normalized if is_valid_word(token))

In [19]:
# Clean every tweet; preprocess_tweet is passed directly, no lambda wrapper needed.
df_clas_data['tweet_text'] = df_clas_data['tweet_text'].apply(preprocess_tweet)

In [20]:
# Preview only the first rows instead of rendering the whole frame.
df_clas_data.head(10)

Unnamed: 0,sentiment,tweet_text
0,1.0,USER_MENTION so ein hearthstonekey von USER_ME...
3,0.0,tainted talents ateliertagebuch wir sind nicht...
4,0.0,aber wenigstens kommt supernatural heute mal w...
6,0.0,darlehen angebot schufafreie darlehen anbieter...
7,0.0,anruf hardcore teeny vicky carrera hallo mein ...
8,0.0,na wo sind frankens heimliche talente die erst...
9,1.0,breitet sich aus
10,-1.0,USER_MENTION unachtsam
11,0.0,jobs sales medien tv tele m1 sucht werbe verka...
13,0.0,happy halloween liebe studenten zeigt eure gru...


Manually trim the dataset so that every class has the same number of examples (for debugging purposes only; the code in the next cell is currently commented out).

In [21]:
#a = df_clas_data.loc[df_clas_data['sentiment'] == -1.0]
#a = a.head(1000)
#b = df_clas_data.loc[df_clas_data['sentiment'] == 1.0]
#b = b.head(1000)
#c = df_clas_data.loc[df_clas_data['sentiment'] == 0]
#c = c.head(1000)
#frames = [a, b, c]
#frames = [a, b]
#result = pd.concat(frames)
#df_clas_data  = result

In [22]:
# Optional: drop the neutral tweets (currently disabled, all three classes are kept).
#df_clas_data = df_clas_data[df_clas_data['sentiment'] != 0].reset_index(drop=True)

# Persist the complete cleaned data set (with header row).
df_clas_data.to_csv(CLAS_PATH/"German_Sentiment_Data.csv", index=False)

# Random train/validation split: each row draws a uniform number in [0, 1) and
# goes to the training set when the draw is above 0.1 (roughly 90% / 10%).
# The fixed seed makes the split reproducible.
np.random.seed(42)
trn_keep = np.random.rand(len(df_clas_data)) > 0.1
df_trn, df_val = df_clas_data[trn_keep], df_clas_data[~trn_keep]

# Write both splits without header row and without the index column.
for split_frame, split_file in (
    (df_trn, "German_Sentiment_Data_Train.csv"),
    (df_val, "German_Sentiment_Data_Test.csv"),
):
    split_frame.to_csv(CLAS_PATH/split_file, header=None, index=False)

len(df_trn),len(df_val)

(6382, 748)

In [23]:
# Re-open both splits as chunked readers: iterating over df_trn / df_val yields
# DataFrames of at most `chunksize` rows each (columns are labelled 0, 1, ...
# because the files have no header row).
chunksize = 10000
df_trn, df_val = [
    pd.read_csv(CLAS_PATH/split_file, header=None, chunksize=chunksize)
    for split_file in ("German_Sentiment_Data_Train.csv", "German_Sentiment_Data_Test.csv")
]

In [24]:
def get_texts(df, n_lbls=1):
    """Tokenize one DataFrame chunk and extract its labels.

    The first `n_lbls` columns (by position) are the labels; every remaining
    column is a text field. Columns are looked up by integer label, so the
    frame is expected to have columns labelled 0, 1, 2, ... (as produced by
    ``read_csv(header=None)``).

    Each row becomes one document of the form
    ``'\\n' + BOS + ' ' + FLD + ' 1 <field 1> ' + FLD + ' 2 <field 2> ...'``.

    Args:
        df: DataFrame chunk with label columns first, then text columns.
        n_lbls: number of leading label columns.

    Returns:
        (tok, labels): the tokenizer output for all rows, and a list with one
        int64 array of length `n_lbls` per row.
    """
    label_matrix = df.iloc[:, :n_lbls].values.astype(np.int64)

    # First text field, prefixed with the beginning-of-sentence and field markers.
    texts = f'\n{BOS} {FLD} 1 ' + df[n_lbls].astype(str)

    # Append any further text fields, each introduced by its own field marker.
    for col in range(n_lbls + 1, len(df.columns)):
        texts = texts + (f' {FLD} {col-n_lbls} ' + df[col].astype(str))

    # partition_by_cores splits the texts into sublists, one per core, for the
    # multi-process tokenizer. Lower and upper case is handled inside the tokenizer.
    per_core = partition_by_cores(texts)
    tok = Tokenizer().proc_all_mp(per_core)
    return tok, list(label_matrix)

def get_all(df, n_lbls):
    """Tokenize every chunk of an iterable of DataFrames.

    Args:
        df: iterable yielding DataFrame chunks (e.g. a chunked CSV reader).
        n_lbls: number of leading label columns, forwarded to get_texts().

    Returns:
        (tok, labels): the per-chunk results of get_texts() concatenated in
        chunk order.
    """
    tok, labels = [], []
    for chunk_no, chunk in enumerate(df):
        # Progress indicator: index of the chunk being processed.
        print(chunk_no)
        chunk_tok, chunk_labels = get_texts(chunk, n_lbls)
        tok.extend(chunk_tok)
        labels.extend(chunk_labels)
    return tok, labels

In [25]:
# Tokenize both splits; the second argument is the number of leading label columns.
# df_trn / df_val are chunked readers that are used up by iterating over them, so
# re-create them (re-run the read_csv cell) before re-running this cell.
tok_trn, trn_labels = get_all(df_trn, 1)
tok_val, val_labels = get_all(df_val, 1)

0
0


In [26]:
# Sanity check: one label entry per training row is expected.
len(trn_labels)

6382