In [2]:
import pandas as pd
import string
import re
import pickle

from nltk.corpus import stopwords

## Define Functions to clean text ##

1. Remove strange characters
2. Remove punctuations
3. Change all characters to lower case
4. Remove stopwords
5. Remove numerical characters

In [3]:
def mod_code(text):
    text = re.sub(r'[^\x00-\x7f]',r'', text)
    return text

def mod_char(text):
    text = " ".join(word.replace('[^\w\s]',' ') for word in text.split())
    return text

def mod_punctuations(text):
    for punctuation in string.punctuation:
        text = " ".join(word.replace(punctuation, '') for word in text.split())
    return text

def mod_lower(text):
    text = text.lower()
    return text

stop = stopwords.words('english')
def mod_stop(text):
    text = ' '.join(word for word in text.split() if word not in stop)
    return text

def rem_num(text):
    text = ' '.join(word for word in text.split() if not word.isdigit())
    return text

def clean_df(df,col):
    df[col] = df[col].apply(rem_num)
    df[col] = df[col].apply(mod_char)
    df[col] = df[col].apply(mod_punctuations)
    df[col] = df[col].apply(mod_lower)
    df[col] = df[col].apply(mod_stop)
    df[col] = df[col].apply(mod_code)
    
def clean_txt(text):
    text = mod_char(text)
    text = mod_punctuations(text)
    text = mod_lower(text)
    text = mod_code(text)
    text = mod_stop(text)
    
    return text

In [6]:
fairy = pd.read_pickle('fairy.pkl')
fairy.head()

Unnamed: 0,labels,texts
0,neutral,NEAR the grass-covered rampart which encircles...
1,neutral,Balsams and other flowers greet us from the lo...
2,neutral,The building is the Warton Almshouse.
3,neutral,Look! at the window there leans an old maid.
4,sadness,"She plucks the withered leaf from the balsam, ..."


In [7]:
fairy.to_csv("fairy.csv")

In [5]:
clean_df(fairy,'texts')
fairy.head()

Unnamed: 0,labels,texts
0,neutral,near grasscovered rampart encircles copenhagen...
1,neutral,balsams flowers greet us long rows windows hou...
2,neutral,building warton almshouse
3,neutral,look window leans old maid
4,sadness,plucks withered leaf balsam looks grasscovered...


In [110]:
pd.value_counts(fairy['labels'])

neutral     10139
joy          1610
surprise      832
sadness       831
anger         730
fear          697
disgust       463
Name: labels, dtype: int64

In [232]:
isear = pd.read_pickle("isear.pkl")
isear.head()

Unnamed: 0,labels,texts
0,joy,days feel close partner friends feel peace als...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility elucida...
3,sadness,think short time live relate periods life thin...
4,disgust,gathering found involuntarily sitting next two...


In [233]:
pd.value_counts(isear['labels'])

joy        1092
sadness    1082
anger      1079
fear       1076
shame      1071
disgust    1066
guilt      1050
Name: labels, dtype: int64

In [245]:
df = fairy.append(isear)
df.head()

Unnamed: 0,labels,texts
0,neutral,near grasscovered rampart encircles copenhagen...
1,neutral,balsams flowers greet us long rows windows hou...
2,neutral,building warton almshouse
3,neutral,look window leans old maid
4,sadness,plucks withered leaf balsam looks grasscovered...


In [246]:
df = df.reset_index(drop=True)
pd.value_counts(df['labels'])

neutral     10139
joy          2702
sadness      1913
anger        1809
fear         1773
disgust      1529
shame        1071
guilt        1050
surprise      832
Name: labels, dtype: int64

In [247]:
subset1 = df[df['labels'] == 'neutral'].sample(frac=.8,replace=False)
df = df.drop(subset1.index,axis=0)
subset2 = df[df['labels'] == 'joy'].sample(frac=.2,replace=False)
df = df.drop(subset2.index,axis=0)

In [248]:
pd.value_counts(df['labels'])

joy         2162
neutral     2028
sadness     1913
anger       1809
fear        1773
disgust     1529
shame       1071
guilt       1050
surprise     832
Name: labels, dtype: int64

In [249]:
df['labels'] = df['labels'].apply(lambda x: 'shame-guilt' if x == 'guilt' else x)
df['labels'] = df['labels'].apply(lambda x: 'shame-guilt' if x == 'shame' else x)

df['labels'] = df['labels'].apply(lambda x: 'fear' if x == 'surprise' else x)

In [250]:
pd.value_counts(df['labels'])

fear           2605
joy            2162
shame-guilt    2121
neutral        2028
sadness        1913
anger          1809
disgust        1529
Name: labels, dtype: int64

In [251]:
subset2 = df[df['labels'] == 'fear'].sample(frac=.2,replace=False)
df = df.drop(subset2.index,axis=0)

In [252]:
pd.value_counts(df['labels'])

joy            2162
shame-guilt    2121
fear           2084
neutral        2028
sadness        1913
anger          1809
disgust        1529
Name: labels, dtype: int64

In [253]:
df.to_pickle("df.pkl")