In [1]:
import collections
import numpy as np
import pandas as pd
import re

from argparse import Namespace

In [25]:
# Notebook-wide settings: file locations, split ratios, and the RNG seed.
args = Namespace(
    # Input / output CSV paths (relative to the notebook's working directory).
    raw_dataset_csv="./Mental-Health-Twitter.csv",
    output_munged_csv="./twitter_new.csv",
    # Fraction of each label's rows assigned to each split.
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    # Seed passed to NumPy before shuffling.
    seed=1337,
)

In [8]:
# Read the raw CSV; its first row (header=0) supplies the column names.
twitter_df = pd.read_csv(filepath_or_buffer=args.raw_dataset_csv, header=0)
# Preview the first five rows.
twitter_df.head(5)

Unnamed: 0.1,Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1


In [10]:
# Drop the metadata columns; only the index columns, the tweet text and the
# label are kept for the rest of the notebook.
# The frame is rebound instead of mutated in place, and errors="ignore" lets
# the cell be re-run after the columns are already gone (the original
# in-place drop raised KeyError on a second execution).
unused_columns = [
    'post_id', 'post_created', 'user_id', 'followers',
    'friends', 'favourites', 'statuses', 'retweets',
]
twitter_df = twitter_df.drop(columns=unused_columns, errors="ignore")
twitter_df.head()

Unnamed: 0.1,Unnamed: 0,post_text,label
0,0,It's just over 2 years since I was diagnosed w...,1
1,1,"It's Sunday, I need a break, so I'm planning t...",1
2,2,Awake but tired. I need to sleep but my brain ...,1
3,3,RT @SewHQ: #Retro bears make perfect gifts and...,1
4,4,It’s hard to say whether packing lists are mak...,1


In [12]:
def preprocess_text(text):
    """Normalize a tweet into lowercase, space-separated tokens.

    Steps:
      1. Lowercase the text.
      2. Surround each of ``. , ! ?`` with spaces so it becomes its own token.
      3. Replace every run of characters other than ASCII letters and
         ``. , ! ?`` (digits, apostrophes, ``@``, ``#``, whitespace, ...)
         with a single space.

    Args:
        text: The raw tweet. Missing values (``None`` / NaN, which
            ``pd.read_csv`` produces for empty cells) are accepted; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The normalized text, or ``""`` for a missing value. Leading or
        trailing spaces are not stripped.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN, which has no .lower().
        if pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()
    # Pad sentence punctuation so it is split off from neighbouring words.
    text = re.sub(r"([.,!?])", r" \1 ", text)
    # Collapse everything that is not a letter or kept punctuation to one space.
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text
    
# Replace each raw tweet with its normalized form.
twitter_df["post_text"] = twitter_df["post_text"].apply(preprocess_text)
twitter_df.head()

Unnamed: 0.1,Unnamed: 0,post_text,label
0,0,it s just over years since i was diagnosed wit...,1
1,1,"it s sunday , i need a break , so i m planning...",1
2,2,awake but tired . i need to sleep but my brain...,1
3,3,rt sewhq retro bears make perfect gifts and ar...,1
4,4,it s hard to say whether packing lists are mak...,1


In [None]:
# Group every row, converted to a plain dict, under its label value.
# NOTE(review): the split cell further down rebuilds this same mapping, so
# this cell looks redundant - confirm before relying on it.
by_label = collections.defaultdict(list)
for _, row in twitter_df.iterrows():
    row_as_dict = row.to_dict()
    by_label[row["label"]].append(row_as_dict)

In [19]:
# Number of rows per label.
twitter_df["label"].value_counts()

1    10000
0    10000
Name: label, dtype: int64

In [20]:
# Distinct label values present in the data.
set(twitter_df["label"])

{0, 1}

In [21]:
# Split the rows of each label into train / val / test partitions, so every
# partition is drawn per label rather than from the pooled data.
by_label = collections.defaultdict(list)
for _, row in twitter_df.iterrows():
    by_label[row.label].append(row.to_dict())

final_list = []
# Seed NumPy's global RNG so the shuffles below are repeatable.
np.random.seed(args.seed)

# True when the three proportions are meant to cover every row.
proportions_cover_all = np.isclose(
    args.train_proportion + args.val_proportion + args.test_proportion, 1.0
)

# Visit labels in sorted order so the RNG is consumed in a fixed sequence.
for _, item_list in sorted(by_label.items()):

    np.random.shuffle(item_list)

    n_total = len(item_list)
    n_train = int(args.train_proportion * n_total)
    n_val = int(args.val_proportion * n_total)
    if proportions_cover_all:
        # int() truncates, so the three counts can sum to less than n_total.
        # Let the test split absorb the remainder; otherwise the trailing
        # rows would get no 'split' value (NaN in the output CSV).
        n_test = n_total - n_train - n_val
    else:
        # Proportions deliberately cover only part of the data: keep the
        # requested test size and leave the leftover rows unassigned.
        n_test = int(args.test_proportion * n_total)

    # Give each data point a split attribute
    for item in item_list[:n_train]:
        item['split'] = 'train'

    for item in item_list[n_train:n_train+n_val]:
        item['split'] = 'val'

    for item in item_list[n_train+n_val:n_train+n_val+n_test]:
        item['split'] = 'test'

    # Add to final list
    final_list.extend(item_list)  # extend adds all elements; append would add the list as a single element.

In [22]:
# Turn the list of row dicts (now carrying a 'split' key) into a DataFrame.
final_twitter = pd.DataFrame(data=final_list)
# Number of rows assigned to each split.
final_twitter["split"].value_counts()

train    14000
test      3000
val       3000
Name: split, dtype: int64

In [23]:
# Preview the first five rows of the split-annotated frame.
final_twitter.head(5)

Unnamed: 0.1,Unnamed: 0,post_text,label,split
0,19927,try to have as good a life as you can under th...,0,train
1,13245,realdonaldtrump full of yourself ? ? remember...,0,train
2,12303,politicomag o man woman if we had mr . truman...,0,train
3,17707,two and a half hours late to work,0,train
4,18820,michael tarallo is now following me on twitter...,0,train


In [26]:
# Write the split-annotated data to disk without the DataFrame index.
final_twitter.to_csv(path_or_buf=args.output_munged_csv, index=False)