In [1]:
import os
import re
from collections import defaultdict

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sn
from sklearn.model_selection import train_test_split
import hiddenlayer as hl

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import Optimizer

import torchtext
from torchtext import data

In [2]:
# Load the raw tweet dump. Column names are passed explicitly, so the first
# line of the file is read as data rather than as a header.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` and is gone in pandas 2.x - switch once the
# environment's pandas version is confirmed.
dataset = pd.read_csv(
    filepath_or_buffer="training.1600000.processed.noemoticon.csv",
    names=['target', 'ids', 'date', 'flag', 'user', 'text'],
    encoding='latin-1',
    error_bad_lines=False,
)

In [3]:
# Peek at the first five rows to confirm the columns were parsed as expected.
dataset.head(n=5)

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Work on a random 100,000-row subset of the full file.
# A fixed random_state makes the subset - and every number derived from it
# further down - identical on each Restart & Run All; without it every run
# drew a different set of rows.
dataset = dataset.sample(n=100000, random_state=42)

# Show how many rows of each target value (0 and 4) ended up in the subset.
dataset.target.value_counts()

0    50086
4    49914
Name: target, dtype: int64

In [5]:
# Keep only the tweet text and its label. The copy is deep (the default), so
# the column assignments made later on `raw_data` do not touch `dataset`.
raw_data = dataset.loc[:, ['text', 'target']].copy()

In [6]:
# Display the two-column working frame (pandas elides the middle rows).
raw_data

Unnamed: 0,text,target
261453,@missalywilliams will be at my cousin's weddin...,0
1120950,Bounce magic,4
1310003,Created a look on looklet.com. http://looklet....,4
908350,Apparently today is Happy Star Wars day. I'm ...,4
986322,@hiyer ya wil call you in a bit,4
...,...,...
848412,"@Princess_Han Yeah, that's cool",4
1132570,@WendyDavie glad you are back!,4
1250074,Had an appointment. Now going down 85 through ...,4
561075,is injured. Hahah. http://plurk.com/p/11lqjs,0


In [7]:
# Patterns are raw strings so regex escapes (\S, \w, \d, ...) reach `re`
# untouched instead of relying on Python tolerating invalid string escapes.
# They are compiled once here rather than on every call.
_LINK_RE = re.compile(r'https?://\S+')
_USER_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\S+')
_NUMBER_RE = re.compile(r'\d+')
_PUNCTUATION_RE = re.compile(r"[.?!,;:\-\[\]{}()'\"/]")


def clean(text):
    """Normalise one tweet into lower-cased text with placeholder tokens.

    Substitutions are applied in this order:

    1. ``http://`` / ``https://`` links  -> ``<url>``
    2. ``@mentions``                     -> ``<user>``
    3. ``#hashtags``                     -> ``<hashtag>``
    4. runs of digits                    -> ``<number>``
    5. listed punctuation characters     -> a single space

    Links are replaced first so that an ``@`` or digits inside a URL cannot
    be picked up by the later patterns and split the URL into fragments.
    Each placeholder is padded with spaces so it stays a separate
    whitespace-delimited token; repeated spaces are not collapsed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned, lower-cased text.

    Raises
    ------
    TypeError
        If ``text`` is not a string (for example a NaN from a missing cell).
    """
    text = _LINK_RE.sub(' <url> ', text)
    text = _USER_RE.sub(' <user> ', text)
    text = _HASHTAG_RE.sub(' <hashtag> ', text)
    text = _NUMBER_RE.sub(' <number> ', text)
    # '<' and '>' are not in the punctuation class, so placeholders survive.
    text = _PUNCTUATION_RE.sub(' ', text)

    return text.lower()

In [8]:
# Run every tweet through clean(); the function is passed directly, no
# wrapper lambda is needed.
raw_data['text'] = raw_data['text'].apply(clean)

In [9]:
# Recode label 4 as 1 so the two classes are 0 and 1; other values are kept.
raw_data['target'] = raw_data['target'].replace({4: 1})

In [10]:
# Hold out 20% of the rows for testing. The fixed random_state keeps the
# train/test membership identical across runs, so the exported CSVs and any
# metrics computed from them are reproducible.
train, test = train_test_split(raw_data, test_size=0.20, random_state=42)

In [11]:
# Number of rows in the training split.
train.shape[0]

80000

In [12]:
# Write both splits to CSV without the index column.
# to_csv does not create missing directories, so make sure ./data exists
# first; exist_ok=True keeps the cell safe to re-run.
os.makedirs('./data', exist_ok=True)
train.to_csv('./data/train.csv', index=False)
test.to_csv('./data/test.csv', index=False)