In [1]:
import pandas as pd
import re

# Zero is negative, one is positive
def classifier(x):
    """Map a raw sentiment label to a binary class.

    Parameters
    ----------
    x
        Raw label from the source data. Only 0 and 4 are accepted.

    Returns
    -------
    int
        0 (negative) when ``x == 0``, 1 (positive) when ``x == 4``.

    Raises
    ------
    ValueError
        If ``x`` is neither 0 nor 4. ``ValueError`` is a subclass of
        ``Exception``, so callers catching ``Exception`` are unaffected.
    """
    if x == 0:
        return 0
    if x == 4:
        return 1
    # Report the offending value: when this is mapped over a whole column,
    # a bare "Unknown class!" gives no hint about which label was unexpected.
    raise ValueError('Unknown class! Got {!r}, expected 0 or 4.'.format(x))

def clean_text(text):
    """Strip mentions, hashtag markers, links and punctuation from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The text with ``@mentions`` and URLs removed, ``#`` markers dropped
        (the hashtag word itself is kept) and every character that is neither
        a word character nor whitespace deleted. Whitespace is left untouched,
        so removed tokens may leave consecutive spaces behind.
    """
    # Remove @mentions. The character class includes '_' so that handles
    # beginning with an underscore (e.g. "@_user") are removed as a whole
    # instead of leaving "_user" behind after the punctuation pass.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)  # Drop the '#' but keep the hashtag word
    text = re.sub(r'https?:\/\/\S+', '', text)  # Remove http(s) links
    text = re.sub(r'www\.\S+\.\S+', '', text)  # Remove scheme-less www links
    # Remove punctuation/symbols; \w keeps letters, digits and underscores.
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [2]:
### Load data
# The file is decoded as ISO-8859-1 (Latin-1).
# NOTE(review): read_csv treats the first row as a header by default, while
# later cells address columns by position -- confirm source.csv really has a
# header row, otherwise the first record is consumed as column names.
data = pd.read_csv(
    'source.csv',
    encoding="ISO-8859-1",
)

In [3]:
### Keep only column 0 (mapped to a binary target) and column 5 (the text),
### clean the text with the regular expressions in clean_text, then drop rows
### whose cleaned text duplicates an earlier row (the first occurrence is kept).
clean = (
    pd.DataFrame({
        'target': data.iloc[:, 0].map(classifier),
        'text': data.iloc[:, 5].apply(clean_text),
    })
    .drop_duplicates(subset=['text'])
)

In [6]:
# Balancing classes

### Fixed seed so the shuffle, and therefore the train/test files, are
### reproducible across Restart & Run All.
SEED = 42

### Upper bound on rows per class in each split (kept small for test runs).
MAX_ROWS_PER_CLASS = 50000

### Split by class
data0 = clean[clean.target == 0]
data1 = clean[clean.target == 1]

### Shuffle to mitigate "the worst case"
data0 = data0.sample(frac=1, random_state=SEED)
data1 = data1.sample(frac=1, random_state=SEED)

### Every class has to supply number_of_rows rows to train AND to test, so the
### size is capped at half of the smaller class. Without the cap, slicing past
### the end of a class silently returns fewer rows and the splits end up
### unbalanced.
number_of_rows = min(MAX_ROWS_PER_CLASS, len(data0) // 2, len(data1) // 2)


### Separate for train and test

### Make them of the same length
train0 = data0.head(number_of_rows)
train1 = data1.head(number_of_rows)

### Test rows come right after the train rows, so the two splits never overlap
test0 = data0.iloc[number_of_rows:2*number_of_rows]
test1 = data1.iloc[number_of_rows:2*number_of_rows]

### Put them back to one dataset
### (rows stay grouped by class: all of class 0 first, then all of class 1)
train = pd.concat([train0, train1])
test = pd.concat([test0, test1])

train.to_csv('train.csv', index = False, encoding = 'utf-8')
test.to_csv('test.csv', index = False, encoding = 'utf-8')