# Imports

In [2]:
from sklearn.utils import shuffle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import os

# Data preparation

In [9]:
# Load the preprocessed training set; later cells read its
# "processed_tweet" and "label" columns.
data = pd.read_csv(filepath_or_buffer="data/processed_train.csv")

In [10]:
# Fixed seed so that re-running the notebook on a fresh kernel reproduces the
# same train/test partition (without random_state the shuffle differs per run).
SPLIT_SEED = 42

# Hold out 20% of the rows as the test set.
train, test = train_test_split(data, test_size=0.2, random_state=SPLIT_SEED)

# Inputs (processed tweet text) and targets (labels) for each partition,
# converted from DataFrame columns to lists.
train_x = list(train["processed_tweet"])
test_x = list(test["processed_tweet"])
train_y = list(train["label"])
test_y = list(test["label"])

In [11]:
# Upper bound on the number of terms the vectorizer keeps.
vocabulary_size = 6000
vectorizer = CountVectorizer(max_features=vocabulary_size)

# Fit on the training texts only, then encode the test texts with the
# already-fitted vectorizer.
train_counts = vectorizer.fit_transform(train_x)
test_counts = vectorizer.transform(test_x)

# Materialise both results as arrays via toarray().
features_train = train_counts.toarray()
features_test = test_counts.toarray()

# Keep a handle on the fitted vocabulary.
vocabulary = vectorizer.vocabulary_

In [12]:
# From this point on, train_x / test_x refer to the vectorized features
# instead of the lists of tweet texts.
train_x, test_x = features_train, features_test

Create the validation set

In [13]:
# Size of the validation set: 20% of the current training rows (rounded down).
len_val = int(0.2 * len(train_x))

# The first len_val rows become the validation set; the rest stay in training.
_val_rows = slice(None, len_val)
_train_rows = slice(len_val, None)

# Both right-hand sides are evaluated before the names are rebound, so the
# training slice is still taken from the full, un-split data.
val_x, train_x = pd.DataFrame(train_x[_val_rows]), pd.DataFrame(train_x[_train_rows])
val_y, train_y = pd.DataFrame(train_y[_val_rows]), pd.DataFrame(train_y[_train_rows])

A directory is created for this version of the prepared data. Several versions will be produced, each using a different balancing technique.

In [None]:
# Ask for the version label; it becomes part of the local output directory
# name and of the S3 prefix in the cells below.
version = input("What version of the prepared data is it?:").strip()
if not version:
    # An empty label would silently produce names such as "data_prepared_".
    raise ValueError("The version of the prepared data must not be empty.")

In [15]:
# Local output directory for this version of the prepared data.
data_dir = f"data_prepared_{version}"

# exist_ok=True keeps the cell safe to re-run and avoids the gap between
# checking for the directory and creating it.
os.makedirs(data_dir, exist_ok=True)

In [16]:
def _write_csv(frame, file_name):
    """Save `frame` inside `data_dir` as a CSV without header row or index column."""
    frame.to_csv(os.path.join(data_dir, file_name), header=False, index=False)


# Test features and test labels go to two separate files.
_write_csv(pd.DataFrame(test_x), 'test.csv')
_write_csv(pd.DataFrame(test_y), 'test_y.csv')

# Validation and training files put the label column first, then the features.
_write_csv(pd.concat([val_y, val_x], axis=1), 'validation.csv')
_write_csv(pd.concat([train_y, train_x], axis=1), 'train.csv')

In [17]:
# Drop the references to the in-memory datasets now that they are on disk.
# The names must match the lowercase variables used above; the previous
# test_X / train_X / val_X spelling only created new, unrelated names.
# NOTE(review): features_train / features_test still reference the dense
# arrays, so clear those as well if the goal is to free memory.
test_x = train_x = val_x = train_y = val_y = None

Upload the data to S3 storage

In [21]:
import json

import sagemaker

# Store the current SageMaker session
session = sagemaker.Session()

# S3 prefix (which folder will we use)
prefix = f'twitter_sentiment_{version}'

# Upload the three prepared files under the prefix and keep the values
# returned by upload_data.
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

s3_folder = {f"model_{version}":{"test":test_location,"val":val_location,"train":train_location}}

# Record the locations in a single JSON object keyed by model version.
# Appending a new object on every run ("a+" followed by json.dump) produces
# several concatenated objects, which json.load cannot parse.
s3_folders_path = "data/s3_folders.json"
s3_folders = {}
if os.path.exists(s3_folders_path):
    with open(s3_folders_path) as f:
        remaining = f.read().strip()
    # Files written by earlier runs may hold several objects back to back;
    # decode them one at a time so no previously recorded version is lost.
    decoder = json.JSONDecoder()
    while remaining:
        entry, end = decoder.raw_decode(remaining)
        s3_folders.update(entry)
        remaining = remaining[end:].lstrip()

# A re-run for the same version replaces that version's entry.
s3_folders.update(s3_folder)
with open(s3_folders_path, "w") as f:
    json.dump(s3_folders, f, indent=2)