# Imports

We import all libraries.

In [1]:
from sklearn.utils import shuffle, resample
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import os

# Data preparation

Before the model can be used, the tweets must be vectorized, because the algorithm to be used does not work with words but with numeric matrices.
A CountVectorizer will be used. To do this, the data is first split into training and test sets. The vectorizer is then fit on the training set, and the test set is transformed using the vocabulary learned from the training set.
The vectorizer expects the training and test sets as lists of strings (one processed tweet per entry).

First the data that has already been cleaned in the exploration step is loaded.

In [2]:
# Load the tweets that were already cleaned during the exploration step.
data = pd.read_csv(filepath_or_buffer="data/processed_train.csv")

### 1. Not balancing the data

Splitting the data into train and test datasets.

In [3]:
# Hold out 20% of the rows for testing. The fixed random_state controls the
# random sampling so the split is reproducible between runs.
split_options = {"test_size": 0.2, "random_state": 1}
train, test = train_test_split(data, **split_options)

In [8]:
# Count how many rows of each label ended up in each split, to see the class imbalance.
test_label_counts = test["label"].value_counts()
train_label_counts = train["label"].value_counts()

print("Labels for testing dataset:\n", test_label_counts, "\n")
print("Labels for training dataset:\n", train_label_counts)

Labels for testing dataset:
 0    5951
1     442
Name: label, dtype: int64 

Labels for training dataset:
 0    23769
1     1800
Name: label, dtype: int64


Now the datasets will be transformed into lists so they can be used as parameters for the vectorizer. 

The labels are separated from the tweets.

In [10]:
# Pull the processed tweet text out of each split as plain Python lists,
# which is the form passed to the vectorizer below.
train_x, test_x = (list(split["processed_tweet"]) for split in (train, test))

# The labels are kept separately, in the same row order as the tweets.
train_y, test_y = (list(split["label"]) for split in (train, test))

Now a vectorizer will be fit with a vocabulary of 6,000 words. It will be fit on the training data set and then used to transform the test data set.

In [11]:
# Limit the learned vocabulary to `vocabulary_size` entries.
vocabulary_size = 6000
vectorizer = CountVectorizer(max_features=vocabulary_size)

# Fit on the training tweets only, then reuse the fitted vectorizer for the test
# tweets so both splits share the same columns.
train_counts = vectorizer.fit_transform(train_x)
test_counts = vectorizer.transform(test_x)

# Convert the vectorizer output to dense arrays for the steps that follow.
features_train = train_counts.toarray()
features_test = test_counts.toarray()

# Keep the learned vocabulary mapping for later reference.
vocabulary = vectorizer.vocabulary_

In [12]:
# From here on the X parts of train and test are no longer processed tweets but arrays of numbers.
train_x, test_x = features_train, features_test

A validation set will be split off from the training set so that it can be passed to the algorithm in SageMaker.

In [13]:
# Carve the validation set out of the training data: the first 20% of the rows
# become validation, the remaining 80% stay as training data.
#
# The split is derived from `features_train` and `train["label"]` (the untouched
# outputs of the earlier cells) instead of from `train_x` / `train_y`, which this
# cell overwrites. That keeps the cell idempotent: re-running it no longer shrinks
# the training set by another 20% each time.
len_val = int(len(features_train) * 0.2)  # validation length: 20% of the whole training set

val_x = pd.DataFrame(features_train[:len_val])
train_x = pd.DataFrame(features_train[len_val:])

# Labels are sliced at the same position so they stay aligned with the feature rows.
val_y = pd.DataFrame(list(train["label"])[:len_val])
train_y = pd.DataFrame(list(train["label"])[len_val:])

A first directory will be created for the prepared data. Several such directories will be created, one for each balancing technique.

In [None]:
# This version variable will be set several times, once for each version of the balanced data.
# It becomes part of the output folder name and of the S3 prefix, so surrounding
# whitespace is stripped and an empty answer is rejected up front.
version = input("What version of the prepared data is it?:").strip()
if not version:
    raise ValueError("The data version must not be empty.")

In [15]:
# A separate folder is used for each version of the balanced data.
data_dir = f"data_prepared_{version}"

# exist_ok=True makes re-running the cell a no-op when the folder is already there,
# without a separate existence check.
os.makedirs(data_dir, exist_ok=True)

In [16]:
# Write every split into the newly created folder as CSV without header or index.
# Each frame is built lazily, right before it is written, so that only one of the
# large frames has to exist at a time.
csv_outputs = (
    ('test.csv', lambda: pd.DataFrame(test_x)),
    ('test_y.csv', lambda: pd.DataFrame(test_y)),
    # Validation and training files hold the label column first, then the features.
    ('validation.csv', lambda: pd.concat([val_y, val_x], axis=1)),
    ('train.csv', lambda: pd.concat([train_y, train_x], axis=1)),
)

for filename, build_frame in csv_outputs:
    build_frame().to_csv(os.path.join(data_dir, filename), header=False, index=False)

In [17]:
# Given the amount of memory the data frames take, drop the references to them
# once they have been permanently saved to disk.
# The names must be the lowercase ones used by the cells above; binding None to
# differently-cased names would only create new variables and release nothing.
test_x = train_x = val_x = train_y = val_y = None
# NOTE(review): features_train / features_test (created by the vectorizer cell)
# still reference the dense arrays; clear them as well if no later cell needs them.

Finally, the training and validation sets, as well as the test set (the X part), will be uploaded to S3 storage. From there they will be read when training the model.

In [21]:
import json

import sagemaker  # The sagemaker module will be used to upload the data to S3.

session = sagemaker.Session()  # Store the current SageMaker session

# S3 prefix (which folder will we use)
prefix = f'twitter_sentiment_{version}'

# Upload each file and keep the value returned by upload_data as its S3 location,
# so it can be referenced later.
test_location = session.upload_data(os.path.join(data_dir, 'test.csv'), key_prefix=prefix)
val_location = session.upload_data(os.path.join(data_dir, 'validation.csv'), key_prefix=prefix)
train_location = session.upload_data(os.path.join(data_dir, 'train.csv'), key_prefix=prefix)

# The locations for this data version, keyed by model version.
s3_folder = {f"model_{version}": {"test": test_location, "val": val_location, "train": train_location}}

# The locations are saved permanently in a JSON file so other modules can read them.
# Appending a fresh json.dump on every run would leave several JSON objects
# back-to-back in the file, which json.load cannot parse. Instead the existing
# entries are read, merged with the new one and written back as a single object.
s3_registry_path = "data/s3_folders.json"
s3_registry = {}
if os.path.exists(s3_registry_path):
    with open(s3_registry_path) as f:
        existing_text = f.read()
    # raw_decode reads one JSON value at a time, so this also recovers every entry
    # from a file that already contains several concatenated objects.
    decoder = json.JSONDecoder()
    position = 0
    while position < len(existing_text):
        if existing_text[position].isspace():
            position += 1  # raw_decode does not skip leading whitespace itself
            continue
        entry, position = decoder.raw_decode(existing_text, position)
        s3_registry.update(entry)

# A re-run for the same version replaces that version's entry.
s3_registry.update(s3_folder)

with open(s3_registry_path, "w") as f:
    json.dump(s3_registry, f)

### 2. Undersampling technique.