# Imports

We import all libraries.

In [1]:
from sklearn.utils import shuffle, resample
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
import pandas as pd
import os

# Data preparation

Before the model can be used, the tweets must be vectorized, because the algorithm to be used does not work with words but with matrices.
A CountVectorizer will be used. To do so, the data is first split into training and test sets. The vectorizer is then fit on the training set, and the test set is transformed using the dictionary created from the training set.
For the Vectorizer to be performed, the train and test data set need to be lists of words. 

First the data that has already been cleaned in the exploration step is loaded.

In [2]:
# Read the tweets that were already cleaned during the exploration step.
data = pd.read_csv(filepath_or_buffer="data/processed_train.csv")

In [3]:
# Shuffle the rows with a fixed seed, so the order is random but reproducible
# before any splitting or resampling takes place.
data = shuffle(data, random_state=1)

### 1. Not balancing the data

Splitting the data into train and test datasets.

In [10]:
# Hold out 20% of the rows for testing. The seed is fixed so that the results are
# reproducible and every balancing technique works on the same train/test partition,
# which keeps the comparison between techniques unbiased.
train, test = train_test_split(data, random_state=1, test_size=0.2)

In [11]:
# Show how many tweets of each label ended up in the test and training splits.
print("Labels for testing dataset:\n", test["label"].value_counts(), "\n")
print("Labels for training dataset:\n", train["label"].value_counts())

Labels for testing dataset:
 0    5955
1     438
Name: label, dtype: int64 

Labels for training dataset:
 0    23765
1     1804
Name: label, dtype: int64


Now the datasets will be transformed into lists so they can be used as parameters for the vectorizer. 

The labels are separated from the tweets.

In [12]:
# Separate the tweet text (x) from the labels (y) and turn both into plain lists,
# which is the input form handed to the vectorizer.
train_x, train_y = list(train["processed_tweet"]), list(train["label"])
test_x, test_y = list(test["processed_tweet"]), list(test["label"])

Now a vectorizer will be fit using a dictionary of 5.000 words. It will be fit using the training data set and then will be used to transform the test dataset.

In [13]:
# Cap on the number of words the vectorizer keeps in its dictionary.
vocabulary_size = 5000
vectorizer = CountVectorizer(max_features=vocabulary_size)

# Fit on the training tweets only; the test tweets are transformed with the
# dictionary learned from the training set.
train_counts = vectorizer.fit_transform(train_x)
test_counts = vectorizer.transform(test_x)

# Convert both count matrices to arrays with .toarray().
features_train, features_test = train_counts.toarray(), test_counts.toarray()

# Word -> column index mapping learned during the fit.
vocabulary = vectorizer.vocabulary_

In [14]:
# From here on the X variables no longer hold processed tweets but arrays of numbers.
train_x, test_x = features_train, features_test

A validation set will be created out of the training set so it can be passed to the algorithm in sagemaker. 

In [15]:
# The validation set takes the first 20% of the rows of the whole training set.
len_val = int(len(train_x) * 0.2)

# Features: rows before the cut go to validation, the remaining rows stay in training.
val_x, train_x = pd.DataFrame(train_x[:len_val]), pd.DataFrame(train_x[len_val:])

# Labels are cut at the same position so they stay aligned with the features.
val_y, train_y = pd.DataFrame(train_y[:len_val]), pd.DataFrame(train_y[len_val:])

A first directory will be created for the prepared data. Several more will be created, one for each balancing technique.

In [16]:
# Tag of this data version; it names the output folder, the S3 prefix and the
# variable written to data/s3_folders.py.
version = 'unbalanced'

In [17]:
# Output folder for this version of the prepared data (one folder per balancing technique).
data_dir = f"data_prepared_{version}"

# exist_ok=True makes the call a no-op when the folder already exists, so the cell
# can be re-run without a separate check-then-create step.
os.makedirs(data_dir, exist_ok=True)

In [18]:
# Every split is written to the newly created folder, without header row or index column.
csv_options = {"header": False, "index": False}

# The test features and the test labels go to two separate files.
pd.DataFrame(test_x).to_csv(os.path.join(data_dir, 'test.csv'), **csv_options)
pd.DataFrame(test_y).to_csv(os.path.join(data_dir, 'test_y.csv'), **csv_options)

# Validation and training files hold the label as first column, followed by the features.
pd.concat([val_y, val_x], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), **csv_options)
pd.concat([train_y, train_x], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), **csv_options)

In [19]:
# The prepared data is now on disk, so drop the references to the large in-memory objects.
# The names must be the lowercase ones used in the cells above (train_x, not train_X);
# assigning None to differently-cased names would only create new variables and free nothing.
test_x = train_x = val_x = train_y = val_y = None

# The arrays produced by the vectorizer are still reachable through these names as well.
features_train = features_test = None

Finally, the training, validation and test (the X part) datasets will be uploaded to S3 storage. From there they will be read during the training of the model.

In [20]:
import sagemaker  # used to upload the prepared files to S3

# Keep a handle on the current SageMaker session.
session = sagemaker.Session()

# S3 prefix (folder) under which the files of this version are stored.
prefix = f'twitter_sentiment_{version}'

# Upload the files in the order test, validation, train. The value returned by each
# upload is kept as the location of that file so it can be referenced later.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

# Text of a Python assignment: a dict of locations bound to a variable named after the version.
locations = f"{version} = {{'test': '{test_location}','train': '{train_location}','val':'{val_location}'}}"

# Mode "w" starts data/s3_folders.py from scratch, so other modules can read the locations from it.
with open("data/s3_folders.py", "w") as handle:
    handle.write(locations + "\n")

### 2. Undersampling technique.

In [21]:
# Redo the 80/20 split with the same seed, so the undersampling experiment starts from
# the same train/test partition as the other balancing techniques and stays comparable.
train, test = train_test_split(data, random_state=1, test_size=0.2)

In [22]:
# Label counts of both splits before any balancing is applied.
print("Labels for testing dataset:\n", test["label"].value_counts(), "\n")
print("Labels for training dataset:\n", train["label"].value_counts())

Labels for testing dataset:
 0    5955
1     438
Name: label, dtype: int64 

Labels for training dataset:
 0    23765
1     1804
Name: label, dtype: int64


Now, for the balancing, we start by dividing the training data into the different classes.

In [23]:
# Split the training rows by class: label 0 -> normal, label 1 -> violent.
normal = train.loc[train["label"] == 0]
violent = train.loc[train["label"] == 1]

In [24]:
# Undersample the normal tweets down to the number of violent tweets.
normal_underSampled = resample(
    normal,
    n_samples=len(violent),  # match the size of the violent class
    replace=False,           # no replacement: when downsizing, no observation should repeat
    random_state=1,          # fixed seed for reproducible results
)

In [25]:
# Join both classes and shuffle the rows with a fixed seed. pd.concat places all violent
# tweets first and all normal tweets after them; since the validation set is later cut
# from the first 20% of the rows, an unshuffled frame would give a validation set that
# contains a single class.
train = pd.concat([violent, normal_underSampled]).sample(frac=1, random_state=1)
print("New labels for training dataset:\n",train.label.value_counts())

New labels for training dataset:
 1    1804
0    1804
Name: label, dtype: int64


Now that the datasets are balanced, they will be transformed into lists so they can be used as parameters for the vectorizer. 

The labels are separated from the tweets.

In [26]:
# Tweets (x) and labels (y) of each split as plain lists, ready for the vectorizer.
train_x, train_y = list(train["processed_tweet"]), list(train["label"])
test_x, test_y = list(test["processed_tweet"]), list(test["label"])

Now a vectorizer will be fit using a dictionary of 5.000 words. It will be fit using the training data set and then will be used to transform the test dataset.

In [27]:
# Cap on the number of words the vectorizer keeps in its dictionary.
vocabulary_size = 5000
vectorizer = CountVectorizer(max_features=vocabulary_size)

# The dictionary is learned from the (undersampled) training tweets only and then
# reused to transform the test tweets.
train_counts = vectorizer.fit_transform(train_x)
test_counts = vectorizer.transform(test_x)

# Convert both count matrices to arrays with .toarray().
features_train, features_test = train_counts.toarray(), test_counts.toarray()

# Word -> column index mapping learned during the fit.
vocabulary = vectorizer.vocabulary_

In [28]:
# The X variables now hold arrays of numbers instead of processed tweets.
train_x, test_x = features_train, features_test

A validation set will be created out of the training set so it can be passed to the algorithm in sagemaker. 

In [29]:
# Size of the validation set: 20% of the whole training set.
len_val = int(len(train_x) * 0.2)

# The first len_val feature rows become the validation set; the rest remain for training.
val_x, train_x = pd.DataFrame(train_x[:len_val]), pd.DataFrame(train_x[len_val:])

# The labels are cut at the same position to keep them aligned with the features.
val_y, train_y = pd.DataFrame(train_y[:len_val]), pd.DataFrame(train_y[len_val:])

A directory will be created for the prepared data of this balancing technique.

In [30]:
# Tag of the undersampled data version; reused for the folder name and the S3 prefix.
version = 'underSample'

In [31]:
# Output folder for this version of the prepared data (one folder per balancing technique).
data_dir = f"data_prepared_{version}"

# exist_ok=True makes the call a no-op when the folder already exists, so the cell
# can be re-run without a separate check-then-create step.
os.makedirs(data_dir, exist_ok=True)

In [32]:
# Every split is written to the folder of this version, without header row or index column.
csv_options = {"header": False, "index": False}

# The test features and the test labels go to two separate files.
pd.DataFrame(test_x).to_csv(os.path.join(data_dir, 'test.csv'), **csv_options)
pd.DataFrame(test_y).to_csv(os.path.join(data_dir, 'test_y.csv'), **csv_options)

# Validation and training files hold the label as first column, followed by the features.
pd.concat([val_y, val_x], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), **csv_options)
pd.concat([train_y, train_x], axis=1).to_csv(os.path.join(data_dir, 'train.csv'), **csv_options)

In [33]:
# The prepared data is now on disk, so drop the references to the large in-memory objects.
# The names must be the lowercase ones used in the cells above (train_x, not train_X);
# assigning None to differently-cased names would only create new variables and free nothing.
test_x = train_x = val_x = train_y = val_y = None

# The arrays produced by the vectorizer are still reachable through these names as well.
features_train = features_test = None

Finally, the training, validation and test (the X part) datasets will be uploaded to S3 storage. From there they will be read during the training of the model.

In [34]:
import sagemaker  # used to upload the prepared files to S3

# Keep a handle on the current SageMaker session.
session = sagemaker.Session()

# S3 prefix (folder) under which the files of this version are stored.
prefix = f'twitter_sentiment_{version}'

# Upload the files in the order test, validation, train. The value returned by each
# upload is kept as the location of that file so it can be referenced later.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

# Text of a Python assignment: a dict of locations bound to a variable named after the version.
locations = f"{version} = {{'test': '{test_location}','train': '{train_location}','val':'{val_location}'}}"

# Mode "a+" appends this version's line to whatever data/s3_folders.py already contains.
with open("data/s3_folders.py", "a+") as handle:
    handle.write(locations + "\n")

### 3. OverSampling technique.

In [35]:
# Redo the 80/20 split with the same seed, so the oversampling experiment starts from
# the same train/test partition as the other balancing techniques and stays comparable.
train, test = train_test_split(data, random_state=1, test_size=0.2)

In [36]:
# Label counts of both splits before any balancing is applied.
print("Labels for testing dataset:\n", test["label"].value_counts(), "\n")
print("Labels for training dataset:\n", train["label"].value_counts())

Labels for testing dataset:
 0    5955
1     438
Name: label, dtype: int64 

Labels for training dataset:
 0    23765
1     1804
Name: label, dtype: int64


Now, for the balancing, we start by dividing the training data into the different classes.

In [37]:
# Split the training rows by class: label 0 -> normal, label 1 -> violent.
normal = train.loc[train["label"] == 0]
violent = train.loc[train["label"] == 1]

In [38]:
# Oversample the violent tweets up to the number of normal tweets.
violent_overSampled = resample(
    violent,
    n_samples=len(normal),  # match the size of the normal class
    replace=True,           # with replacement: the same observations must be drawn several times
    random_state=1,         # fixed seed for reproducible results
)

In [39]:
# Join both classes and shuffle the rows with a fixed seed. pd.concat places all normal
# tweets first and all oversampled violent tweets after them; since the validation set is
# later cut from the first 20% of the rows, an unshuffled frame would give a validation
# set that contains a single class.
# NOTE(review): the violent tweets were sampled with replacement before this point, so
# copies of the same tweet can still end up in both the training and the validation part.
train = pd.concat([normal, violent_overSampled]).sample(frac=1, random_state=1)
print("New labels for training dataset:\n",train.label.value_counts())

New labels for training dataset:
 1    23765
0    23765
Name: label, dtype: int64


Now that the datasets are balanced, they will be transformed into lists so they can be used as parameters for the vectorizer. 

The labels are separated from the tweets.

In [40]:
# Tweets (x) and labels (y) of each split as plain lists, ready for the vectorizer.
train_x, train_y = list(train["processed_tweet"]), list(train["label"])
test_x, test_y = list(test["processed_tweet"]), list(test["label"])

Now a vectorizer will be fit using a dictionary of 5.000 words. It will be fit using the training data set and then will be used to transform the test dataset.

In [41]:
# Cap on the number of words the vectorizer keeps in its dictionary.
vocabulary_size = 5000
vectorizer = CountVectorizer(max_features=vocabulary_size)

# The dictionary is learned from the (oversampled) training tweets only and then
# reused to transform the test tweets.
train_counts = vectorizer.fit_transform(train_x)
test_counts = vectorizer.transform(test_x)

# Convert both count matrices to arrays with .toarray().
features_train, features_test = train_counts.toarray(), test_counts.toarray()

# Word -> column index mapping learned during the fit.
vocabulary = vectorizer.vocabulary_

In [42]:
# The X variables now hold arrays of numbers instead of processed tweets.
train_x, test_x = features_train, features_test

A validation set will be created out of the training set so it can be passed to the algorithm in sagemaker. 

In [43]:
# Size of the validation set: 20% of the whole training set.
len_val = int(len(train_x) * 0.2)

# The first len_val feature rows become the validation set; the rest remain for training.
val_x, train_x = pd.DataFrame(train_x[:len_val]), pd.DataFrame(train_x[len_val:])

# The labels are cut at the same position to keep them aligned with the features.
val_y, train_y = pd.DataFrame(train_y[:len_val]), pd.DataFrame(train_y[len_val:])

A directory will be created for the prepared data of this balancing technique.

In [44]:
# Tag of the oversampled data version; reused for the folder name and the S3 prefix.
version = 'overSample'

In [45]:
# Output folder for this version of the prepared data (one folder per balancing technique).
data_dir = f"data_prepared_{version}"

# exist_ok=True makes the call a no-op when the folder already exists, so the cell
# can be re-run without a separate check-then-create step.
os.makedirs(data_dir, exist_ok=True)

In [46]:
# The test and validation splits are written to the folder of this version,
# without header row or index column.
csv_options = {"header": False, "index": False}

# The test features and the test labels go to two separate files.
pd.DataFrame(test_x).to_csv(os.path.join(data_dir, 'test.csv'), **csv_options)
pd.DataFrame(test_y).to_csv(os.path.join(data_dir, 'test_y.csv'), **csv_options)

# The validation file holds the label as first column, followed by the features.
pd.concat([val_y, val_x], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), **csv_options)

In [47]:
# Training file: label as first column, then the features; no header row or index column.
train_csv = os.path.join(data_dir, 'train.csv')
pd.concat([train_y, train_x], axis=1).to_csv(train_csv, header=False, index=False)

In [48]:
# The prepared data is now on disk, so drop the references to the large in-memory objects.
# The names must be the lowercase ones used in the cells above (train_x, not train_X);
# assigning None to differently-cased names would only create new variables and free nothing.
test_x = train_x = val_x = train_y = val_y = None

# The arrays produced by the vectorizer are still reachable through these names as well.
features_train = features_test = None

Finally, the training, validation and test (the X part) datasets will be uploaded to S3 storage. From there they will be read during the training of the model.

In [49]:
import sagemaker  # used to upload the prepared files to S3

# Keep a handle on the current SageMaker session.
session = sagemaker.Session()

# S3 prefix (folder) under which the files of this version are stored.
prefix = f'twitter_sentiment_{version}'

# Upload the files in the order test, validation, train. The value returned by each
# upload is kept as the location of that file so it can be referenced later.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

# Text of a Python assignment: a dict of locations bound to a variable named after the version.
locations = f"{version} = {{'test': '{test_location}','train': '{train_location}','val':'{val_location}'}}"

# Mode "a+" appends this version's line to whatever data/s3_folders.py already contains.
with open("data/s3_folders.py", "a+") as handle:
    handle.write(locations + "\n")

### 4. Combined technique.

In [4]:
# Redo the 80/20 split with the same seed, so the combined experiment starts from
# the same train/test partition as the other balancing techniques and stays comparable.
train, test = train_test_split(data, random_state=1, test_size=0.2)

In [5]:
# Label counts of both splits before any balancing is applied.
print("Labels for testing dataset:\n", test["label"].value_counts(), "\n")
print("Labels for training dataset:\n", train["label"].value_counts())

Labels for testing dataset:
 0    5955
1     438
Name: label, dtype: int64 

Labels for training dataset:
 0    23765
1     1804
Name: label, dtype: int64


Now, for the balancing, we start by dividing the training data into the different classes.

In [6]:
# Split the training rows by class: label 0 -> normal, label 1 -> violent.
normal = train.loc[train["label"] == 0]
violent = train.loc[train["label"] == 1]

For the "combined" approach we "split the difference" between the size of the samples. For this we find the size from getting the difference between the samples, then adding half the difference to the length of the "violent" class.

In [7]:
# Target size per class: the violent count plus half of the gap to the normal count,
# truncated to an integer.
class_gap = len(normal) - len(violent)
sample_size = int(len(violent) + class_gap / 2)
sample_size

12784

In [8]:
# Oversample the violent tweets up to the common target size.
violent_overSampled = resample(
    violent,
    n_samples=sample_size,  # both classes end up with sample_size rows
    replace=True,           # with replacement: the same observations must be drawn several times
    random_state=1,         # fixed seed for reproducible results
)

# Undersample the normal tweets down to the same target size.
normal_underSampled = resample(
    normal,
    n_samples=sample_size,  # both classes end up with sample_size rows
    replace=False,          # no replacement: when downsizing, no observation should repeat
    random_state=1,         # fixed seed for reproducible results
)

In [9]:
# Join both classes and shuffle the rows with a fixed seed. pd.concat places all normal
# tweets first and all oversampled violent tweets after them; since the validation set is
# later cut from the first 20% of the rows, an unshuffled frame would give a validation
# set that contains a single class.
# NOTE(review): the violent tweets were sampled with replacement before this point, so
# copies of the same tweet can still end up in both the training and the validation part.
train = pd.concat([normal_underSampled, violent_overSampled]).sample(frac=1, random_state=1)
print("New labels for training dataset:\n",train.label.value_counts())

New labels for training dataset:
 1    12784
0    12784
Name: label, dtype: int64


Now that the datasets are balanced, they will be transformed into lists so they can be used as parameters for the vectorizer. 

The labels are separated from the tweets.

In [10]:
# Tweets (x) and labels (y) of each split as plain lists, ready for the vectorizer.
train_x, train_y = list(train["processed_tweet"]), list(train["label"])
test_x, test_y = list(test["processed_tweet"]), list(test["label"])

Now a vectorizer will be fit using a dictionary of 5.000 words. It will be fit using the training data set and then will be used to transform the test dataset.

In [11]:
# Cap on the number of words the vectorizer keeps in its dictionary.
vocabulary_size = 5000
vectorizer = CountVectorizer(max_features=vocabulary_size)

# The dictionary is learned from the (combined-balanced) training tweets only and then
# reused to transform the test tweets.
train_counts = vectorizer.fit_transform(train_x)
test_counts = vectorizer.transform(test_x)

# Convert both count matrices to arrays with .toarray().
features_train, features_test = train_counts.toarray(), test_counts.toarray()

# Word -> column index mapping learned during the fit.
vocabulary = vectorizer.vocabulary_

We will save the vocabulary in a JSON file so it can be accessed later to process new tweets.

In [20]:
import json

# Build a fresh word -> index dict with plain Python ints so json can serialize it.
# A new dict is created instead of rewriting the entries in place: `vocabulary` is the
# very object stored in vectorizer.vocabulary_, and int() (unlike .item()) also accepts
# values that are already Python ints, so the cell can be re-run.
vocabulary = {word: int(index) for word, index in vocabulary.items()}

# Mode "w" replaces any previous file; appending would leave several JSON documents
# in the same file, which can no longer be parsed as JSON.
with open("data/vocabulary.json", "w") as file:
    json.dump(vocabulary, file)

In [58]:
# The X variables now hold arrays of numbers instead of processed tweets.
train_x, test_x = features_train, features_test

A validation set will be created out of the training set so it can be passed to the algorithm in sagemaker. 

In [59]:
# Size of the validation set: 20% of the whole training set.
len_val = int(len(train_x) * 0.2)

# The first len_val feature rows become the validation set; the rest remain for training.
val_x, train_x = pd.DataFrame(train_x[:len_val]), pd.DataFrame(train_x[len_val:])

# The labels are cut at the same position to keep them aligned with the features.
val_y, train_y = pd.DataFrame(train_y[:len_val]), pd.DataFrame(train_y[len_val:])

A directory will be created for the prepared data of this balancing technique.

In [60]:
# Tag of the combined (over- plus undersampled) data version; reused for the folder
# name and the S3 prefix.
version = 'combined'

In [61]:
# Output folder for this version of the prepared data (one folder per balancing technique).
data_dir = f"data_prepared_{version}"

# exist_ok=True makes the call a no-op when the folder already exists, so the cell
# can be re-run without a separate check-then-create step.
os.makedirs(data_dir, exist_ok=True)

In [62]:
# The test and validation splits are written to the folder of this version,
# without header row or index column.
csv_options = {"header": False, "index": False}

# The test features and the test labels go to two separate files.
pd.DataFrame(test_x).to_csv(os.path.join(data_dir, 'test.csv'), **csv_options)
pd.DataFrame(test_y).to_csv(os.path.join(data_dir, 'test_y.csv'), **csv_options)

# The validation file holds the label as first column, followed by the features.
pd.concat([val_y, val_x], axis=1).to_csv(os.path.join(data_dir, 'validation.csv'), **csv_options)

In [63]:
# Training file: label as first column, then the features; no header row or index column.
train_csv = os.path.join(data_dir, 'train.csv')
pd.concat([train_y, train_x], axis=1).to_csv(train_csv, header=False, index=False)

In [64]:
# The prepared data is now on disk, so drop the references to the large in-memory objects.
# The names must be the lowercase ones used in the cells above (train_x, not train_X);
# assigning None to differently-cased names would only create new variables and free nothing.
test_x = train_x = val_x = train_y = val_y = None

# The arrays produced by the vectorizer are still reachable through these names as well.
features_train = features_test = None

Finally, the training, validation and test (the X part) datasets will be uploaded to S3 storage. From there they will be read during the training of the model.

In [65]:
import sagemaker  # used to upload the prepared files to S3

# Keep a handle on the current SageMaker session.
session = sagemaker.Session()

# S3 prefix (folder) under which the files of this version are stored.
prefix = f'twitter_sentiment_{version}'

# Upload the files in the order test, validation, train. The value returned by each
# upload is kept as the location of that file so it can be referenced later.
test_location, val_location, train_location = (
    session.upload_data(os.path.join(data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('test.csv', 'validation.csv', 'train.csv')
)

# Text of a Python assignment: a dict of locations bound to a variable named after the version.
locations = f"{version} = {{'test': '{test_location}','train': '{train_location}','val':'{val_location}'}}"

# Mode "a+" appends this version's line to whatever data/s3_folders.py already contains.
with open("data/s3_folders.py", "a+") as handle:
    handle.write(locations + "\n")