# Dataset Prep

The following section covers data preparation.

In [None]:
import pandas as pd
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')
from spacy.matcher import PhraseMatcher
matcher = PhraseMatcher(nlp.vocab)

In [None]:
train_df = pd.read_csv('https://raw.githubusercontent.com/Allenfp/DepressionDetectionNLP/master/final_training_data.txt', sep='\n')
train_df.head()

In [None]:
# Marker strings registered with the PhraseMatcher: 'sw' and 'cc' are the
# label codes that a later cell maps to 1 and 0 respectively.
suicide_phrases = ['sw']
casual_phrases = ['cc']
# Each phrase is run through the spaCy pipeline so the matcher receives Doc
# objects as patterns.
suicide_pattern = [nlp(text) for text in suicide_phrases]
casual_pattern = [nlp(text) for text in casual_phrases]
# Register one rule per class; the second positional argument (None) is the
# on-match callback slot.
# NOTE(review): this positional (key, on_match, *docs) form depends on the
# installed spaCy version - confirm it is still accepted.
matcher.add('SuicideWatch', None, *suicide_pattern)
matcher.add('CasualConversation', None, *casual_pattern)

In [None]:
# Split every raw line into its text and its trailing label code.
#
# The last two characters of a line are the label; the last three characters
# (the label plus the single character in front of it, presumably the
# separator - TODO confirm) are removed from the text.
#
# The frame is expected to hold a single text column at this point. The work
# is done with vectorised `.str` slicing and the result is assigned back to
# the frame, because writing through a selected column (`sent[idx] = ...`)
# is chained assignment: it warns in current pandas and has no effect on the
# frame once copy-on-write is enabled.
text_column = train_df.columns[0]
raw_lines = train_df[text_column]

labels = raw_lines.str[-2:].tolist()
train_df[text_column] = raw_lines.str[:-3]

train_df['Labels'] = labels

In [None]:
train_df

In [None]:
# Encode the label codes numerically: 'sw' -> 1, 'cc' -> 0.
# The result is assigned back to the column: calling `replace(...,
# inplace=True)` on a selected column operates on an intermediate object and
# is not guaranteed to update `train_df` (it is a no-op under copy-on-write).
train_df['Labels'] = train_df['Labels'].replace({
    'sw': 1,
    'cc': 0
})

# Data Format Prep

Convert the data to `.tsv` (columns: id, label, alpha, text), the layout that the model-training code later in this notebook reads.

In [None]:
train_df_bert = pd.DataFrame({
    'id': range(len(train_df)),
    'label': train_df['Labels'],
    'alpha': ['a']*train_df.shape[0],
    'text': train_df['combined\tsubreddit'].replace(r'\n', ' ', regex=True)
})

train_df_bert.head()

In [None]:
train_df_bert.to_csv('train.tsv', sep='\t', index=False, header=False)

# Env Prep
Setup the environment for BERT

For this step, it is important to add GPU or TPU on the notebook by going through the toolbar in this particular order `Edit > Notebook Settings > Add accelerator > GPU`

In [None]:
import tensorflow as tf

In [None]:
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print(f'Found GPU at: {device_name}')

# BERT Prep

## Preparation for BERT

This includes installing and importing dependencies, connecting to Google Drive, and loading locally stored files.

In [None]:
# install
!pip install pytorch-pretrained-bert pytorch-nlp
!pip install transformers

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

**Use the following part to load summarized text from local drive.**


---


* `train_summed.tsv`
>  Summarize all texts to min length of 100 with ratio of 0.5


* `train_summed(2).tsv`
> Summarize long texts to min length of 20 with ratio of 0.5


* `train_summed(4).tsv`
> Summarize long texts to min length of 100 with ratio of 0.5 
>> _Fails because it contains 13 texts that are still too long and 83 NaN values_


* `train_summed(5).tsv`
> Summarize long texts to min length of 20 with ratio of 0.4
>> _Fails as this contains 14 NaN_


* `train_summed(6).tsv`
> Summarize long texts to min length of 20 with ratio of 0.3
>> _Fails as this contains 14 NaN_


* `train_summed(7).tsv`
> Summarize long texts to min length of 20 with ratio of 0.3
>> _Fails as this contains 14 NaN_


* `train_summed(8).tsv`
> Summarize long texts to min length of 20 with ratio of 0.1

* `train_summed(9).tsv`
> Summarize long texts to min length of 20 with ratio of 0.9

---




In [None]:
from google.colab import files

In [None]:
uploaded = files.upload()

In [None]:
train_df_bert = pd.read_csv('train_summed(5).tsv', sep='\t', header=None)

In [None]:
train_df_bert.rename(columns={0: 'id', 1: 'label', 2: 'alpha', 3: 'text'}, inplace = True)

In [None]:
train_df_bert

In [None]:
train_df_bert = train_df_bert.dropna()

## Summarization

In [None]:
!pip install bert-extractive-summarizer
!pip install spacy
!pip install transformers==2.2.0

In [None]:
from summarizer import Summarizer

model = Summarizer()

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

In [None]:
# Replace every long text with its extractive summary.
#
# Iterate over the frame's actual index labels: after `dropna()` the index
# has gaps, so `range(len(train_df_bert))` combined with `.loc` raises
# KeyError on a dropped label and never reaches the last rows.
for idx in train_df_bert.index:
  print(idx)
  text = train_df_bert.loc[idx, 'text']
  # The threshold is applied to the number of characters (len of the string),
  # not to the number of tokens, so the message reports it as such.
  if len(text) > 510:
    print(f'Character count: {len(text)}')
    print(f"Pre = {text}")
    result = model(text, ratio = 0.9)
    train_df_bert.at[idx, 'text'] = result
    print(f"Post= {train_df_bert.loc[idx, 'text']}")

In [None]:
train_df_bert.to_csv('train_summed(9).tsv', sep='\t', index=False, header=False)

## Tokenization

This part includes tokenization using `BertTokenizer`

Note that some sentences in our dataset contain more than 512 tokens, so we offer several options for handling them:


*   Stopwords Removal
*   Slicing and Truncation of Texts



In [None]:
from transformers import BertTokenizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
nlp = English()

In [None]:
train_df_bert = train_df_bert.dropna()

In [None]:
# Plain Python list holding the label of every remaining row, in row order.
labels = list(train_df_bert.label.values)

In [None]:
train_df_bert

### Without Stopword Removal

In [None]:
# Encode every comment without removing stop words.
# `input_ids` receives the token-id list of each comment (special tokens
# included, no length limit applied here); `lengths` records how long each
# encoded comment is.
input_ids = []
lengths = []

for sentence in train_df_bert.text.values:
  token_ids = tokenizer.encode(sentence, add_special_tokens = True)
  input_ids.append(token_ids)
  lengths.append(len(token_ids))

print(f'{len(input_ids):<10} comments')

## Text Slicing and Truncation

According to the paper https://arxiv.org/pdf/1905.05583.pdf, there are 3 ways we can deal with long text for BERT.

*See Section 5.3 of the paper for details.

### First 512 Tokens

In [None]:
# Head-only truncation: for every sequence longer than 510 ids keep the
# first 509 ids plus the final id (the closing special token), giving 510.
#
# The list is rebuilt because re-binding the loop variable does not change
# the entries of `input_ids`; the tail slice is `[-1:]` (last id only), not
# `[:-1]` (everything but the last id).
input_ids = [
  ids[:509] + ids[-1:] if len(ids) > 510 else ids
  for ids in input_ids
]

### Combination of First and Last

In [None]:
print(f'{len(input_ids):<10} comments')
# Head+tail truncation: for every sequence longer than 510 ids keep the
# first 128 and the last 382 ids (510 in total).
# The list is rebuilt because re-binding the loop variable does not change
# the entries of `input_ids`.
input_ids = [
  ids[:128] + ids[-382:] if len(ids) > 510 else ids
  for ids in input_ids
]

### Last 512 Tokens

In [None]:
# Tail-only truncation: for every sequence longer than 510 ids keep the
# first id (the opening special token) plus the last 509 ids, giving 510.
#
# `ids[:1]` keeps the head as a one-element list; indexing with `ids[0]`
# yields an int, and `int + list` raises TypeError. The list is rebuilt
# because re-binding the loop variable does not change `input_ids`.
input_ids = [
  ids[:1] + ids[-509:] if len(ids) > 510 else ids
  for ids in input_ids
]

## Pad Texts

In [None]:
# Fixed sequence length of every row passed on to the model.
MAX_LEN = 512
# Shorter sequences are filled with 0 at the end (padding="post"); longer
# ones are cut at the end (truncating="post").
# NOTE(review): cutting at the end presumably removes the closing special
# token of any sequence that is still longer than MAX_LEN - confirm that the
# truncation step above has been applied first.
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", value=0, truncating="post", padding="post")

## Verification Point

The code in this section only verifies our data: it checks whether the data is already in a form that is compatible with BERT.

In [None]:
print(f'{np.sum(train_df_bert.label)} depressed')
print(f'{len(train_df_bert.label) - np.sum(train_df_bert.label)} casual')

In [None]:
print(f'Min length: {min(lengths)} tokens')
print(f'Max length: {max(lengths)} tokens')
print(f'Med length: {np.median(lengths)} tokens')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='darkgrid')
sns.set(font_scale=1.5)
plt.rcParams['figure.figsize'] = (10,5)

lengths = [min(l, 512) for l in lengths]
sns.distplot(lengths, kde=False, rug=False)

plt.title('Comment lengths')
plt.xlabel('Comment length')
plt.ylabel('# of comments')

In [None]:
num_truncated = lengths.count(512)

num_sentences = len(lengths)
prnct = float(num_truncated) / float(num_sentences)

print(f'{num_truncated} of {num_sentences} sentences ({prnct:.1%}) are longer than 512 tokens')

In [None]:
# Count, per class, the comments whose encoded length reaches the 512 limit.
num_sw = 0
num_cc = 0

for position, length in enumerate(lengths):
  if length >= 512:
    # `lengths` follows the row order of `train_df_bert`, so the label is
    # looked up by position. Looking it up with the length value itself, or
    # by index label on a frame with dropped rows, reads the wrong row.
    if train_df_bert.label.iloc[position] == 1:
      num_sw+=1
    else:
      num_cc+=1

print(f'{num_sw} comments contains depressed text, the rest {num_cc} are OK')

### Train Attention Mask

In [None]:
## KFold

# One mask row per input sequence: 1 where the token id is positive,
# 0 where it is the padding value 0.
attention_masks = [
  [int(token_id > 0) for token_id in sent]
  for sent in input_ids
]

# BERT BERT BERT

In [None]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

model.cuda()

In [None]:
torch.cuda.empty_cache()

In [None]:
# List the names and shapes of selected model parameters, grouped by the
# position they occupy in `named_parameters()`.
params = list(model.named_parameters())

print(f'The BERT model has {len(params)} different named parameters.\n')

parameter_sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for heading, section in parameter_sections:
    print(heading)
    for name, tensor in section:
        shape_text = str(tuple(tensor.size()))
        print(f"{name:<55} {shape_text:>12}")

In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

In [None]:
import math

from transformers import get_linear_schedule_with_warmup

# Linear warm-up / linear decay schedule for the K-fold training cell.
#
# The step count is derived from the data instead of `train_dataloader`,
# which does not exist yet at this point of the notebook (it is only created
# in the XLNet section further down).
epochs = 2
n_folds = 5       # must match KFold(n_splits=...) in the training cell
batch_size = 8    # must match `batch_size` in the training cell
warmup_ratio = 0.1

# Each fold trains on everything except one held-out part; the largest
# training split is used so the schedule never runs out of steps early.
fold_train_size = len(input_ids) - len(input_ids) // n_folds
steps_per_epoch = math.ceil(fold_train_size / batch_size)
total_steps = steps_per_epoch * epochs * n_folds

# `num_warmup_steps` is a number of optimizer steps, so the 0.1 is applied
# as a fraction of the total rather than passed through unchanged.
# NOTE(review): assumes the original 0.1 was meant as a 10% warm-up ratio -
# set `warmup_ratio = 0` to keep a schedule without warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(warmup_ratio * total_steps),
    num_training_steps=total_steps,
)

## Training

In [None]:
# Accuracy calculator
def flat_accuracy(preds, labels):
  """Return the fraction of rows whose highest-scoring class equals the label.

  preds:  2-D array of per-class scores, one row per example.
  labels: array of true class ids; flattened before the comparison.
  """
  predicted_classes = np.argmax(preds, axis=1).ravel()
  true_classes = labels.ravel()
  n_correct = np.sum(predicted_classes == true_classes)
  return n_correct / len(true_classes)

In [None]:
import time
import datetime

# Time counter
def format_time(elapsed):
  """Format a duration given in seconds as an `h:mm:ss` string.

  The value is rounded to the nearest whole second first.
  """
  return str(datetime.timedelta(seconds=int(round(elapsed))))

In [None]:
## K-fold cross-validated fine-tuning.
#
# For every fold the data is split into a training and a validation part;
# the model is then trained for `epochs` epochs and validated after each one.
# Per-epoch metrics are written to TensorBoard and collected in
# `training_stats`.
import random
from torch.utils.tensorboard import SummaryWriter
from sklearn.model_selection import KFold


writer = SummaryWriter()

# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128


# Set the seed value all over the place to make this reproducible.
epochs = 2
seed_val = 42
# accumulation_steps = 24
batch_size = 8
train_loss_set = []

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# One summary dict per (fold, epoch); turned into a table in a later cell.
training_stats = []
kfold = KFold(n_splits=5)

# Array forms of the labels and masks, so the index arrays produced by
# `kfold.split` select exactly the rows of each fold (no assumption that a
# fold is one contiguous slice).
labels_array = np.asarray(labels)
attention_masks_array = np.asarray(attention_masks)

# NOTE(review): `model`, `optimizer` and `scheduler` come from earlier cells
# and are reused for every fold, so from the second fold on the validation
# rows have already been seen during training. Re-create them per fold if
# independent fold scores are required.

# ========================================
#               Training
# ========================================

print("")
print('Training...')

for fold, (train_index, test_index) in enumerate(kfold.split(input_ids, attention_masks)):

  ### Dividing data into folds
  training_inputs = torch.tensor(input_ids[train_index])
  validation_inputs = torch.tensor(input_ids[test_index])
  training_labels = torch.tensor(labels_array[train_index])
  validation_labels = torch.tensor(labels_array[test_index])
  training_masks = torch.tensor(attention_masks_array[train_index])
  validation_masks = torch.tensor(attention_masks_array[test_index])

  train_data = TensorDataset(training_inputs, training_masks, training_labels)
  train_sampler = RandomSampler(train_data)
  train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
  validation_sampler = SequentialSampler(validation_data)
  validation_loader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

  print('\nFold number {} / {}'.format(fold + 1 , kfold.get_n_splits()))

  for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # Restart the timer and the running totals for every epoch. Keeping them
    # across epochs and folds makes the reported averages grow with every
    # epoch, because they are divided by the batch count of one epoch only.
    t0 = time.time()
    total_train_loss = 0
    total_train_accuracy = 0

    # `train()` only switches the mode (dropout etc. behave differently in
    # training and evaluation); it does not perform any training itself.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_loader):

      # Progress update every 40 batches.
      if step % 40 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_loader), elapsed))

      # `batch` holds three tensors: [0] input ids, [1] attention masks,
      # [2] labels. Copy each one to the training device.
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      # Clear gradients left over from the previous step; PyTorch
      # accumulates them otherwise.
      model.zero_grad()

      # Forward pass. Because `labels` is passed, the first output is the
      # loss and the second the logits.
      output = model(b_input_ids,
                     token_type_ids=None,
                     attention_mask=b_input_mask,
                     labels=b_labels)

      loss = output[0]
      logits = output[1]

      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      total_train_accuracy += flat_accuracy(logits, label_ids)

      # `loss` is a single-value tensor; `.item()` extracts the Python number.
      total_train_loss += loss.item()

      # Backward pass to calculate the gradients.
      loss.backward()

      print(f'Loss on step #{step}: {loss.item()}')
      train_loss_set.append(loss.item())

      # Clip the gradient norm to 1.0 to help against exploding gradients.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update the parameters, then the learning rate.
      optimizer.step()
      scheduler.step()

    # Averages over the batches of this epoch.
    avg_train_accuracy = total_train_accuracy / len(train_loader)
    avg_train_loss = total_train_loss / len(train_loader)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Average training accuracy: {0:.2f}".format(avg_train_accuracy))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # A step value that keeps increasing across folds, so a later fold does
    # not overwrite the points logged by an earlier one.
    global_step = fold * epochs + epoch_i

    writer.add_scalar('Loss/train', avg_train_loss, global_step)
    writer.add_scalar('Accuracy/train', avg_train_accuracy, global_step)

    # ========================================
    #               Validation
    # ========================================
    # After each training epoch, measure the performance on the validation
    # part of the current fold.

    print("")
    print("Running Validation...")

    t0 = time.time()

    # Evaluation mode: dropout layers behave differently than in training.
    model.eval()

    # Tracking variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    for batch in validation_loader:

      # Add batch to GPU and unpack it.
      batch = tuple(t.to(device) for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      # No gradients are needed for validation; skipping them saves memory
      # and time.
      with torch.no_grad():
        # `labels` is passed here as well, so the outputs are again
        # (loss, logits).
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)

      loss = outputs[0]
      logits = outputs[1]

      # Move logits and labels to CPU
      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.to('cpu').numpy()

      eval_loss += loss.item()

      # Accuracy of this batch, added to the running total.
      tmp_eval_accuracy = flat_accuracy(logits, label_ids)
      eval_accuracy += tmp_eval_accuracy

      # Track the number of batches
      nb_eval_steps += 1

    avg_val_loss = eval_loss/nb_eval_steps
    avg_val_accuracy = eval_accuracy/nb_eval_steps

    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    print("  Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

    writer.add_scalar('Loss/val', avg_val_loss, global_step)
    writer.add_scalar('Accuracy/val', avg_val_accuracy, global_step)

    training_stats.append(
        {
            'fold': fold + 1,
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Training Accur.': avg_train_accuracy,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy
        }
    )

# Flush the pending TensorBoard events to disk.
writer.close()

print("")
print("Training complete!")

In [None]:
!pip install tensorboard

In [None]:
%load_ext tensorboard

In [None]:
tensorboard --logdir runs --port=6007

In [None]:
!kill 827

In [None]:
import pandas as pd

# Display floats with six decimal places.
# The fully qualified option name is used: the short form 'precision' matches
# more than one option in newer pandas versions and raises an OptionError.
pd.set_option('display.precision', 6)

# Create a DataFrame from our training statistics.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

## Testing

### Control

In [None]:
from google.colab import files

In [None]:
uploaded = files.upload()

In [None]:
test_df_control = pd.read_csv('twitter_test(2).csv', sep='\t', encoding='unicode_escape')

In [None]:
df_test = test_df_control.rename(columns = {'0': 'tweet'}, inplace = False)

In [None]:
from sklearn.utils import shuffle
df_test = shuffle(df_test)

In [None]:
df_test

In [None]:
# Encode the test tweets and wrap them in a DataLoader for prediction.
# NOTE: this cell re-binds `labels`, `input_ids`, `attention_masks` and
# `batch_size`, which earlier cells used for the training data.
import pandas as pd

print('Number of test sentences: {:,}\n'.format(df_test.shape[0]))

# Create sentence and label lists
sentences = df_test.tweet.values
labels = df_test.label.values

# Tokenize all of the sentences and map the tokens to their word IDs.
input_ids = []
attention_masks = []

# For every sentence...
for sent in sentences:
    # `encode_plus` will:
    #   (1) Tokenize the sentence.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        sent,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded sentence to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors: the per-sentence tensors are joined along
# dimension 0.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Set the batch size.
batch_size = 32  

# Create the DataLoader. A sequential sampler keeps the rows in their
# original order.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prediction on the test set: run the model over `prediction_dataloader`,
# collect the logits and true labels of every batch, and report accuracy.

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []
test_accuracy = 0
nb_test_steps = 0

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions. No labels are passed
      # here, so the first output is read as the logits below.
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

  # Calculate the accuracy for this batch of test sentences.
  tmp_test_accuracy = flat_accuracy(logits, label_ids)
  
  # Accumulate the total accuracy.
  test_accuracy += tmp_test_accuracy

  nb_test_steps += 1



  # print(f'Temp: {tmp_test_accuracy}')
  # print(f'Curr: {test_accuracy}')

# The reported value is the mean of the per-batch accuracies, so a smaller
# final batch carries the same weight as a full one.
print(f'Accuracy: {test_accuracy/nb_test_steps}')
print('    DONE.')

In [None]:
# Combine the results across all batches. 
flat_predictions = np.concatenate(predictions, axis=0)

# For each sample, pick the label (0 or 1) with the higher score.
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [None]:
from sklearn.metrics import classification_report

In [None]:
class_names = ['Control', 'Depressed']

In [None]:
print(classification_report(flat_true_labels, flat_predictions, target_names=class_names, digits=4))

# XLNet2


In [None]:
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
!pip install pytorch-transformers
!pip install transformers
!pip install torch

In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


from pytorch_transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from pytorch_transformers import AdamW

from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

In [None]:
from google.colab import files
uploaded = files.upload()

In [None]:
df = pd.read_csv("train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [None]:
train_df = df.iloc[:3000]
test_df = df.iloc[3000:]

In [None]:
from sklearn.utils import shuffle

train_df = shuffle(train_df)
test_df = shuffle(test_df)

In [None]:
train_df.shape

In [None]:
train_df.sample(10)

In [None]:
train_df = train_df.dropna()
test_df = test_df.dropna()

In [None]:
train_sentences = train_df.sentence.values
test_sentences = test_df.sentence.values

In [None]:
# Append XLNet's separator and classification tokens to the end of every
# sentence. They are written exactly as the tokenizer defines them
# ("<sep>", "<cls>"): the upper-case spelling is not recognised as special
# tokens and would be split into ordinary word pieces.
train_sentences = [sentence + " <sep> <cls>" for sentence in train_sentences]
train_labels = train_df.label.values

test_sentences = [sentence + " <sep> <cls>" for sentence in test_sentences]
test_labels = test_df.label.values

In [None]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

In [None]:
train_tokenized_texts = [tokenizer.tokenize(sent) for sent in train_sentences]
test_tokenized_texts = [tokenizer.tokenize(sent) for sent in test_sentences]

In [None]:
train_tokenized_texts[0]

In [None]:
MAX_LEN = 512

In [None]:
# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
train_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in train_tokenized_texts]
test_input_ids = [tokenizer.convert_tokens_to_ids(x) for x in test_tokenized_texts]

In [None]:
# Pad our input tokens
train_input_ids = pad_sequences(train_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
test_input_ids = pad_sequences(test_input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
# Create attention masks: one row per padded sequence, holding 1.0 where the
# id is positive and 0.0 elsewhere (the zeros added by padding).
train_attention_masks = [
  [float(token_id > 0) for token_id in seq]
  for seq in train_input_ids
]

print(len(train_attention_masks))

test_attention_masks = [
  [float(token_id > 0) for token_id in seq]
  for seq in test_input_ids
]

In [None]:
# Use train_test_split to split our data into train and validation sets for training

training_inputs, validation_inputs, training_labels, validation_labels = train_test_split(train_input_ids, train_labels, 
                                                            random_state=2018, test_size=0.1)
training_masks, validation_masks, _, _ = train_test_split(train_attention_masks, train_input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model

training_inputs = torch.tensor(training_inputs)
validation_inputs = torch.tensor(validation_inputs)
training_labels = torch.tensor(training_labels)
validation_labels = torch.tensor(validation_labels)
training_masks = torch.tensor(training_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Select a batch size for training. For fine-tuning with XLNet, the authors
# recommend a batch size of 32, 48, or 128; a much smaller batch of 4 is used
# here to avoid memory issues.
# NOTE: the K-fold cell further down sets its own `batch_size` and builds its
# own loaders.
batch_size = 4

# Wrap the tensors in DataLoaders so that training and validation receive the
# data one batch at a time. The training loader draws rows in random order,
# the validation loader in their stored order.

train_data = TensorDataset(training_inputs, training_masks, training_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
# Load XLNEtForSequenceClassification, the pretrained XLNet model with a single linear classification layer on top. 

from transformers import XLNetForSequenceClassification
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels = 2)
model.cuda()

In [None]:
# Split the model parameters into two optimizer groups: parameters whose
# name contains one of the `no_decay` fragments get no weight decay, all
# others get a decay of 0.01.
#
# The per-group key must be `weight_decay`: that is the option AdamW reads.
# An unknown key such as `weight_decay_rate` is silently ignored, which
# leaves every parameter on the optimizer's default decay.
# NOTE(review): 'gamma' and 'beta' presumably target layer-norm parameters -
# confirm they match the parameter names of this model.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# This variable contains all of the hyperparemeter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters,
                     lr=5e-5)

In [None]:
import time
import datetime

# Time counter
def format_time(elapsed):
  """Turn a number of seconds into an `h:mm:ss` string (nearest second)."""
  whole_seconds = int(round(elapsed))
  duration = datetime.timedelta(seconds=whole_seconds)
  return str(duration)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Share of examples whose arg-max class matches the true label.

    preds is a 2-D array of class scores (one row per example); labels is an
    array of class ids that is flattened before comparing.
    """
    hits = np.argmax(preds, axis=1).ravel() == labels.ravel()
    return np.sum(hits) / len(labels.ravel())

In [None]:
from sklearn.model_selection import KFold

# Hyperparameters and bookkeeping shared by every fold.
total_acc = 0            # running sum of each fold's final-epoch validation accuracy
train_loss_set = []      # training loss of every batch, across all folds and epochs
val_loss = []            # mean validation loss, one entry per (fold, epoch)
val_acc = []             # mean validation accuracy, one entry per (fold, epoch)
epochs = 4
batch_size = 8
accumulation_steps = 24  # number of batches whose gradients are summed before an optimizer step
predictions = []         # validation logits, collected on the final epoch of each fold
true_labels = []         # labels matching `predictions`

kfold = KFold(n_splits=5)

# NOTE(review): `model` and `optimizer` are created once in earlier cells and are not reset here,
# so each fold continues training the same weights and later folds validate on rows the model
# has already been trained on. Re-create both at the top of every fold for independent estimates.
for fold, (train_index, test_index) in enumerate(kfold.split(train_input_ids, train_attention_masks)):

    ### Dividing data into folds
    training_inputs = torch.tensor(train_input_ids[train_index])
    validation_inputs = torch.tensor(train_input_ids[test_index])
    training_labels = torch.tensor(train_labels[train_index])
    validation_labels = torch.tensor(train_labels[test_index])
    # Masks are picked element by element for BOTH splits (previously the validation split
    # used a slice, which only matched the indices because the folds happen to be contiguous).
    training_masks = torch.tensor([train_attention_masks[j] for j in train_index])
    validation_masks = torch.tensor([train_attention_masks[j] for j in test_index])

    train_data = TensorDataset(training_inputs, training_masks, training_labels)
    train_sampler = RandomSampler(train_data)
    train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_loader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    for epoch in trange(epochs, desc='Epoch'):
        t0 = time.time()
        print('\nFold number {} / {}'.format(fold + 1, kfold.get_n_splits()))

        # ---- Training ----
        model.train()
        # Clear gradients once per accumulation window, NOT once per batch: zeroing on every
        # step (as before) discarded all but the last batch of each window.
        optimizer.zero_grad()
        num_batches = len(train_loader)
        for step, x_batch in enumerate(train_loader):
            # Add batch to GPU and unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in x_batch)

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs[0]
            train_loss_set.append(loss.item())

            # Scale the loss so the accumulated gradient is an average over the window.
            (loss / accumulation_steps).backward()

            # Step after a full window, and also after the last batch so that the
            # trailing partial window is not thrown away.
            if (step + 1) % accumulation_steps == 0 or (step + 1) == num_batches:
                optimizer.step()
                optimizer.zero_grad()

        # ---- Validation ----
        # Put model in evaluation mode to evaluate loss on the validation set
        model.eval()

        # Tracking variables
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps = 0
        is_last_epoch = (epoch == epochs - 1)

        for batch in validation_loader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            # No gradients are needed for validation
            with torch.no_grad():
                output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = output[0]
            logits = output[1]

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Every batch is scored (this bookkeeping previously sat outside the batch loop,
            # so only the last validation batch was ever counted).
            eval_loss += loss.item()
            eval_accuracy += flat_accuracy(logits, label_ids)
            nb_eval_steps += 1

            # Keep predictions only for the final epoch so each validation row appears
            # exactly once per fold in the classification report built from these lists.
            if is_last_epoch:
                predictions.append(logits)
                true_labels.append(label_ids)

        epoch_val_loss = eval_loss / max(nb_eval_steps, 1)
        epoch_val_acc = eval_accuracy / max(nb_eval_steps, 1)
        val_loss.append(epoch_val_loss)
        val_acc.append(epoch_val_acc)
        if is_last_epoch:
            total_acc += epoch_val_acc

        print("Validation Loss: {}".format(epoch_val_loss))
        print("Validation Accuracy: {}".format(epoch_val_acc))
        print("Epoch took: {}".format(format_time(time.time() - t0)))

print("Mean final-epoch validation accuracy over {} folds: {}".format(
    kfold.get_n_splits(), total_acc / kfold.get_n_splits()))

In [None]:
# Plot the per-batch training loss recorded in `train_loss_set`.
plt.figure(figsize=(15,8))
plt.plot(train_loss_set)
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Loss')
plt.show()

In [None]:
# Stack the per-batch logit arrays into one array (rows are samples).
stacked_logits = np.concatenate(predictions, axis=0)

# The predicted label of each sample is the index of its highest score.
flat_predictions = stacked_logits.argmax(axis=1).flatten()

# Stack the per-batch label arrays in the same order.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [None]:
from sklearn.metrics import classification_report

# Display names for label 0 and label 1, in that order.
class_names = ['Control', 'Depressed']

# Per-class precision/recall/F1 for the collected validation predictions, to 4 decimals.
print(classification_report(flat_true_labels, flat_predictions, target_names=class_names, digits=4))

In [None]:
from google.colab import files
# Open the Colab file-upload dialog; the test CSV read in the next cell is expected to be uploaded here.
uploaded = files.upload()

In [None]:
# Read the tab-separated Twitter test set; 'unicode_escape' is used to decode the file.
df_test = pd.read_csv('twitter_test(2).csv', sep='\t', encoding='unicode_escape')

In [None]:
from sklearn.utils import shuffle
# Shuffle the test rows with a fixed seed so the row order is the same on every run.
df_test = shuffle(df_test, random_state=0)

In [None]:
# Display the shuffled test frame.
df_test

In [None]:
# Make sure the fine-tuned model is on the GPU before running test-set prediction.
model.cuda()

In [None]:
# --- Build model inputs from the test tweets ---
# NOTE(review): the markers appended here are the literal strings "[SEP]" and "[CLS]";
# confirm they match the special tokens the tokenizer/training preprocessing expects.
sentences = [tweet + " [SEP] [CLS]" for tweet in df_test.tweet.values]
labels = df_test.label.values
tokenized_texts = [tokenizer.tokenize(text) for text in sentences]

MAX_LEN = 512
# Map tokens to vocabulary ids, then pad/truncate every sequence at the end to MAX_LEN.
input_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokenized_texts],
    maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Attention mask: 1.0 where the id is greater than 0, 0.0 elsewhere (the padded positions).
attention_masks = [[float(token_id > 0) for token_id in row] for row in input_ids]

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 8

# Serve the test examples in their stored order.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# --- Prediction on test set ---
model.eval()

# Per-batch logits and labels, both as numpy arrays.
predictions, true_labels = [], []

for batch in prediction_dataloader:
  # Move the batch to the device and unpack it.
  b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
  # No labels are passed, so the first output element holds the logits.
  with torch.no_grad():
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)[0]

  predictions.append(logits.detach().cpu().numpy())
  true_labels.append(b_labels.to('cpu').numpy())

In [None]:
# Display the raw per-batch logits collected above.
predictions

In [None]:
# Stack the per-batch logit arrays into one array (rows are samples).
stacked_logits = np.concatenate(predictions, axis=0)

# The predicted label of each sample is the index of its highest score.
flat_predictions = stacked_logits.argmax(axis=1).flatten()

# Stack the per-batch label arrays in the same order.
flat_true_labels = np.concatenate(true_labels, axis=0)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Display names for label 0 and label 1, in that order.
class_names = ['Control', 'Depressed']

In [None]:
# Per-class precision/recall/F1 on the Twitter test set, to 4 decimals.
print(classification_report(flat_true_labels, flat_predictions, target_names=class_names, digits=4))

# GloVe + CNN

In [None]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, Dense, Flatten, Concatenate, Input
from keras.layers.merge import concatenate
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
from google.colab import files

# Open the Colab upload dialog; the next cell reads 'train.tsv' and 'twitter_test(2).csv'.
uploaded = files.upload()

In [None]:
import pandas as pd

# train.tsv has no header row; name its four positional columns on load.
train_df = (
    pd.read_csv('train.tsv', sep='\t', header=None)
    .rename(columns={0: 'id', 1: 'label', 2: 'alpha', 3: 'text'})
)

# Twitter test set, tab-separated.
test_df = pd.read_csv('twitter_test(2).csv', sep='\t', encoding='unicode_escape')

In [None]:
# Training texts and their labels.
X = train_df['text']
Y = train_df['label']

# Test tweets and their labels.
X_test = test_df['tweet']
Y_test = test_df['label']

In [None]:
# Sum of the test labels (the count of positive rows if labels are 0/1 - TODO confirm encoding).
sum(Y_test)

In [None]:
# Preprocessing settings used by the tokenizer and padding cells below.
vocab_size = 18124  # passed to the Tokenizer as num_words
oov_token = "<OOV>"  # token the Tokenizer substitutes for out-of-vocabulary words
max_length = 2000  # sequences are padded/truncated to this many ids
padding_type = "post"
trunction_type='post'

In [None]:
# Fit a Keras word-level tokenizer on the training texts.
# NOTE(review): this rebinds `tokenizer`, the name earlier cells use to tokenize the XLNet
# inputs; re-running those cells after this one will pick up the Keras tokenizer instead.
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X)
# Mapping from word to integer id learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Convert the training texts to id sequences and pad/truncate them to max_length.
X_train_sequences = tokenizer.texts_to_sequences(X)
X_train_padded = pad_sequences(X_train_sequences,maxlen=max_length, padding=padding_type, 
                         truncating=trunction_type)

# Same conversion for the test tweets, using the tokenizer fitted on the training texts.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding=padding_type,
                        truncating=trunction_type)

In [None]:
from google.colab import drive
# Mount Google Drive; the GloVe file is read from it in the next cell.
drive.mount('/content/gdrive')

In [None]:
# Parse the GloVe text file into a dict: word -> float32 coefficient vector.
# Each line is split on whitespace: the first field is the word, the remaining fields
# are its coefficients.
embeddings_index = {}
# `with` guarantees the file is closed even if a line fails to parse.
with open('/content/gdrive/My Drive/Thesis Models/GloVe/glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
EMBEDDING_DIM = 100
# One row per tokenizer id (plus one extra row), initialised with uniform random values in [0, 1).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words found in GloVe get their pretrained vector. Words that are NOT found keep their
        # random initial row (they are not all-zeros, because the matrix is random-initialised).
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding layer initialised from the GloVe matrix; trainable=True lets training update the vectors.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=True)

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the three-channel CNN.
num_epochs = 4
seed = 7
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
fold_no = 1
acc_per_fold = []
loss_per_fold = []

for train, test in kfold.split(X_train_padded, Y):
  # Build a fresh embedding layer for every fold. The layer is trainable, so reusing one
  # shared instance would carry weights learned on earlier folds into later ones.
  fold_embedding = Embedding(len(word_index) + 1,
                             EMBEDDING_DIM,
                             weights=[embedding_matrix],
                             input_length=max_length,
                             trainable=True)
  # Input width is the padded sequence length (`max_length`); the name `length` used
  # previously is not defined by the preprocessing cells.
  inputs1 = Input(shape=(max_length,))
  embedding1 = fold_embedding(inputs1)
  drop1 = Dropout(0.2)(embedding1)
  # channel 1: kernel size 3
  conv1 = Conv1D(filters=128, kernel_size=3, activation='relu')(drop1)
  pool1 = MaxPooling1D(pool_size=3)(conv1)
  flat1 = Flatten()(pool1)
  # channel 2: kernel size 4
  conv2 = Conv1D(filters=128, kernel_size=4, activation='relu')(drop1)
  pool2 = MaxPooling1D(pool_size=2)(conv2)
  flat2 = Flatten()(pool2)
  # channel 3: kernel size 5
  conv3 = Conv1D(filters=128, kernel_size=5, activation='relu')(drop1)
  pool3 = MaxPooling1D(pool_size=5)(conv3)
  flat3 = Flatten()(pool3)
  # merge
  merged = concatenate([flat1, flat2, flat3])
  # interpretation
  dense1 = Dense(250, activation='relu')(merged)
  drop4 = Dropout(0.2)(dense1)
  outputs = Dense(1, activation='sigmoid')(drop4)
  model = Model(inputs=inputs1, outputs=outputs)
  model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')
  # `train`/`test` are positional indices, so the label Series is indexed with .iloc.
  history = model.fit(X_train_padded[train], Y.iloc[train], epochs=num_epochs, verbose=1)
  # evaluate the model on the held-out fold
  scores = model.evaluate(X_train_padded[test], Y.iloc[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])
  fold_no = fold_no + 1
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

In [None]:
# Predict on the padded id matrix; the model was trained on X_train_padded, not on raw text.
yhat_probs = model.predict(X_test_padded, verbose=0)
# Crisp classes: threshold the single sigmoid output at 0.5
# (an argmax over a one-column output is always 0).
yhat_classes = (yhat_probs > 0.5).astype("int32")

In [None]:
# reduce to 1d array (ravel also leaves an already-1-D array unchanged, so the cell can be re-run)
yhat_probs = yhat_probs.ravel()

In [None]:
from sklearn.metrics import classification_report
# Display names for label 0 and label 1, in that order.
class_names = ['Control', 'Depressed']

# The test labels are held in `Y_test` (capital Y).
print(classification_report(Y_test, (yhat_probs > 0.5).astype("int32"), target_names=class_names, digits=4))

In [None]:
# The CNN is built from a single Input tensor, so it takes one array (not a list of three).
score = model.evaluate(X_test_padded, Y_test, verbose=1)
print(score)

In [None]:
# predict probabilities for test set: one array for the single-input model, keeping every row
yhat_probs = model.predict(X_test_padded, verbose=0)
# predict crisp classes for test set by thresholding the sigmoid output at 0.5, one label per row
yhat_classes = (yhat_probs > 0.5).astype("int32").ravel()

In [None]:
# reduce to 1d array; np.ravel accepts 1-D and 2-D input alike, so re-running the cell is safe
yhat_probs = np.ravel(yhat_probs)

In [None]:
# Display the predicted classes.
yhat_classes

In [None]:
from sklearn.metrics import classification_report
# Display names for label 0 and label 1, in that order.
target_names = ['Normal', 'Depressed']
# Per-class precision/recall/F1 of the CNN on the test set, to 4 decimals.
print(classification_report(Y_test, yhat_classes, target_names=target_names, digits=4))

In [None]:
from keras.utils.vis_utils import plot_model
# Render the model architecture diagram to a PNG file.
# NOTE(review): the GloVe + LSTM section writes its diagram to the same 'rnn_model.png' path,
# so whichever cell runs last overwrites the other's file.
plot_model(model, to_file='rnn_model.png', show_shapes=True, show_layer_names=True)

# GloVe + LSTM

In [None]:
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Dense, Embedding, LSTM, Bidirectional
from sklearn.model_selection import train_test_split
import numpy as np

In [None]:
from google.colab import files

# Open the Colab upload dialog; the next cell reads 'train.tsv' and 'twitter_test(2).csv'.
uploaded = files.upload()

In [None]:
import pandas as pd

# train.tsv has no header row; name its four positional columns on load.
train_df = (
    pd.read_csv('train.tsv', sep='\t', header=None)
    .rename(columns={0: 'id', 1: 'label', 2: 'alpha', 3: 'text'})
)

# Twitter test set, tab-separated.
test_df = pd.read_csv('twitter_test(2).csv', sep='\t', encoding='unicode_escape')

In [None]:
# Training texts and their labels.
X = train_df['text']
Y = train_df['label']

# Test tweets and their labels.
X_test = test_df['tweet']
Y_test = test_df['label']

In [None]:
# Preprocessing settings used by the tokenizer and padding cells below.
vocab_size = 40000  # passed to the Tokenizer as num_words
oov_token = "<OOV>"  # token the Tokenizer substitutes for out-of-vocabulary words
max_length = 2000  # sequences are padded/truncated to this many ids
padding_type = "post"
trunction_type='post'

In [None]:
# Fit a Keras word-level tokenizer on the training texts (rebinds `tokenizer`).
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X)
# Mapping from word to integer id learned by the tokenizer.
word_index = tokenizer.word_index

In [None]:
# Convert the training texts to id sequences and pad/truncate them to max_length.
X_train_sequences = tokenizer.texts_to_sequences(X)
X_train_padded = pad_sequences(X_train_sequences,maxlen=max_length, padding=padding_type, 
                         truncating=trunction_type)

# Same conversion for the test tweets, using the tokenizer fitted on the training texts.
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding=padding_type,
                        truncating=trunction_type)

In [None]:
# Parse the GloVe text file into a dict: word -> float32 coefficient vector.
# Each line is split on whitespace: the first field is the word, the remaining fields
# are its coefficients.
embeddings_index = {}
# Open with an explicit UTF-8 encoding (matching the GloVe + CNN loader) instead of the
# platform default; `with` guarantees the file is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Spot-check: display the loaded vector for the word 'depression'.
embeddings_index['depression']

In [None]:
EMBEDDING_DIM = 100
# One row per tokenizer id (plus one extra row), initialised with uniform random values in [0, 1).
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words found in GloVe get their pretrained vector. Words that are NOT found keep their
        # random initial row (they are not all-zeros, because the matrix is random-initialised).
        embedding_matrix[i] = embedding_vector

In [None]:
# Embedding layer initialised from the GloVe matrix; trainable=False keeps the vectors fixed during training.
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=max_length,
                            trainable=False)

In [None]:
# embedding_dim = 16
# input_length = 512

In [None]:
# Bidirectional LSTM classifier on top of the GloVe embedding layer, ending in one sigmoid unit.
# NOTE(review): `Sequential` and `Dropout` are not imported by this section's import cell;
# they are only in scope because the GloVe + CNN section imported them.
model = Sequential([
  # Input(shape=(max_length,), dtype='int32'),
    embedding_layer,
    Dropout(0.2),
    Bidirectional(LSTM(50)),
    Dropout(0.2),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified cross-validation of the bidirectional LSTM.
num_epochs = 20
seed = 7
np.random.seed(seed)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
cvscores = []
fold_num = 0
for train, test in kfold.split(X_train_padded, Y):
    fold_num += 1
    print(f'Fold Number: {fold_num}')
    # Rebuild and recompile the network for every fold. Fitting one shared model across
    # folds (as before) meant later folds were evaluated on rows the model had already
    # been trained on, inflating the scores.
    model = Sequential([
        Embedding(len(word_index) + 1,
                  EMBEDDING_DIM,
                  weights=[embedding_matrix],
                  input_length=max_length,
                  trainable=False),
        Dropout(0.2),
        Bidirectional(LSTM(50)),
        Dropout(0.2),
        Dense(1, activation='sigmoid')])
    model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
    # Fit the model; `train`/`test` are positional indices, so the label Series uses .iloc.
    history = model.fit(X_train_padded[train], Y.iloc[train], epochs=num_epochs, verbose=1)
    # evaluate the model on the held-out fold
    scores = model.evaluate(X_train_padded[test], Y.iloc[test], verbose=1)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)
# Mean and standard deviation of the per-fold accuracy.
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

In [None]:
# Predict on the padded id matrix; the model was trained on X_train_padded, not on raw text.
yhat_probs = model.predict(X_test_padded, verbose=0)

In [None]:
# The test labels are held in `Y_test` (capital Y); probabilities are thresholded at 0.5.
print(classification_report(Y_test, (yhat_probs > 0.5).astype("int32"), target_names=class_names, digits=4))

In [None]:
# Evaluate the model on the padded test set; `score` holds the compiled loss and metric values.
score = model.evaluate(X_test_padded, Y_test, verbose=1)
print(score)

In [None]:
# predict probabilities for test set
yhat_probs = model.predict(X_test_padded, verbose=0)
# predict crisp classes for test set by thresholding the sigmoid output at 0.5.
# `predict_classes` is not available in current Keras releases (the GloVe + CNN section
# already had to comment it out); the result keeps the same 2-D layout as `yhat_probs`.
yhat_classes = (yhat_probs > 0.5).astype("int32")

In [None]:
# reduce to 1d array by keeping the first column of each 2-D result
# (re-running this cell fails once the arrays are already 1-D)
yhat_probs = yhat_probs[:, 0]
yhat_classes = yhat_classes[:, 0]

In [None]:
from sklearn.metrics import classification_report
# Display names for label 0 and label 1, in that order.
target_names = ['Normal', 'Depressed']
# The predictions live in `yhat_classes` (lower-case y); `Yhat_classes` was never defined.
print(classification_report(Y_test, yhat_classes, target_names=target_names, digits=4))

In [None]:
from keras.utils.vis_utils import plot_model
# Render the model architecture diagram to 'rnn_model.png'.
plot_model(model, to_file='rnn_model.png', show_shapes=True, show_layer_names=True)

# Sent2Vec

In [None]:
# Install the sent2vec package used for sentence embeddings below.
# NOTE(review): no version is pinned, so the installed release may change between runs.
!pip install sent2vec

In [None]:
from scipy import spatial
from sent2vec.vectorizer import Vectorizer

In [None]:
from google.colab import files

# Open the Colab upload dialog; the next cell reads the 'train_summed(...)' TSV files.
uploaded = files.upload()

In [None]:
import pandas as pd

# Two header-less, tab-separated files: `df` comes from 'train_summed(6).tsv'
# and `summed_df` from 'train_summed(5).tsv'.
df = pd.read_csv('train_summed(6).tsv', sep='\t', header=None)
summed_df = pd.read_csv('train_summed(5).tsv', sep='\t', header=None)

In [None]:
# Display 10 random rows of `df` (unseeded, so the rows differ between runs).
df.sample(10)

In [None]:
# NOTE(review): dropna() returns a new frame and neither result is assigned, so `summed_df`
# and `df` are left unchanged; only the last expression is displayed. Assigning the results
# would need care, because the comparison loop below pairs rows of the two frames by position.
summed_df.dropna()
df.dropna()

In [None]:
# Display 10 random rows of `summed_df` (unseeded, so the rows differ between runs).
summed_df.sample(10)

In [None]:
# Column 3 of each frame as a numpy array: `tweets` from `df`, `summarize` from `summed_df`.
tweets = df[3].values
summarize = summed_df[3].values

In [None]:
# Compare the shapes of the two frames.
print(f'Sentences: {df.shape} ++++ Summarize: {summed_df.shape}')

In [None]:
# Display the `tweets` array.
tweets

In [None]:
import numpy as np

# Embed each (summarize, tweets) pair that differs and accumulate their cosine distance.
vectorizer = Vectorizer()
total_dist = 0          # sum of the non-zero cosine distances
total_summarized = 0    # number of pairs that contributed to total_dist
summed6_vectors = []
summed5_vectors = []

# NOTE(review): `summarize` comes from 'train_summed(5).tsv' and `tweets` from
# 'train_summed(6).tsv', yet the first vector (from `summarize`) is stored in
# `summed6_vectors` and the second in `summed5_vectors` - confirm the names are not swapped.
# Rows are paired by position with enumerate/zip; the previous np.where lookup returned the
# FIRST occurrence of a text, so duplicated texts were compared against the wrong row.
for index, (sent, tweet) in enumerate(zip(summarize, tweets)):
  try:
    if sent != tweet:
      sentences = [
                  sent,
                  tweet
      ]
      vectorizer.bert(sentences)
      # assumes `vectorizer.vectors` holds exactly the two embeddings of this call - TODO confirm
      # for the installed sent2vec version
      vectors_bert = vectorizer.vectors
      summed6_vectors.append(vectors_bert[0])
      summed5_vectors.append(vectors_bert[1])

      dist = spatial.distance.cosine(vectors_bert[0], vectors_bert[1])
      if dist != 0:
        total_dist+=dist
        total_summarized+=1
      print(f'Cosine dist {index}: {dist}')
  except Exception as exc:
    # Keep going on bad rows, but report which row failed and why instead of hiding the cause.
    print(f'ERROR at row {index}: {exc!r}')

In [None]:
# Mean cosine distance over the pairs that were compared. When no pair contributed
# (total_summarized == 0) the average is undefined, so report NaN instead of dividing by zero.
if total_summarized:
  avg_cosine_dist = total_dist/total_summarized
else:
  avg_cosine_dist = float('nan')
print(f'Average cosine distance: {avg_cosine_dist}')

In [None]:
from google.colab import files

# Open the Colab upload dialog (later cells load previously saved '*_vectors.npy' files).
uploaded = files.upload()

In [None]:
# Persist the vectors collected in `summed5_vectors` to a .npy file.
np.save('newer_summed(5)_vectors.npy', summed5_vectors)

In [None]:
# Load two saved vector sets and check whether they are identical.
vector_5 = np.load('newer_summed(5)_vectors.npy')
vector_6 = np.load('new_summed(6)_vectors.npy')

# np.array_equal returns False for arrays of different shapes, whereas `==` followed by
# `.all()` fails when the shapes cannot be compared element-wise.
equal_arrays = np.array_equal(vector_5, vector_6)

print(equal_arrays)

In [None]:
import numpy as np
import pandas as pd
# Load the saved sentence vectors to analyse (the file name is edited by hand to switch data sets).
# sent_vectors = np.load('sent_vectors.npy')
summed_vectors = np.load('summed(9)_vectors.npy')

In [None]:
# Wrap the loaded vectors in a DataFrame (one row per vector).
# sentVec_df = pd.DataFrame(sent_vectors)
summedVec_df = pd.DataFrame(summed_vectors)
# sentences_df = pd.DataFrame(sentences_vec)

In [None]:
# Show the (rows, columns) shape of the vector frame.
# print(sentVec_df.shape)
print(summedVec_df.shape)
# print(sentences_df.shape)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Standardize every column of the vector frame with StandardScaler before computing the covariance.
standardized_data = StandardScaler().fit_transform(summedVec_df)
print(standardized_data.shape)

In [None]:
sample_data = standardized_data

# Covariance of the standardized columns: (X^T X) / n_samples. The sample count is read from
# the data instead of being hard-coded (it was fixed at 1577), so other vector files work too.
n_samples = sample_data.shape[0]
covar_matrix = (np.matmul(sample_data.T, sample_data))/n_samples
print(f'The shape of our covar matrix is: {covar_matrix.shape}')

In [None]:
from scipy.linalg import eigh

# Take the two largest eigenpairs of the covariance matrix. The indices are derived from the
# matrix size (previously hard-coded as 766 and 767) and passed through `subset_by_index`,
# the replacement for the deprecated `eigvals=` keyword.
n_features = covar_matrix.shape[0]
values, vectors = eigh(covar_matrix, subset_by_index=[n_features - 2, n_features - 1])
print(values.shape)
print(values)
print(f"Shape of eigen vectors: {vectors.shape}")

# One eigenvector per row after the transpose.
# NOTE(review): eigh returns eigenvalues in ascending order, so row 0 belongs to the
# second-largest eigenvalue and row 1 to the largest - check the '1st'/'2nd Principal'
# column labels used further down against that order.
vectors = vectors.T
print(f'Updated shape of eigen vectors: {vectors.shape}')

In [None]:
import matplotlib.pyplot as plt

# Project the standardized samples onto the selected eigenvectors.
new_coordinates = np.matmul(vectors, sample_data.T)

# Report the shapes involved in the projection.
print(f'Resultantat new data points\'s shape {vectors.shape}, X {sample_data.T.shape} = {new_coordinates.shape}')

In [None]:
# One-row placeholder frame with one (empty) label per sample; the width follows the number
# of samples instead of the hard-coded 1577.
labels = pd.DataFrame(index=range(1), columns=range(sample_data.shape[0]))

In [None]:
# NOTE(review): fillna returns a new frame and the result is not assigned, so `labels`
# itself is left unchanged; this line only displays the filled copy.
labels.fillna(9)

In [None]:
import pandas as pd

# Build the frame for this data set: the two projected coordinates of every sample plus a
# constant label (9). `new_coordinates` is left untouched so the cell can be re-run, and the
# coordinate columns stay numeric instead of becoming object-typed through the NaN label row.
dataframe_6 = pd.DataFrame(data = new_coordinates.T, columns=('1st Principal', '2nd Principal'))
dataframe_6['label'] = 9
# Show the frame that was just built (the previous print referenced `dataframe_2`).
print(dataframe_6.head())

In [None]:
import seaborn as sn
# Scatter the two principal coordinates, coloured by label. `height` is the current name of
# the FacetGrid argument that used to be called `size`.
# NOTE(review): `dataframe` is not assigned by any visible cell (the cells above build
# `dataframe_6`); confirm which frame is meant to be plotted.
sn.FacetGrid(dataframe, hue='label', height=6).map(plt.scatter, '1st Principal', '2nd Principal').add_legend()
plt.show()

In [None]:
# Drop the label so only the coordinate columns are used.
# NOTE(review): `dataframe` is not assigned by any visible cell - confirm which frame is meant.
pca_dataframe = dataframe.drop(columns='label')

from sklearn.decomposition import PCA
# Fit a 2-component whitened PCA and transform the same data with it.
pca = PCA(n_components=2, whiten=True).fit(pca_dataframe)
X_pca = pca.transform(pca_dataframe)
from sklearn.cluster import KMeans

# A single cluster: the fitted model has exactly one cluster center.
kmeans = KMeans(n_clusters=1).fit(X_pca)


# Map the cluster center back from PCA space to the original coordinate space.
centers = pca.inverse_transform(kmeans.cluster_centers_)
print(centers)

# Plot the single center (row 0).
plt.scatter(centers[0][0], centers[0][1])

In [None]:
import matplotlib.pyplot as plt

# Scatter the centroids and label each point.
fig, ax = plt.subplots()
ax.scatter(centers[:,0], centers[:,1])
# Display names, one per centroid row (the earlier 'Vanilla'/'Summed(N)' list was
# immediately overwritten by this one, so only this assignment is kept).
n = ['Original', '50%', '40%', '30%', '10%', '90%']

# zip stops at the shorter of the two sequences, so a `centers` array with fewer rows than
# names no longer raises an IndexError.
for txt, center in zip(n, centers):
    ax.annotate(txt, (center[0], center[1]))

In [None]:
from google.colab import files
# Open the Colab upload dialog (the following cells load saved .npy files).
uploaded = files.upload()

In [None]:
# Load the saved centroid vectors.
centroids = np.load('centroids_vectors.npy')

In [None]:
import numpy as np

# Load the saved sentence-vector arrays, one file per data set.
# NOTE(review): the plotting cell further down reuses the names ratio_1..ratio_6 for its
# scatter handles, which replaces some of these arrays once that cell has run.
ratio_1 = np.load('sent_vectors.npy')
ratio_2 = np.load('summed(2)_vectors.npy')
ratio_5 = np.load('new_summed(5)_vectors.npy')
ratio_6 = np.load('new_summed(6)_vectors.npy')
ratio_8 = np.load('summed(8)_vectors.npy')
ratio_9 = np.load('summed(9)_vectors.npy')

In [None]:
# Pull the two principal-coordinate columns out of each per-data-set frame for plotting.
# NOTE(review): of dataframe_1..dataframe_6 only dataframe_6 is built by a visible cell; the
# others presumably come from re-running the projection cells with other input files - confirm.
x1 = dataframe_1['1st Principal']
y1 = dataframe_1['2nd Principal']

x2 = dataframe_2['1st Principal']
y2 = dataframe_2['2nd Principal']

x3 = dataframe_3['1st Principal']
y3 = dataframe_3['2nd Principal']

x4 = dataframe_4['1st Principal']
y4 = dataframe_4['2nd Principal']

x5 = dataframe_5['1st Principal']
y5 = dataframe_5['2nd Principal']

x6 = dataframe_6['1st Principal']
y6 = dataframe_6['2nd Principal']

In [None]:
# Stack the six frames row-wise into one array (rebinds `vectors`, which earlier held the eigenvectors).
vectors = np.vstack((dataframe_1, dataframe_2, dataframe_3, dataframe_4, dataframe_5, dataframe_6))
# Display names of the six data sets.
n = ['Vanilla', 'Summed(2)', 'Summed(5)', 'Summed(6)', 'Summed(8)', 'Summed(9)']

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scatter the projected points of all six data sets on one set of axes.
fig, ax = plt.subplots(figsize=(16,16))

# (x, y, colour, legend label) for each data set, in legend order.
scatter_series = [
    (x1, y1, '#1f77b4', 'Vanilla'),
    (x2, y2, '#ff7f0e', '50%'),
    (x3, y3, '#2ca02c', '40%'),
    (x4, y4, '#d62728', '30%'),
    (x5, y5, '#9467bd', '10%'),
    (x6, y6, '#8c564b', '90%'),
]
# Keep the scatter handles in a local list rather than in ratio_1..ratio_6, so the vector
# arrays loaded under those names are not overwritten by plot objects.
scatter_handles = [ax.scatter(xs, ys, color=colour) for xs, ys, colour, _ in scatter_series]
ax.set_xlabel('1st Principal')
ax.set_ylabel('2nd Principal')
ax.set_title('scatter plot')

plt.legend(scatter_handles,
           [legend_label for _, _, _, legend_label in scatter_series],
           scatterpoints=1,
           loc='lower left',
           ncol=3,
           fontsize=8)
plt.show()

In [None]:
import numpy as np

# Load the saved centroid of each data set, one .npy file per set.
centroid_1 = np.load('centroid_sent.npy')
centroid_2 = np.load('centroid_summed(2).npy')
centroid_5 = np.load('centroid_summed(5).npy')
centroid_6 = np.load('centroid_summed(6).npy')
centroid_8 = np.load('centroid_summed(8).npy')
centroid_9 = np.load('centroid_summed(9).npy')

In [None]:
# Display names of the six data sets, in the same order as the stacked centroids below.
n = ['Vanilla', 'Summed(2)', 'Summed(5)', 'Summed(6)', 'Summed(8)', 'Summed(9)']
# Stack the six centroids row-wise into `centers`.
centers = np.vstack((centroid_1, centroid_2, centroid_5, centroid_6, centroid_8, centroid_9))

In [None]:
# Element-wise comparison of two of the loaded centroids.
centroid_9 == centroid_8