In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
# Read the counsel_chat CSV into a DataFrame.
# NOTE(review): absolute /content path — presumably a Colab upload location;
# adjust when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/20200325_counsel_chat.csv')

In [25]:
# Keep only the question text and its topic, under the column names the rest
# of the notebook uses ('q_text' for the input, 'label' for the class).
model_df = pd.DataFrame({'q_text': df.questionText, 'label': df.topic})
model_df.head()

Unnamed: 0,q_text,label
0,I'm going through some things with my feelings...,depression
1,I'm going through some things with my feelings...,depression
2,I'm going through some things with my feelings...,depression
3,I'm going through some things with my feelings...,depression
4,I'm going through some things with my feelings...,depression


In [26]:
from imblearn.over_sampling import RandomOverSampler

# Balance the classes by randomly duplicating rows of the under-represented
# topics. The seed is fixed so the same rows are duplicated on every run.
# NOTE(review): the train/test split is done after this step, so copies of the
# same question can end up in both splits — consider splitting first and
# oversampling only the training part.
ros = RandomOverSampler(random_state=42)

# The sampler wants a 2-D feature matrix, hence the single-column reshape of
# the text. The target is passed as a plain 1-D array.
x_ros, y_ros = ros.fit_resample(
    np.array(model_df['q_text']).reshape(-1, 1),
    np.array(model_df['label']),
)

In [27]:
# Number of rows after oversampling (text column flattened to 1-D).
x_ros.ravel().shape

(10230,)

In [28]:
# Rebuild a two-column frame from the resampled arrays; the text is flattened
# back from its single-column 2-D form before being stored.
model_df_os = pd.DataFrame({'q_text': x_ros.reshape(-1)}).assign(label=y_ros)
model_df_os.head()

Unnamed: 0,q_text,label
0,I'm going through some things with my feelings...,depression
1,I'm going through some things with my feelings...,depression
2,I'm going through some things with my feelings...,depression
3,I'm going through some things with my feelings...,depression
4,I'm going through some things with my feelings...,depression


In [29]:
from sklearn.preprocessing import LabelEncoder

# Learn the topic-name -> integer mapping, then encode every row with it.
# `le` is kept around so the integer codes can be mapped back to names later.
le = LabelEncoder().fit(model_df_os.label)
labels = le.transform(model_df_os.label)

In [30]:
# Replace the topic names with their integer codes and confirm the frame's size.
model_df_os = model_df_os.assign(label=labels)
print(model_df_os.shape)
model_df_os.head()

(10230, 2)


Unnamed: 0,q_text,label
0,I'm going through some things with my feelings...,6
1,I'm going through some things with my feelings...,6
2,I'm going through some things with my feelings...,6
3,I'm going through some things with my feelings...,6
4,I'm going through some things with my feelings...,6


In [31]:
from sklearn.model_selection import train_test_split as tts

# Hold out 30% of the oversampled rows for testing; the seed keeps the split
# reproducible.
model_x_train, model_x_test, model_y_train, model_y_test = tts(
    np.array(model_df_os.q_text),
    np.array(model_df_os.label),
    test_size=0.3,
    random_state=42,
)

In [32]:
# Wrap each split's text and label arrays in its own two-column frame.
train_df = pd.DataFrame({'q_text': model_x_train, 'label': model_y_train})
test_df = pd.DataFrame({'q_text': model_x_test, 'label': model_y_test})

In [33]:
# Size and first five rows of the training frame.
print(train_df.shape)
train_df.head(n=5)


(7161, 2)


Unnamed: 0,q_text,label
0,I have been having horrendous nightmares this ...,24
1,I would like to be able to have more positive ...,25
2,I have been having horrendous nightmares this ...,24
3,"Every time my partner gets angry for anything,...",8
4,I've been bullied for years and the teachers h...,4


In [34]:
# Same check as the cell before it: training frame size and first five rows.
print(train_df.shape)
train_df.iloc[:5]

(7161, 2)


Unnamed: 0,q_text,label
0,I have been having horrendous nightmares this ...,24
1,I would like to be able to have more positive ...,25
2,I have been having horrendous nightmares this ...,24
3,"Every time my partner gets angry for anything,...",8
4,I've been bullied for years and the teachers h...,4


In [35]:
# Number of training rows.
len(train_df)

7161

In [36]:
# Every row carries the same prefix string, so build one list per split by
# repeating it once for each row.
prefix_train = ['multi-class classification'] * train_df.shape[0]
prefix_test = ['multi-class classification'] * test_df.shape[0]

In [37]:
# Put the training frame in the prefix / input_text / target_text layout.
# Built as a fresh frame rather than with insert(): insert raises ValueError
# when the 'prefix' column already exists, so the cell would fail if re-run.
train_df = (
    train_df
    .rename(columns={'q_text': 'input_text', 'label': 'target_text'})
    .assign(prefix=prefix_train)
    [['prefix', 'input_text', 'target_text']]
)

print(train_df.shape)
train_df.head()

(7161, 3)


Unnamed: 0,prefix,input_text,target_text
0,multi-class classification,I have been having horrendous nightmares this ...,24
1,multi-class classification,I would like to be able to have more positive ...,25
2,multi-class classification,I have been having horrendous nightmares this ...,24
3,multi-class classification,"Every time my partner gets angry for anything,...",8
4,multi-class classification,I've been bullied for years and the teachers h...,4


In [38]:
# Put the test frame in the prefix / input_text / target_text layout.
# Built as a fresh frame rather than with insert(): insert raises ValueError
# when the 'prefix' column already exists, so the cell would fail if re-run.
test_df = (
    test_df
    .rename(columns={'q_text': 'input_text', 'label': 'target_text'})
    .assign(prefix=prefix_test)
    [['prefix', 'input_text', 'target_text']]
)

print(test_df.shape)
test_df.head()

(3069, 3)


Unnamed: 0,prefix,input_text,target_text
0,multi-class classification,She has trouble falling and staying asleep and...,7
1,multi-class classification,I'm socially awkward. I've always want to be p...,2
2,multi-class classification,I was raped repeatedly when I was younger. I t...,2
3,multi-class classification,"This is my recovery, and I don't feel that it ...",0
4,multi-class classification,I have a relative who is in his twenties. He w...,0


In [39]:
# The targets are generated as text, so store the integer codes as strings.
train_df = train_df.astype({'target_text': str})
test_df = test_df.astype({'target_text': str})

In [40]:
# Confirm the conversion: the first target should now be a str.
type(train_df.target_text.loc[0])

str

In [41]:
pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [42]:
import logging
from simpletransformers.t5 import T5Model, T5Args

# Keep the "transformers" logger at WARNING so that only warnings and errors
# from it are shown, while the root logger is configured at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [43]:
# Training configuration for the T5 run.
t5_args = T5Args()
t5_args.manual_seed = 42                        # fixed seed so repeated runs are comparable
t5_args.num_train_epochs = 1                    # single pass over the training data
t5_args.no_save = True                          # do not write model checkpoints
t5_args.evaluate_generated_text = True          # evaluation also generates and decodes outputs
t5_args.evaluate_during_training = True         # run evaluation on eval_data while training
t5_args.evaluate_during_training_verbose = True

In [44]:
# Instantiate T5 from the "t5-base" checkpoint with the configuration above.
t5_model = T5Model(model_type="t5", model_name="t5-base", args=t5_args)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [45]:
# Fine-tune on the training frame; test_df is supplied as the evaluation set
# used during training.
t5_model.train_model(train_data=train_df, eval_data=test_df)

  0%|          | 0/7161 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/896 [00:00<?, ?it/s]



  0%|          | 0/3069 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


Generating outputs:   0%|          | 0/384 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/3069 [00:00<?, ?it/s]

(896,
 {'global_step': [896],
  'eval_loss': [0.16805690678012297],
  'train_loss': [0.001491546630859375]})

In [46]:
# Score the fine-tuned model on the held-out frame.
t5_result = t5_model.eval_model(eval_data=test_df)


  0%|          | 0/3069 [00:00<?, ?it/s]

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


Running Evaluation:   0%|          | 0/384 [00:00<?, ?it/s]

Generating outputs:   0%|          | 0/384 [00:00<?, ?it/s]

Decoding outputs:   0%|          | 0/3069 [00:00<?, ?it/s]

In [47]:
# Copy of the test frame without the gold labels, used to build the inputs
# for prediction.
t_df = test_df.drop('target_text', axis=1)

In [49]:
# Build one "prefix: input_text" string per test row, in row order.
test_array = [
    prefix + ': ' + text
    for prefix, text in zip(t_df.prefix, t_df.input_text)
]

In [51]:
# Spot-check one assembled "prefix: text" input string.
test_array[3]

"multi-class classification: This is my recovery, and I don't feel that it is okay for them to ask this. They told me it is policy due to the fact that I may run into a peer there. I am a peer counselor in the small community that I grew up in. I am in recovery myself. I was asked to come work for this organization after I was two years sober (I was in treatment in this organization). I work with mental health peers and run life skill groups. I don't work with substance peers."

In [52]:
# Generate a prediction for each assembled input string.
preds_test = t5_model.predict(to_predict=test_array)

Generating outputs:   0%|          | 0/384 [00:00<?, ?it/s]

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Decoding outputs:   0%|          | 0/3069 [00:00<?, ?it/s]

In [54]:
# Convert the predictions to integer class codes.
# NOTE(review): int() raises ValueError on any prediction that is not a
# number — confirm the model only ever emits digits.
preds_test = list(map(int, preds_test))
preds_test[0]

7

In [55]:
# Gold labels were stored as strings; turn them back into integer codes so
# they can be compared with the predictions.
original_test_labels = test_df.target_text.astype(int).tolist()
original_test_labels[0]

7

In [56]:
# Topic names in encoder order: position i holds the name for integer label i.
le.classes_

array(['addiction', 'anger-management', 'anxiety', 'behavioral-change',
       'children-adolescents', 'counseling-fundamentals', 'depression',
       'diagnosis', 'domestic-violence', 'eating-disorders',
       'family-conflict', 'grief-and-loss', 'human-sexuality', 'intimacy',
       'legal-regulatory', 'lgbtq', 'marriage', 'military-issues',
       'parenting', 'professional-ethics', 'relationship-dissolution',
       'relationships', 'self-esteem', 'self-harm', 'sleep-improvement',
       'social-relationships', 'spirituality', 'stress',
       'substance-abuse', 'trauma', 'workplace-relationships'],
      dtype=object)

In [57]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out rows.
# The full list of integer codes is passed as `labels` so that each row of the
# report is matched to its name in le.classes_ even if some class never shows
# up in the gold labels or the predictions. Without it, a missing class makes
# the names and the labels disagree.
print(
    classification_report(
        original_test_labels,
        preds_test,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    )
)

                          precision    recall  f1-score   support

               addiction       0.99      1.00      0.99        95
        anger-management       1.00      0.99      0.99        89
                 anxiety       0.79      0.76      0.78       106
       behavioral-change       0.98      0.98      0.98       120
    children-adolescents       1.00      1.00      1.00        89
 counseling-fundamentals       0.72      0.78      0.75        96
              depression       0.84      0.56      0.67       110
               diagnosis       0.98      1.00      0.99        90
       domestic-violence       0.97      1.00      0.99       113
        eating-disorders       1.00      1.00      1.00       107
         family-conflict       0.97      0.78      0.87        99
          grief-and-loss       0.96      1.00      0.98       108
         human-sexuality       0.98      1.00      0.99        93
                intimacy       0.94      0.62      0.75       104
        l