# Import

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 46.9 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.2 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 52.9 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

In [None]:
import pandas as pd
import numpy as np
import logging
from matplotlib import pyplot as plt

# Keep the transformers library at WARNING while everything else logs at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [None]:
from scipy.special import softmax
from google.colab import files

# Data preparation

## Torch dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It must
            contain an "input_ids" entry, which defines the dataset length.
        labels: Optional per-example labels. When omitted (or empty), items
            carry no "labels" entry, which is how the unlabelled pool is wrapped.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features (and label, if any) of example ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly: the truth value of a multi-element numpy
        # array or tensor is ambiguous and raises, so `if self.labels:` only
        # worked for plain Python sequences.
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" encodings."""
        return len(self.encodings["input_ids"])



## Metrics

In [None]:
def compute_metrics(p):
    """Score class predictions against the reference labels.

    Args:
        p: A (predictions, labels) pair. ``predictions`` holds one row of
            per-class scores per example; the arg-max along axis 1 is taken as
            the predicted class.

    Returns:
        dict with keys "accuracy", "precision", "recall" and "f1", each computed
        by the corresponding scikit-learn scorer with its default settings.
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        name: scorer(y_true=labels, y_pred=predicted)
        for name, scorer in scorers.items()
    }

## Import data

In [None]:
# Load the expert-labelled training posts.
tra_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/project/4192/Data/DirectCompare/train_df.csv")

# Keep only the text and its label, casting the label to int in the same expression.
# Assigning the cast back into a column subset of another frame triggers pandas'
# SettingWithCopyWarning; building the frame in one step avoids it.
tra_df = tra_df[['selftext','Expert-label']].astype({'Expert-label': int})
tra_df

Unnamed: 0,selftext,Expert-label
0,I posted this on Piazza but thought I might as...,1
1,"Hi i’ve applied for arts from Vancouver,BC as ...",0
2,i'm an international student and i've been tak...,1
3,i'm an international student and the midterm w...,1
4,they think i wouldnt be able to handle the str...,0
...,...,...
997,My boyfriend is Canadian and I’m American. Obv...,0
998,Do you need to be vaccinated to travel domesti...,0
999,"Hello, are there any International students he...",1
1000,Will you guys take a leave of absence? Or are ...,0


In [None]:
# Keep only the first 400 labelled rows as the base training set.
tra_df = tra_df.head(400)
tra_df

Unnamed: 0,selftext,Expert-label
0,I posted this on Piazza but thought I might as...,1
1,"Hi i’ve applied for arts from Vancouver,BC as ...",0
2,i'm an international student and i've been tak...,1
3,i'm an international student and the midterm w...,1
4,they think i wouldnt be able to handle the str...,0
...,...,...
395,"Hey guys, I think no one has a specific idea a...",1
396,Hey everyone\n\nI'm an international student c...,0
397,I'm an international student. Santa Ono posted...,1
398,I’m an international student and I was hoping ...,1


In [None]:
# Labelled rows from the previous iteration; they are concatenated onto the
# training set in a later cell. Only the text and label columns are kept.
it_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/project/4192/draft/Iteration/simualte_hrir/Iteration1/psybert/p_i1_val.csv"
)[['selftext', 'Expert-label']]
it_df

Unnamed: 0,selftext,Expert-label
0,Did any international student apply for it?,0.0
1,For international students in Canada would our...,1.0
2,Am i considered an international student?,1.0
3,I am just curious if covid has affected intern...,1.0
4,Can bcap provide international students with l...,1.0
...,...,...
195,Who can I contact to for Quarantine related qu...,1.0
196,I’m an international student from Korea and wa...,1.0
197,Would I pay the same tuition of an internation...,1.0
198,"Like for international students, can they go h...",1.0


In [None]:
# Stack the base training rows and the iteration rows, then cast the label
# column to int (the iteration file stores it as float).
train_df = pd.concat([tra_df, it_df]).astype({'Expert-label': int})
train_df

Unnamed: 0,selftext,Expert-label
0,I posted this on Piazza but thought I might as...,1
1,"Hi i’ve applied for arts from Vancouver,BC as ...",0
2,i'm an international student and i've been tak...,1
3,i'm an international student and the midterm w...,1
4,they think i wouldnt be able to handle the str...,0
...,...,...
195,Who can I contact to for Quarantine related qu...,1
196,I’m an international student from Korea and wa...,1
197,Would I pay the same tuition of an internation...,1
198,"Like for international students, can they go h...",1


----------------------------------------------

----------------------------------------------

In [None]:
# Validation posts: keep the text only and append a constant `label` column set
# to 1 for every row.
test_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/project/4192/Data/Validation988/988validation.csv"
)[['selftext']].assign(label=1)
test_df

Unnamed: 0,selftext,label
0,Well... I think I need some help... about depr...,1
1,I am an international student and i started en...,1
2,i'm an international student and the midterm w...,1
3,"Honestly just want to end it all , it’s so har...",1
4,"Hi all, \n\nI'm really upset to know the Winte...",1
...,...,...
983,WE NEED A SCIENCE AND DATA BASED APPROACH TO C...,1
984,Hi everyone! Hope that all who are applying ar...,1
985,Hello everyone !\n\nI request some advice from...,1
986,"Hi, I was wondering what my chances were at th...",1


---------------------------------------------------------------


----------------------------------------------

In [None]:
# Unlabelled pool to be scored by the fine-tuned model; only the text is needed.
ret_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/project/4192/draft/Iteration/simualte_hrir/Iteration1/psybert/p_i1_ret.csv"
)[['selftext']]
ret_df

Unnamed: 0,selftext
0,I'm a new 100 level international student and ...
1,Just an international student trying to figure...
2,Im curious if there are any international stud...
3,Hey guys! So i uploaded all the required docum...
4,Does anyone know why the university allowed te...
...,...
5601,Ok so I've finished my first year and just got...
5602,Hi!\n\nThe title pretty much sums up my confus...
5603,Hello all. I have been struggling between thes...
5604,"Hi everyone,\n\nHope you all and your familie..."


In [None]:
# Plain Python list of the pool texts, in frame order, for the tokenizer.
ret_data = ret_df['selftext'].tolist()

# Model

In [None]:
# Load the tokenizer and the checkpoint with a two-class sequence-classification head.
model_name = "nlp4good/psych-search"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/620 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlp4good/psych-search were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlp4good/psych-search a

In [None]:
# Tokenise the training texts (padded to the longest, truncated at 512 tokens)
# and pair them with their labels.
X_train = train_df["selftext"].tolist()
y_train = train_df["Expert-label"].tolist()
X_train_tokenized = tokenizer(
    X_train,
    padding=True,
    truncation=True,
    max_length=512,
)
train_dataset = Dataset(X_train_tokenized, labels=y_train)

In [None]:
# Tokenise the validation texts with the same settings as the training set and
# attach the `label` column.
X_val = test_df["selftext"].tolist()
y_val = test_df["label"].tolist()
X_val_tokenized = tokenizer(
    X_val,
    padding=True,
    truncation=True,
    max_length=512,
)
val_dataset = Dataset(X_val_tokenized, labels=y_val)

In [None]:
# Tokenise the unlabelled pool; no labels are passed, so items carry only the encodings.
X_ret = ret_data
X_ret_tokenized = tokenizer(
    X_ret,
    padding=True,
    truncation=True,
    max_length=512,
)
ret_dataset = Dataset(X_ret_tokenized)

In [None]:


# Fine-tuning configuration. The effective train batch per device is
# per_device_train_batch_size * gradient_accumulation_steps = 4 * 16 = 64.
args = TrainingArguments(
  output_dir="output",
  evaluation_strategy="epoch",
  # This run has only a few dozen optimizer steps, so step-based logging at the
  # library's default interval never fires and the Training Loss column reads
  # "No log". Logging once per epoch lines the training loss up with the
  # per-epoch evaluation rows.
  logging_strategy="epoch",
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  num_train_epochs=3,
  seed=0,
  overwrite_output_dir=True,
  learning_rate=3e-5,
  gradient_accumulation_steps=16,
)

# Evaluate on val_dataset after every epoch using compute_metrics.
trainer_sci = Trainer(
  model=model,
  args=args,
  train_dataset=train_dataset,
  eval_dataset=val_dataset,
  compute_metrics=compute_metrics,
)

trainer_sci.train()


***** Running training *****
  Num examples = 600
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 27


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,1.420084,0.039474,1.0,0.039474,0.075949
1,No log,1.3106,0.093117,1.0,0.093117,0.17037
2,No log,1.507162,0.07085,1.0,0.07085,0.132325


***** Running Evaluation *****
  Num examples = 988
  Batch size = 4
***** Running Evaluation *****
  Num examples = 988
  Batch size = 4
***** Running Evaluation *****
  Num examples = 988
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=27, training_loss=0.5148850900155527, metrics={'train_runtime': 160.2318, 'train_samples_per_second': 11.234, 'train_steps_per_second': 0.169, 'total_flos': 467285234319360.0, 'train_loss': 0.5148850900155527, 'epoch': 2.96})

# Predict

## Result

In [None]:
# Score the unlabelled pool. Only the raw per-class scores are needed, so read
# the `predictions` field instead of unpacking into `_`, which would rebind
# IPython's last-output variable for the rest of the session.
raw_pred = trainer_sci.predict(ret_dataset).predictions
raw_pred

***** Running Prediction *****
  Num examples = 5606
  Batch size = 4


array([[-0.5644072 ,  0.57302034],
       [-0.26231828,  0.5986286 ],
       [-1.0295659 ,  1.392652  ],
       ...,
       [ 1.1600449 , -1.4670719 ],
       [ 0.69821185, -1.2520481 ],
       [ 0.83431715, -1.2797601 ]], dtype=float32)

In [None]:
# Turn each row of raw scores into class probabilities (rows sum to 1).
probabilities = softmax(raw_pred, axis=-1)
probabilities

array([[0.24279298, 0.757207  ],
       [0.29714155, 0.70285845],
       [0.08149408, 0.9185059 ],
       ...,
       [0.9325865 , 0.06741349],
       [0.875475  , 0.12452502],
       [0.8922638 , 0.10773608]], dtype=float32)

In [None]:
# Keep only the second probability column (class index 1) as a Series named
# "predict_result"; the first column is discarded.
proba_ret_df = pd.Series(probabilities[:, 1], name="predict_result")
proba_ret_df

0       0.757207
1       0.702858
2       0.918506
3       0.943248
4       0.947408
          ...   
5601    0.087600
5602    0.082346
5603    0.067413
5604    0.124525
5605    0.107736
Name: predict_result, Length: 5606, dtype: float32

In [None]:
# Attach the class-1 probability to each pool post and rank from most to least
# confident; the 200 highest-scoring rows form this iteration's selection.
con_df = (
    pd.concat([ret_df, proba_ret_df], axis=1)
    .sort_values(by='predict_result', axis=0, ascending=False)
)
p_i1 = con_df.head(200)
p_i1

Unnamed: 0,selftext,predict_result
98,If you are allowed to get a test for a reason ...,0.962281
113,Does anyone know why international students ha...,0.961153
59,Any international student here changed their l...,0.960297
125,Hi im an international student in Canada (not ...,0.958196
41,Has anyone had an in-person appointment recent...,0.955988
...,...,...
238,"Would uvic really make labs, smaller classes, ...",0.885739
839,I am an international student attending a 1 ye...,0.885203
318,Does anyone know the cost or an estimate of tu...,0.884523
64,Does anyone know any good off campus psycholog...,0.884426


In [None]:
# Write the selected rows to a CSV in the working directory (with the frame index
# as the first column) and pass it to google.colab's files.download.
# Nothing in this cell writes to the mounted Drive path.
# NOTE(review): the frame is named p_i1 but the file is p_i2.csv -- presumably
# this iteration's picks are the input to iteration 2; confirm the naming.

p_i1.to_csv('p_i2.csv')
files.download('p_i2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Remove the rows already selected into p_i1 so only the unselected posts remain.
# Dropping by p_i1's index labels, and ignoring labels that are already gone,
# makes this cell safe to re-run: the earlier in-place drop of head(200) removed
# a further 200 rows on every execution.
con_df = con_df.drop(index=p_i1.index, errors="ignore")
con_df

Unnamed: 0,selftext,predict_result
260,if ever my student visa won’t make it by fall ...,0.883810
755,As an international student what bank will be ...,0.883758
211,Hey! I'm an international student from India h...,0.883726
85,I'm a first year international student and I r...,0.883527
70,I am planning to go to CST January Intake 2022...,0.883260
...,...,...
5600,Hi everybody thanks for helping!\n\nIm in grad...,0.077212
5593,Hello everyone. I understand that this topic h...,0.074915
5598,"Hello, I'm a high school senior considering Wa...",0.070944
5568,"Hey everyone, I hope you’re doing well and sta...",0.070644


In [None]:
# Write the remaining (unselected) rows to a CSV in the working directory and pass
# it to google.colab's files.download. Nothing here writes to the mounted Drive path.
con_df.to_csv('p_i2_ret.csv')
files.download('p_i2_ret.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 988 recall

In [None]:
# Re-run evaluation on the trainer's eval dataset (val_dataset) and report recall only.
# test_df assigns label 1 to every row, so this value is the fraction of those
# rows that the model predicts as class 1.
res = trainer_sci.evaluate()
print(res['eval_recall'])

***** Running Evaluation *****
  Num examples = 988
  Batch size = 4


0.0708502024291498
