# Imports

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 8.0 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 40.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 63.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Fo

In [None]:
import pandas as pd
import numpy as np
import logging
from matplotlib import pyplot as plt

# Root logger reports INFO and above; the "transformers" logger is restricted
# to WARNING and above.
logging.basicConfig(level="INFO")
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel("WARNING")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [None]:
from scipy.special import softmax
from google.colab import files

# Data preparation

## Torch dataset

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with optional labels.

    Args:
        encodings: Mapping of feature name -> per-example sequence. It must
            contain an "input_ids" entry, which defines the dataset length.
        labels: Optional per-example labels, indexable by position. When it
            is ``None`` or empty, items carry no "labels" entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus "labels" if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None/length check instead of `if self.labels:` -- truth-testing
        # a multi-element NumPy array or tensor raises, while an empty list must
        # still mean "no labels".
        if self.labels is not None and len(self.labels) > 0:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the "input_ids" encoding."""
        return len(self.encodings["input_ids"])



## Metrics

In [None]:
def compute_metrics(p):
    """Score a (predictions, labels) pair for the Trainer.

    The first element of ``p`` is reduced to class ids with an argmax over
    axis 1; the second element is used as the ground truth. Each metric is
    computed with scikit-learn's default settings.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold = p
    predicted = np.argmax(scores, axis=1)

    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric: scorer(y_true=gold, y_pred=predicted)
        for metric, scorer in scorers.items()
    }

## Import data

In [None]:
# Load the labelled training posts, keeping only the text and the label.
# The integer cast is done in the same chain: assigning a column back into a
# column-subset of the loaded frame triggers pandas' SettingWithCopyWarning.
tra_df = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/project/4192/Data/DirectCompare/train_df.csv")
    .loc[:, ['selftext', 'Expert-label']]
    .astype({'Expert-label': int})
)
tra_df

Unnamed: 0,selftext,Expert-label
0,I posted this on Piazza but thought I might as...,1
1,"Hi i’ve applied for arts from Vancouver,BC as ...",0
2,i'm an international student and i've been tak...,1
3,i'm an international student and the midterm w...,1
4,they think i wouldnt be able to handle the str...,0
...,...,...
997,My boyfriend is Canadian and I’m American. Obv...,0
998,Do you need to be vaccinated to travel domesti...,0
999,"Hello, are there any International students he...",1
1000,Will you guys take a leave of absence? Or are ...,0


In [None]:
# Train on the first 400 labelled posts only.
tra_df = tra_df.head(400)
tra_df

Unnamed: 0,selftext,Expert-label
0,I posted this on Piazza but thought I might as...,1
1,"Hi i’ve applied for arts from Vancouver,BC as ...",0
2,i'm an international student and i've been tak...,1
3,i'm an international student and the midterm w...,1
4,they think i wouldnt be able to handle the str...,0
...,...,...
395,"Hey guys, I think no one has a specific idea a...",1
396,Hey everyone\n\nI'm an international student c...,0
397,I'm an international student. Santa Ono posted...,1
398,I’m an international student and I was hoping ...,1


----------------------------------------------

----------------------------------------------

In [None]:
# Load the validation posts; every row is given the constant label 1, so
# recall on this set is the share of posts the model predicts as class 1.
test_df = (
    pd.read_csv("/content/drive/MyDrive/Colab Notebooks/project/4192/Data/Validation988/988validation.csv")
    .loc[:, ['selftext']]
    .assign(label=1)
)
test_df

Unnamed: 0,selftext,label
0,Well... I think I need some help... about depr...,1
1,I am an international student and i started en...,1
2,i'm an international student and the midterm w...,1
3,"Honestly just want to end it all , it’s so har...",1
4,"Hi all, \n\nI'm really upset to know the Winte...",1
...,...,...
983,WE NEED A SCIENCE AND DATA BASED APPROACH TO C...,1
984,Hi everyone! Hope that all who are applying ar...,1
985,Hello everyone !\n\nI request some advice from...,1
986,"Hi, I was wondering what my chances were at th...",1


---------------------------------------------------------------


----------------------------------------------

In [None]:
# Unlabelled posts to be scored by the fine-tuned model; only the text column is kept.
ret_df = pd.read_csv(
    "/content/drive/MyDrive/Colab Notebooks/project/social-science-nserc-data-20211110T102352Z-001/social-science-nserc-data/Individual-classifier-results-Iteration-1/International-Students/posts-with-International-student-keyword.csv"
).loc[:, ['selftext']]
ret_df

Unnamed: 0,selftext
0,Hey there guys-\n\nNew international student f...
1,Have any American international students been ...
2,Because of a pre-existing condition I’m eligib...
3,"Hey, I’m an international student and I was wo..."
4,I heard from a business owner that most of the...
...,...
5801,Hey guys!\n\n&amp;#x200B;\n\nJust wanted to le...
5802,I am currently international student studying ...
5803,"Hi guys my name is Emre,\n\n&amp;#x200B;\n\nI ..."
5804,Hi everyone. \n\n\nI'm an international stude...


In [None]:
# Plain Python list of post texts, the form passed to the tokenizer later on.
ret_data = ret_df['selftext'].tolist()

# Model

In [None]:
# Pretrained checkpoint used for both the tokenizer and the encoder weights.
model_name = "nlp4good/psych-search"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Two output classes; the load warning reports that some classifier weights are
# not initialised from the checkpoint, so they are learned during fine-tuning.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

Downloading:   0%|          | 0.00/223k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/323 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/620 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlp4good/psych-search were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlp4good/psych-search a

In [None]:
# Training split: texts and expert labels as plain lists, tokenised with
# padding and truncation at 512 tokens, then wrapped for the Trainer.
X_train = tra_df["selftext"].tolist()
y_train = tra_df["Expert-label"].tolist()
X_train_tokenized = tokenizer(
    X_train,
    max_length=512,
    truncation=True,
    padding=True,
)
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)

In [None]:
# Validation split: same tokenisation settings as the training split.
X_val = test_df["selftext"].tolist()
y_val = test_df["label"].tolist()
X_val_tokenized = tokenizer(
    X_val,
    max_length=512,
    truncation=True,
    padding=True,
)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

In [None]:
# Posts to score: tokenised like the other splits, but wrapped without labels.
X_ret = ret_data
X_ret_tokenized = tokenizer(
    X_ret,
    max_length=512,
    truncation=True,
    padding=True,
)
ret_dataset = Dataset(encodings=X_ret_tokenized)

In [None]:


# Fine-tuning configuration: evaluate after every epoch, 3 epochs, fixed seed.
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="epoch",
    # The default step-based logging interval (500 steps) is larger than the
    # 18 optimizer steps of this run, so the training loss was reported as
    # "No log". Log once per epoch so it appears next to the eval metrics.
    logging_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    seed=0,
    overwrite_output_dir=True,
    learning_rate=3e-5,
    # 4 examples per step x 16 accumulated steps = 64 examples per optimizer update.
    gradient_accumulation_steps=16,
)

# Trainer evaluates on val_dataset with compute_metrics at the end of each epoch.
trainer_sci = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer_sci.train()


***** Running training *****
  Num examples = 400
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 18


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
0,No log,1.217726,0.093117,1.0,0.093117,0.17037
1,No log,1.554367,0.048583,1.0,0.048583,0.092664
2,No log,1.473078,0.074899,1.0,0.074899,0.13936


***** Running Evaluation *****
  Num examples = 988
  Batch size = 4
***** Running Evaluation *****
  Num examples = 988
  Batch size = 4
***** Running Evaluation *****
  Num examples = 988
  Batch size = 4


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=18, training_loss=0.5801860491434733, metrics={'train_runtime': 126.1016, 'train_samples_per_second': 9.516, 'train_steps_per_second': 0.143, 'total_flos': 311523489546240.0, 'train_loss': 0.5801860491434733, 'epoch': 2.96})

# Predict

## Result

In [None]:
# Score every post in ret_dataset. Only the raw model outputs are needed, so
# read the `predictions` field by name instead of unpacking the other fields
# into `_`, which would overwrite IPython's last-output variable.
raw_pred = trainer_sci.predict(ret_dataset).predictions
raw_pred

***** Running Prediction *****
  Num examples = 5806
  Batch size = 4


array([[ 0.01685832,  0.27289894],
       [-0.44227722,  0.920131  ],
       [-0.05304551,  0.58569485],
       ...,
       [ 0.5106395 , -0.38239172],
       [ 0.82295066, -0.41425088],
       [-0.31863758,  0.22177608]], dtype=float32)

In [None]:
# Row-wise softmax over the last axis turns the two raw scores per post into
# class probabilities that sum to 1.
probabilities = softmax(raw_pred, axis=-1)
probabilities

array([[0.43633723, 0.56366265],
       [0.2038492 , 0.7961508 ],
       [0.34553134, 0.6544687 ],
       ...,
       [0.7095152 , 0.2904847 ],
       [0.77507645, 0.22492346],
       [0.36809137, 0.6319087 ]], dtype=float32)

In [None]:
# Keep only the second probability column (named "predict_result"); the first
# column ("remove") is discarded. The result is a Series, despite the name.
proba_ret_df = pd.DataFrame(
    probabilities,
    columns=["remove", "predict_result"],
)["predict_result"]
proba_ret_df

0       0.563663
1       0.796151
2       0.654469
3       0.728491
4       0.575214
          ...   
5801    0.567800
5802    0.444794
5803    0.290485
5804    0.224923
5805    0.631909
Name: predict_result, Length: 5806, dtype: float32

In [None]:
# Attach each post's score to its text (aligned on the row index) and rank
# from highest to lowest score.
con_df = (
    pd.concat([ret_df, proba_ret_df], axis=1)
    .sort_values(by='predict_result', ascending=False)
)

# The 200 highest-scoring posts.
p_i1 = con_df.head(200)
p_i1

Unnamed: 0,selftext,predict_result
1100,Did any international student apply for it?,0.932713
69,For international students in Canada would our...,0.919785
934,Am i considered an international student?,0.914675
89,I am just curious if covid has affected intern...,0.910859
4523,Can bcap provide international students with l...,0.908055
...,...,...
557,Who can I contact to for Quarantine related qu...,0.830052
256,I’m an international student from Korea and wa...,0.829715
1123,Would I pay the same tuition of an internation...,0.829005
70,"Like for international students, can they go h...",0.828859


In [None]:
# Write the top-200 posts to a CSV at a relative path (the current working
# directory) and hand that file to Colab's download helper.
p_i1.to_csv(path_or_buf='p_i1.csv')
files.download('p_i1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Remove the already-exported top-200 rows by their index labels. Dropping
# `head(200)` in place removed a further 200 rows each time the cell was
# re-run; dropping p_i1's labels with errors="ignore" gives the same result on
# the first run and is a no-op on later runs.
con_df = con_df.drop(index=p_i1.index, errors="ignore")
con_df

Unnamed: 0,selftext,predict_result
3088,I'm a new 100 level international student and ...,0.827939
3092,Just an international student trying to figure...,0.827929
358,Im curious if there are any international stud...,0.827314
897,Hey guys! So i uploaded all the required docum...,0.826447
486,Does anyone know why the university allowed te...,0.826431
...,...,...
5494,Ok so I've finished my first year and just got...,0.082979
5178,Hi!\n\nThe title pretty much sums up my confus...,0.078646
3976,Hello all. I have been struggling between thes...,0.078443
1493,"Hi everyone,\n\nHope you all and your familie...",0.077101


In [None]:
# Write the remaining (non-top-200) posts to a CSV at a relative path and hand
# that file to Colab's download helper.
con_df.to_csv(path_or_buf='p_i1_ret.csv')
files.download('p_i1_ret.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Recall on the 988 validation set

In [None]:
# Evaluate once more on the Trainer's evaluation dataset and report recall only.
res = trainer_sci.evaluate()
eval_recall = res["eval_recall"]
print(eval_recall)

***** Running Evaluation *****
  Num examples = 988
  Batch size = 4


0.07489878542510121
