In [1]:
import pandas as pd
df_train = pd.read_csv('/content/drive/MyDrive/SMM4H 2024/SMM4H-2024-Task5-Training.tsv', sep='\t')
df_dev = pd.read_csv('/content/drive/MyDrive/SMM4H 2024/SMM4H-2024-Task5-Validation.tsv', sep='\t')
df_test = pd.read_csv('/content/drive/MyDrive/SMM4H 2024/SMM4H-2024-Task5-Test-Unlabeled.tsv', sep='\t')

In [2]:
!pip install transformers seqeval torch tqdm accelerate sentencepiece datasets huggingface

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl (2.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux

In [3]:
for df in [df_train, df_dev, df_test]:
    df['text'] = df['text'].str.replace(r'http\S+|www.\S+', '', case=False)
    df['text'] = df['text'].str.replace(r'@\S+|#\S+', '')
    df['text'] = df['text'].str.replace(r'\s+', ' ', case=False)

In [4]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
class TextDataset(Dataset):
    def __init__(self, tokenizer, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels if labels is not None else [0] * len(texts)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.encodings['input_ids'])
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
model = AutoModelForSequenceClassification.from_pretrained('1024m/SMM4H-Task5-BartL-1A')                              # HERE
model = model.to('cuda')
test_texts = df_test['text'].tolist()
test_dataset = TextDataset(tokenizer, test_texts)
test_loader = DataLoader(test_dataset, batch_size=36, shuffle=False)
predictions = []
for batch in tqdm(test_loader, desc="Evaluating"):
    inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    pred_labels = logits.argmax(1)
    predictions.extend(zip(pred_labels.cpu().numpy(), probabilities[:,0].cpu().numpy(), probabilities[:,1].cpu().numpy()))
df_predictions_8_test = pd.DataFrame(predictions, columns=['predicted_label', 'prediction_0_weight', 'prediction_1_weight'])          # HERE
df_merged_8_test = pd.concat([df_test, df_predictions_8_test], axis=1)                                                                      # HERE  # HERE
df_merged_8_test.to_csv('bartL_task5_submissions_01_epoch8.csv', index=False)                                                          # HERE  # HERE

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Evaluating:   0%|          | 0/278 [00:00<?, ?it/s]

In [5]:
df_merged_8_test

Unnamed: 0,tweet_id,text,predicted_label,prediction_0_weight,prediction_1_weight
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...,1,0.000018,0.999980
1,1319141585666400257,"""Is He / She Distracted? Considerations When D...",0,0.999592,0.000405
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...,0,0.999986,0.000013
3,841289449124294656,It ain't easy but I will always be there for m...,1,0.000031,0.999967
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...,1,0.000033,0.999966
...,...,...,...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...,1,0.000022,0.999976
9996,1473120270118445064,"Husband works from home, my son has asthma and...",1,0.000016,0.999982
9997,917699673929986048,Should we give different media guidance for yo...,0,0.999644,0.000352
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar...",1,0.112225,0.887756


In [7]:
df_merged_8_test['predicted_label'].value_counts()

predicted_label
0    5875
1    4125
Name: count, dtype: int64

In [9]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
class TextDataset(Dataset):
    def __init__(self, tokenizer, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)
        self.labels = labels if labels is not None else [0] * len(texts)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.encodings['input_ids'])
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large')
model = AutoModelForSequenceClassification.from_pretrained('1024m/SMM4H-Task5-BartL-1B')                              # HERE
model = model.to('cuda')
test_texts = df_test['text'].tolist()
test_dataset = TextDataset(tokenizer, test_texts)
test_loader = DataLoader(test_dataset, batch_size=36, shuffle=False)
predictions = []
for batch in tqdm(test_loader, desc="Evaluating"):
    inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    probabilities = torch.nn.functional.softmax(logits, dim=1)
    pred_labels = logits.argmax(1)
    predictions.extend(zip(pred_labels.cpu().numpy(), probabilities[:,0].cpu().numpy(), probabilities[:,1].cpu().numpy()))
df_predictions_18_test = pd.DataFrame(predictions, columns=['predicted_label', 'prediction_0_weight', 'prediction_1_weight'])          # HERE
df_merged_18_test = pd.concat([df_test, df_predictions_18_test], axis=1)                                                                      # HERE  # HERE
df_merged_18_test.to_csv('bartL_task5_submissions_01_epoch18.csv', index=False)                                                          # HERE  # HERE

config.json:   0%|          | 0.00/1.76k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Evaluating:   0%|          | 0/278 [00:00<?, ?it/s]

In [10]:
df_merged_18_test

Unnamed: 0,tweet_id,text,predicted_label,prediction_0_weight,prediction_1_weight
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...,1,0.000002,9.999970e-01
1,1319141585666400257,"""Is He / She Distracted? Considerations When D...",0,0.999999,6.150412e-07
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...,0,0.999999,1.864913e-07
3,841289449124294656,It ain't easy but I will always be there for m...,1,0.000005,9.999945e-01
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...,1,0.000003,9.999963e-01
...,...,...,...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...,1,0.000004,9.999956e-01
9996,1473120270118445064,"Husband works from home, my son has asthma and...",1,0.000003,9.999963e-01
9997,917699673929986048,Should we give different media guidance for yo...,0,0.999998,1.186134e-06
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar...",1,0.000272,9.997270e-01


In [11]:
df_merged_18_test['predicted_label'].value_counts()

predicted_label
0    5859
1    4141
Name: count, dtype: int64

In [12]:
df_submission_1A = df_merged_8_test.copy()
df_submission_1B = df_merged_18_test.copy()

In [13]:
df_submission_1A

Unnamed: 0,tweet_id,text,predicted_label,prediction_0_weight,prediction_1_weight
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...,1,0.000018,0.999980
1,1319141585666400257,"""Is He / She Distracted? Considerations When D...",0,0.999592,0.000405
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...,0,0.999986,0.000013
3,841289449124294656,It ain't easy but I will always be there for m...,1,0.000031,0.999967
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...,1,0.000033,0.999966
...,...,...,...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...,1,0.000022,0.999976
9996,1473120270118445064,"Husband works from home, my son has asthma and...",1,0.000016,0.999982
9997,917699673929986048,Should we give different media guidance for yo...,0,0.999644,0.000352
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar...",1,0.112225,0.887756


In [14]:
df_submission_1B

Unnamed: 0,tweet_id,text,predicted_label,prediction_0_weight,prediction_1_weight
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...,1,0.000002,9.999970e-01
1,1319141585666400257,"""Is He / She Distracted? Considerations When D...",0,0.999999,6.150412e-07
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...,0,0.999999,1.864913e-07
3,841289449124294656,It ain't easy but I will always be there for m...,1,0.000005,9.999945e-01
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...,1,0.000003,9.999963e-01
...,...,...,...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...,1,0.000004,9.999956e-01
9996,1473120270118445064,"Husband works from home, my son has asthma and...",1,0.000003,9.999963e-01
9997,917699673929986048,Should we give different media guidance for yo...,0,0.999998,1.186134e-06
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar...",1,0.000272,9.997270e-01


In [15]:
df_submission_1A.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tweet_id             10000 non-null  int64  
 1   text                 10000 non-null  object 
 2   predicted_label      10000 non-null  int64  
 3   prediction_0_weight  10000 non-null  float32
 4   prediction_1_weight  10000 non-null  float32
dtypes: float32(2), int64(2), object(1)
memory usage: 312.6+ KB


In [16]:
df_submission_1B.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   tweet_id             10000 non-null  int64  
 1   text                 10000 non-null  object 
 2   predicted_label      10000 non-null  int64  
 3   prediction_0_weight  10000 non-null  float32
 4   prediction_1_weight  10000 non-null  float32
dtypes: float32(2), int64(2), object(1)
memory usage: 312.6+ KB


In [17]:
df_submission_1A = df_submission_1A.drop(columns=['prediction_0_weight', 'prediction_1_weight'])
df_submission_1A = df_submission_1A.rename(columns={'predicted_label': 'label'})
df_submission_1A

Unnamed: 0,tweet_id,text,label
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...,1
1,1319141585666400257,"""Is He / She Distracted? Considerations When D...",0
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...,0
3,841289449124294656,It ain't easy but I will always be there for m...,1
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...,1
...,...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...,1
9996,1473120270118445064,"Husband works from home, my son has asthma and...",1
9997,917699673929986048,Should we give different media guidance for yo...,0
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar...",1


In [19]:
df_submission_1B = df_submission_1B.drop(columns=['prediction_0_weight', 'prediction_1_weight'])
df_submission_1B = df_submission_1B.rename(columns={'predicted_label': 'label'})
df_submission_1B

Unnamed: 0,tweet_id,text,label
0,1266009978743160832,@TeaSpillYT My 4 year old daughter has autism ...,1
1,1319141585666400257,"""Is He / She Distracted? Considerations When D...",0
2,1321605464644296705,@al_c0h0lic Apparently it wasn’t about politic...,0
3,841289449124294656,It ain't easy but I will always be there for m...,1
4,1174713585135734784,Way to go @Kodileerocks !!! Big congratulation...,1
...,...,...,...
9995,1350119143333355521,@bradainsworth #IWould love a takeaway this ev...,1
9996,1473120270118445064,"Husband works from home, my son has asthma and...",1
9997,917699673929986048,Should we give different media guidance for yo...,0
9998,1289241342921474049,"@petti_crocker @KillerMartinis Honestly, we ar...",1


In [20]:
df_submission_1A = df_submission_1A.drop(columns=['text'])
df_submission_1B = df_submission_1B.drop(columns=['text'])

In [21]:
df_submission_1A

Unnamed: 0,tweet_id,label
0,1266009978743160832,1
1,1319141585666400257,0
2,1321605464644296705,0
3,841289449124294656,1
4,1174713585135734784,1
...,...,...
9995,1350119143333355521,1
9996,1473120270118445064,1
9997,917699673929986048,0
9998,1289241342921474049,1


In [22]:
df_submission_1B

Unnamed: 0,tweet_id,label
0,1266009978743160832,1
1,1319141585666400257,0
2,1321605464644296705,0
3,841289449124294656,1
4,1174713585135734784,1
...,...,...
9995,1350119143333355521,1
9996,1473120270118445064,1
9997,917699673929986048,0
9998,1289241342921474049,1


In [24]:
df_submission_1A.to_csv('prediction_task5.tsv', sep='\t', index=False, header=True)
!zip predictions_1A.zip prediction_task5.tsv

  adding: prediction_task5.tsv (deflated 55%)


In [25]:
df_submission_1B.to_csv('prediction_task5.tsv', sep='\t', index=False, header=True)
!zip predictions_1B.zip prediction_task5.tsv

  adding: prediction_task5.tsv (deflated 55%)
