In [3]:
!pip install sentence_transformers pinecone-client datasets seaborn matplotlib

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pinecone-client
  Downloading pinecone_client-2.2.4-py3-none-any.whl (179 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.4/179.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m90.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading s

In [4]:
!pip install --upgrade torch torchvision



In [1]:
import os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID";
# os.environ["CUDA_VISIBLE_DEVICES"]="0";
os.environ["CUDA_LAUNCH_BLOCKING"]="1";


In [2]:
import torch
from torch.utils.data import Dataset
from datasets import load_dataset
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,TrainingArguments
)

In [3]:


class IMDbDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [4]:
# load the dataset and convert to pandas dataframe
df_tain = load_dataset("fhamborg/news_sentiment_newsmtsc",split="train").to_pandas()

In [5]:
df_validation = load_dataset( "fhamborg/news_sentiment_newsmtsc",split="validation").to_pandas()

In [6]:
df_test = load_dataset("fhamborg/news_sentiment_newsmtsc",split="test").to_pandas()

In [37]:
df_tain.polarity.value_counts()

-1    3316
 0    3028
 1    2395
Name: polarity, dtype: int64

In [7]:
df_test["sentence"][0]

'Though we do not know what other items Seth may have had in his possession, his watch, phone, wallet and necklace were not stolen.'

In [8]:
df_test

Unnamed: 0,mention,polarity,from,to,sentence,id
0,Seth,0,39,43,Though we do not know what other items Seth ma...,polusa_v1_4307505_-1_11_Seth_39_43
1,He,1,0,2,He won the Academy Award for Best Supporting A...,allsides_4276_1784_7_Beloved actor Robin Willi...
2,""" Powell",1,44,52,"""Yup, the whole birther movement was racist,"" ...",allsides_1810_737_6_Powell_44_52
3,Ross,0,0,4,Ross's appointment will require a Senate confi...,allsides_1593_643_5_Wilbur Ross_0_4
4,Andrews,0,27,34,Trustees Simone Boutet and Andrews voted no ea...,polusa_v1_59535720_-1_32_Trustee Deno Andrews_...
...,...,...,...,...,...,...
798,Poe,0,91,94,To have her now say she doesn't support our ri...,allsides_4841_2036_10_Poe_91_94
799,Obama,1,98,103,But some observers suggested it was tone-deaf ...,allsides_2495_1015_21_Obama_98_103
800,Cox,0,95,98,“It is my opinion that you are a key conspirat...,polusa_v1_39150634_-1_5_Sean Cox_95_98
801,Schmidt,-1,104,111,“It is my opinion that you are a key conspirat...,polusa_v1_39150634_-1_5_Schmidt_104_111


In [9]:
clean_1_test = df_test.groupby('polarity', group_keys=False).apply(lambda x: x.sample(min(len(x), 50)))
clean_1_test = clean_1_test.sample(frac=1, random_state=42)
clean_1_test.dropna(subset=['sentence'], inplace=True)
clean_1_test

Unnamed: 0,mention,polarity,from,to,sentence,id
563,Mr Trump,0,0,8,Mr Trump has suggested North Korea's shift was...,polusa_v1_17895268_-1_44_Donald Trump_1_9
456,Sessions,-1,16,24,"But for Bannon, Sessions and Miller, immigrati...",polusa_v1_4325417_-1_13_Sessions_16_24
610,Biden,1,113,118,"(Yuri Gripas/Pool Photo via AP) A crowd of 1,0...","allsides_3158_1295_20_"" Hunter Biden_113_118"
110,Debs,0,59,63,"The Socialist party, founded in 1901, came tog...",polusa_v1_39069327_-1_47_Eugene Debs_60_64
619,Feinstein,0,0,9,Feinstein has not said if she will seek a fift...,polusa_v1_79079669_-1_13_Dianne Feinstein in t...
...,...,...,...,...,...,...
394,Bolton,0,0,6,"Bolton continued, “Their ouster won’t bring sw...",polusa_v1_4297399_-1_12_Bolton_1_7
429,King,1,7,11,"For as King also said, “[t]here comes a time w...",polusa_v1_4540263_-1_8_Dr. King_7_11
34,Clinton,-1,126,133,"At the same time, the debate veered heavily in...",allsides_2308_939_17_Hillary Clinton_126_133
655,Assange,0,1,8,#Assange: Regarding upcoming publications...,allsides_1757_715_8_Assange_1_8


In [10]:
x_train=df_tain["sentence"]
y_train=df_tain["polarity"]

In [11]:
x_valid=df_validation["sentence"]
y_valid=df_validation["polarity"]

In [12]:
x_test=clean_1_test["sentence"]
y_test=clean_1_test["polarity"]

In [13]:
model_id = "cardiffnlp/twitter-roberta-base-sentiment"
# model_id ="lxyuan/distilbert-base-multilingual-cased-sentiments-student"
# model_id ="distilbert-base-uncased"
# model_id ="distilbert-base-uncased-finetuned-sst-2-english"

# model_id ="distilbert-base-uncased-distilled-squad"

# model_id ="Davlan/distilbert-base-multilingual-cased-ner-hrl"
# model_id ="xlnet-base-cased"
# model_id ="t5-small"

# load the model from huggingface
model = AutoModelForSequenceClassification.from_pretrained(model_id,num_labels=3
                                                          #  , ignore_mismatched_sizes=True
                                                           )
tokenizer = AutoTokenizer.from_pretrained(model_id)

Downloading (…)lve/main/config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [14]:
train_encoding = tokenizer(x_train.tolist(), padding='max_length', truncation=True, max_length=512,return_tensors="pt")
valid_encoding = tokenizer(x_valid.tolist(), padding='max_length', truncation=True, max_length=512,return_tensors="pt")
test_encoding = tokenizer(x_test.tolist(), padding='max_length', truncation=True, max_length=512,return_tensors="pt")


In [15]:
# train_encoding=tokenizer(x_train.tolist(),padding=True,truncation=True)
# valid_encoding=tokenizer(x_valid.tolist(),padding=True,truncation=True)
# test_encoding=tokenizer(x_test.tolist(),padding=True,truncation=True)


In [16]:
y_train = y_train.astype(int)
y_valid = y_valid.astype(int)
y_test = y_test.astype(int)


In [17]:
y_train

0       0
1      -1
2      -1
3      -1
4      -1
       ..
8734   -1
8735    0
8736    0
8737    1
8738   -1
Name: polarity, Length: 8739, dtype: int64

In [18]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# Fit the LabelEncoder on the target labels and transform y_train
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)
y_test_encoded = label_encoder.transform(y_test)



In [19]:
y_train.unique()

array([ 0, -1,  1])

In [20]:
{"1":0,"0":-1,"2":1}

{'1': 0, '0': -1, '2': 1}

In [21]:
y_train_encoded

array([1, 0, 0, ..., 1, 2, 0])

In [22]:
train_dataset=IMDbDataset(train_encoding,y_train_encoded)
valid_dataset=IMDbDataset(valid_encoding,y_valid_encoded)
test_dataset=IMDbDataset(test_encoding,y_test_encoded)


In [23]:
train_dataset

<__main__.IMDbDataset at 0x7a586c7766e0>

In [24]:
training_args = TrainingArguments(
    output_dir='./results',  # Fix the space before the directory path
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_steps=10,
)


In [25]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
    # compute_metrics=compute_metrics,
)

trainer.train()

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss
10,0.9233
20,0.7739
30,0.7857
40,0.7475
50,0.8024
60,0.8098
70,0.7556
80,0.7781
90,0.8487
100,0.6264


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


TrainOutput(global_step=547, training_loss=0.718797703765646, metrics={'train_runtime': 798.8744, 'train_samples_per_second': 10.939, 'train_steps_per_second': 0.685, 'total_flos': 2299348157524992.0, 'train_loss': 0.718797703765646, 'epoch': 1.0})

In [26]:
# output_dir="saved"
# tokenizer.save_pretrained(output_dir)

# model.save_pretrained(output_dir)

In [29]:
trainer.save_model()


trainer.save_state()


checkpoint_name = "test-trainer-deepspeed"
final_model_path = f"./llm04_fine_tuning/{checkpoint_name}"



trainer.save_model(output_dir=final_model_path)

In [30]:
# fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(output_dir)
# tokenizer_new = AutoTokenizer.from_pretrained(output_dir)


In [31]:

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(final_model_path)
# tokenizer_new = AutoTokenizer.from_pretrained(final_model_path)

In [32]:
inputs = tokenizer(x_test.tolist(),return_tensors="pt",padding='max_length', truncation=True, max_length=512)

In [33]:
import torch.nn.functional as F
with torch.no_grad():
    outputs =fine_tuned_model(**inputs)
    print (outputs)
    prediction=F.softmax(outputs.logits, dim=1 )
    print (prediction)

    labels= torch.argmax(prediction, dim=1 )
    print(labels)
    classification= [fine_tuned_model.config.id2label[label_id] for label_id in labels.tolist()]

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.8857,  1.4185, -2.7148],
        [ 0.4719, -0.8339,  0.3291],
        [-1.7108,  0.7952,  0.8962],
        [-2.3353,  1.2322,  0.9683],
        [-1.7786,  2.5907, -1.1760],
        [ 1.1425,  1.0456, -2.4671],
        [-1.6219,  2.7088, -1.4053],
        [-1.1726,  0.9139, -0.0541],
        [ 1.5132,  0.6390, -2.3802],
        [-2.5929,  2.8124, -0.3205],
        [-1.4809,  0.2752,  1.1095],
        [ 2.0373,  0.1815, -2.5885],
        [ 1.6665,  0.5276, -2.6439],
        [ 1.8414,  0.0528, -2.2523],
        [ 1.8052,  0.1838, -2.5019],
        [ 0.9880,  0.5152, -1.7275],
        [-1.0652, -0.3343,  1.5021],
        [ 0.1315,  1.5741, -2.0537],
        [-1.7366,  2.8002, -1.1667],
        [-1.9850,  1.7499, -0.0787],
        [ 0.6008, -0.6656, -0.1233],
        [-1.4968,  0.3373,  1.1166],
        [ 1.9058,  0.0984, -2.5274],
        [-0.9918, -0.4274,  1.5658],
        [-1.7185,  0.6312,  1.0316],
        [-2.6240,  1.4515,  1.07

In [34]:
label_mapping = {"LABEL_1": 0, "LABEL_0": -1, "LABEL_2": 1}

classification_nnn= [label_mapping[label_id] for label_id in classification]

In [35]:
classification_nnn

[0,
 -1,
 1,
 0,
 0,
 -1,
 0,
 0,
 -1,
 0,
 1,
 -1,
 -1,
 -1,
 -1,
 -1,
 1,
 0,
 0,
 0,
 -1,
 1,
 -1,
 1,
 1,
 0,
 1,
 1,
 -1,
 -1,
 -1,
 1,
 0,
 -1,
 -1,
 1,
 0,
 -1,
 -1,
 1,
 -1,
 0,
 0,
 0,
 -1,
 -1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 -1,
 1,
 0,
 -1,
 0,
 -1,
 0,
 -1,
 -1,
 -1,
 -1,
 0,
 -1,
 0,
 -1,
 -1,
 0,
 0,
 -1,
 1,
 1,
 0,
 0,
 1,
 0,
 -1,
 0,
 1,
 -1,
 0,
 0,
 -1,
 -1,
 1,
 -1,
 -1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 -1,
 -1,
 -1,
 1,
 -1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 -1,
 0,
 1,
 0,
 0,
 -1,
 -1,
 1,
 0,
 0,
 0,
 0,
 0,
 -1,
 0,
 0,
 -1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 -1,
 1,
 -1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 -1,
 -1,
 -1,
 1,
 -1,
 0,
 1]

In [36]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test, classification_nnn))

              precision    recall  f1-score   support

          -1       0.79      0.82      0.80        50
           0       0.72      0.88      0.79        50
           1       0.89      0.66      0.76        50

    accuracy                           0.79       150
   macro avg       0.80      0.79      0.79       150
weighted avg       0.80      0.79      0.79       150

