Importing Requirements

In [1]:
import os
import ls
import sys
import json

import numpy
import torch
import json
import pickle
import sklearn
import collections
import pandas as pd
from sklearn.metrics import accuracy_score , f1_score

import torch
from ls import utils
from ls.testing import get_predictions
from ls.models.build import ModelFactory

  from .autonotebook import tqdm as notebook_tqdm


HELPER FUNCTIONS

In [2]:
def save_json(data, path):

    with open(path, "w", encoding = "UTF-8") as f:

        json.dump(data, f, indent = 4)

In [3]:
def save_pickle(data, path):

    with open(path, "wb") as f: 
        
        pickle.dump(data, f)

In [7]:
def load_pickle(path):

    with open(path, "rb") as f:

        return pickle.load(f)

TRAINING

Step 1: Download the dataset

In [2]:
data = ls.datasets.Vast2()

Step 2: Train the model (Skip if inference)

In [6]:
best_split = ls.learning_to_split(data, model = {"name" : "bert"}, metric = "accuracy",
                              return_order = ["train_indices", "test_indices","train_data", "test_data", "predictor"])

train_indices = best_split["train_indices"]
test_indices = best_split["test_indices"]
predictor_weights = best_split["predictor"]

In [None]:
print(best_split.keys())
print(best_split["cfg"])
best_split["split_stats"]

Step 3: Save the weights and the indices (skip if inference)

In [43]:
output_folder="./Saved_Logs"
# output_folder="./saved_logs_16082023"

In [None]:
if not os.path.exists(output_folder):
    os.mkdir(output_folder)

torch.save(predictor_weights, os.path.join(output_folder, "predictor_weights.pth"))
save_pickle(best_split["cfg"], os.path.join(output_folder, "cfg.pkl"))
train_indices = pd.DataFrame(train_indices)
test_indices = pd.DataFrame(test_indices)
train_indices.to_csv(os.path.join(output_folder, "train_indices.csv") ,index = False)
test_indices.to_csv(os.path.join(output_folder, "test_indices.csv"), index = False)

INFERENCE

Step 1: Read in the saved indices

In [None]:
test_indices = pd.read_csv(os.path.join(output_folder, "test_indices.csv")).values[:, 0]
print(test_indices[:10])

In [None]:
# (can skip)
filename1 = 'ls/datasets/vast_data/vast_train.csv'
filename2 = 'ls/datasets/vast_data/vast_test.csv'

data_1 = pd.read_csv(filename1)
data_2 = pd.read_csv(filename2)
c_data = pd.concat([data_1, data_2])

In [None]:
#code to put the readable data into csv :>
# (can skip)
traindata = c_data.iloc[train_indices, :]
testdata = c_data.iloc[test_indices, :]

print(traindata.shape)
print(testdata.shape)

traindata.to_csv("final_121/train_data.csv", index = False)
testdata.to_csv("final_121/test_data.csv", index = False)

Step 1: Load in the trained predictor

In [45]:
predictor_weights_path = os.path.join(output_folder, "predictor_weights.pth")
predictor_weights = torch.load(predictor_weights_path, map_location = torch.device("cpu"))
cfg = load_pickle(os.path.join(output_folder, "cfg.pkl"))
# cfg = utils.read_config({})
# cfg['model']['name']='bert'
predictor = ModelFactory.get_model(cfg, predictor = True)
# predictor.load_state_dict(predictor_weights, strict = True)
predictor.eval()


bert(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True

Step 2: Get the test results and save the results

In [None]:
test_indices =pd.read_csv(os.path.join(output_folder,"test_indices.csv"),header=None).values[:, 0]

In [None]:
print(len(test_indices))

In [None]:
# Get the test results
test_results = get_predictions(data = data, test_indices = test_indices, predictor = predictor, cfg = cfg)
test_results["indices"] = test_indices.tolist()

save_json(test_results, os.path.join(output_folder, "test_results.json"))

Evaluating the model test 

In [37]:
test_results_pth = os.path.join(output_folder, "test_results.json")

# Read the JSON file
with open(test_results_pth, 'r') as json_file:
    alldata = json.load(json_file)

y_true = alldata['y']
y_pred= alldata['pred_y']

accuracy= accuracy_score(y_true, y_pred)

f1=f1_score(y_true, y_pred, average='weighted')

print(accuracy)
print(f1)
print(len(y_true))
print(len(y_pred))
print(collections.Counter(y_pred))


0.22792022792022792
0.13219020521217786
4212
4212
Counter({2: 3865, 0: 300, 1: 47})


In [46]:
test_results_pth = os.path.join(output_folder, "test_results.json")

# Read the JSON file
with open(test_results_pth, 'r') as json_file:
    alldata = json.load(json_file)

y_true = alldata['y']
y_pred= alldata['pred_y']

accuracy= accuracy_score(y_true, y_pred)

f1=f1_score(y_true, y_pred, average='weighted')

print(accuracy)
print(f1)
print(len(y_true))
print(len(y_pred))
print(collections.Counter(y_pred))

0.13897837091578463
0.03391633779116522
8692
8692
Counter({1: 8692})


In [3]:
text= "Regulation of corporations has been subverted by corporations. States that incorporate corporations are not equipped to regulate corporations that are rich enough to influence elections, are rich enough to muster a legal team that can bankrupt the state. Money from corporations and their principals cannot be permitted in the political process if democracy is to survive."

target= "companies"

In [30]:
import transformers
from transformers import BertTokenizer

def getsentiment(text,target):
    PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

    #concat and token and tensor all tgt        # padding to ensure all same length [total 400]
    outputs = tokenizer([text], [target], max_length = 400, padding = "max_length")

    # output keys --> input_ids, attention_mask and token_type_ids 
    # changing from list to tensor ( int [long])
    input_ids = torch.tensor(outputs["input_ids"], dtype=torch.long)  #tensored 
    input_masks = torch.tensor(outputs["attention_mask"], dtype = torch.long) #which one has things 
    segment_ids = torch.tensor(outputs["token_type_ids"], dtype = torch.long) #tells code which one is which list

    #concat all to one variable 
    _data = torch.stack((input_ids, input_masks, segment_ids), dim=2)  #following order in bert , dim=2 means it takes the first list first ele and put all tgt   

    #putting it in the same data format 
    t_data= [(_data[0],torch.tensor(0))]

    #getting our results , !! must load predictor above first !!
    results = get_predictions(data = t_data , test_indices = [0], predictor = predictor, cfg = cfg)
    # print(results)
    # a = z

    pred_y=results['pred_y'][0]

    if pred_y==0:
        sentiment="negative"
    elif pred_y==1:
        sentiment="positive"
    else:
        sentiment="neutral"

    prntout=f"Your predicted sentiment is {sentiment}"
    
    return prntout


In [31]:
print(getsentiment(text,target))

tensor([[-0.0587, -0.0806, -0.2851]], device='cuda:3')
Your predicted sentiment is negative
