In [1]:
import wandb
import torch
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs
from sklearn.metrics import f1_score, accuracy_score

ModuleNotFoundError: No module named 'wandb'

In [None]:
# Check GPU availability: True when torch reports a usable CUDA device.
# The flag is later passed as `use_cuda` when the model is constructed.
cuda_available: bool = torch.cuda.is_available()

In [None]:
# import data
data = pd.read_csv('OneHot_Combined_cln_utf8.tsv', sep='\t')
# Keep only the last 1000 rows. Take an explicit copy so that the column
# assignment below writes to an independent frame instead of a slice of the
# frame returned by read_csv (which can trigger SettingWithCopyWarning).
data = data.iloc[-1000:, :].copy()

# reformat data
# One-hot label columns, 'SDG1' .. 'SDG17', in that order.
sdg_lst = [f'SDG{i}' for i in range(1, 18)]
# Collapse the one-hot columns into a single list-valued label per row.
data['y'] = data[sdg_lst].values.tolist()
y = data['y']
X = data['abstract']

# plot ratio of data
# Number of positive labels per class, computed once and reused below.
label_totals = data[sdg_lst].sum()
# Share of all positive labels that falls into each class (sums to 1).
class_weight = label_totals / label_totals.sum()
print('% PER CLASS:\n\n', class_weight*100)
ax = label_totals.plot.bar()
ax.set(title='Positive labels per SDG', xlabel='SDG', ylabel='count')

# split data
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=42)

# add data to dataframes
# Text and labels of each split share the same index, so they align row by
# row; the index is then reset to 0..n-1.
train_df = pd.DataFrame({'text': X_train, 'labels': y_train}).reset_index(drop=True)
eval_df = pd.DataFrame({'text': X_val, 'labels': y_val}).reset_index(drop=True)

# get number of classes
label_count = len(sdg_lst)

In [None]:
# parameter optimisation
# Sweep definition: search strategy, the metric to optimise, and the search
# space. Each parameter is given either as a min/max range or as an explicit
# list of candidate values.
sweep_config = dict(
    method="bayes",  # bayes, grid, random
    metric=dict(name="train_loss", goal="minimize"),
    parameters=dict(
        num_train_epochs=dict(min=1, max=10),
        learning_rate=dict(min=5e-5, max=4e-4),
        train_batch_size=dict(min=5, max=15),
        eval_batch_size=dict(min=5, max=15),
        warmup_steps=dict(min=50, max=500),
        weight_decay=dict(min=0.01, max=0.1),
        logging_steps=dict(values=[2, 5, 10]),
    ),
)

# Register the sweep under the given entity/project and keep its id so that
# an agent can be attached to it later.
sweep_id = wandb.sweep(
    sweep_config,
    entity='sasdghub',
    project="sasdghub_ml_classify",
)

# logging
# INFO for the root logger, but only WARNING and above for "transformers".
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Optional model configuration
# Fixed settings shared by every run started from this notebook.
model_args = MultiLabelClassificationArgs(
    fp16=False,
    threshold=0.8,
    manual_seed=4,
    use_multiprocessing=True,
    overwrite_output_dir=True,
    evaluate_during_training=True,
    # Deliberately not set here (values previously tried by hand are kept
    # for reference):
    #   wandb_project='sasdghub_ml_classify', wandb_kwargs={'entity': 'sasdghub'},
    #   num_train_epochs=1, train_batch_size=16, eval_batch_size=64,
    #   warmup_steps=500, weight_decay=0.01, logging_steps=10,
    #   learning_rate=5e-5
    # NOTE(review): the training hyperparameters above are presumably
    # supplied per run through the sweep configuration - confirm.
)

def train():
    """Run one sweep trial: train and evaluate an XLNet multi-label classifier.

    Takes no arguments so that it can be handed to ``wandb.agent``. It starts
    a W&B run, builds the model from the module-level ``model_args``,
    ``label_count``, ``cuda_available`` and ``class_weight``, and passes
    ``wandb.config`` to the model as ``sweep_config``. The model is trained on
    ``train_df`` (with ``eval_df`` as evaluation data) and then evaluated once
    more on ``eval_df``.

    Returns:
        None.

    Raises:
        Exception: any error raised while building, training or evaluating
            the model is re-raised after the W&B run has been closed with a
            non-zero exit code.
    """
    # Initialize a new wandb run
    wandb.init()

    try:
        # Create a MultiLabelClassificationModel
        model = MultiLabelClassificationModel(
            "xlnet",
            "xlnet-base-cased",
            num_labels=label_count,
            args=model_args,
            use_cuda=cuda_available,
            # One weight per class: 1 minus that class's share of all
            # positive labels.
            pos_weight=list(1-class_weight),
            # show_running_loss=True,
            sweep_config=wandb.config,
        )

        # Train the model
        model.train_model(train_df,
                          verbose=True,
                          eval_df=eval_df)

        # Evaluate the model (the returned values are not used here)
        model.eval_model(
            eval_df,
            verbose=True,
        )
    except Exception:
        # Close the run as failed so it is not left open, then let the
        # caller see the original error.
        wandb.finish(exit_code=1)
        raise

    # Sync wandb; finish() replaces the deprecated join() alias.
    wandb.finish()
    
# Attach an agent to the sweep created above, using train() as the function
# it runs.
# NOTE(review): no `count` is passed, so the number of runs is not capped
# here - confirm this is intended before using "Run All".
wandb.agent(sweep_id, function=train)

In [None]:
# train with optimal parameters
# # logging
# logging.basicConfig(level=logging.INFO)
# transformers_logger = logging.getLogger("transformers")
# transformers_logger.setLevel(logging.WARNING)

# # Optional model configuration
# model_args = MultiLabelClassificationArgs(fp16= False,
#                                           threshold=0.8,
#                                           manual_seed = 4,
#                                           use_multiprocessing = True,
#                                           overwrite_output_dir=True,
#                                           evaluate_during_training = True,
#                                           wandb_project = 'sasdghub_ml_classify',
#                                           wandb_kwargs={
#                                               'entity':'sasdghub'
#                                                        },
#                                           num_train_epochs=1,
#                                           train_batch_size= 16,
#                                           eval_batch_size= 64,
#                                           warmup_steps= 500,
#                                           weight_decay= 0.01,
#                                           logging_steps= 10,
#                                           learning_rate= 5e-5
                                          
#                                          )

# # Create a MultiLabelClassificationModel
# model = MultiLabelClassificationModel(
#     "xlnet",
#     "xlnet-base-cased",
#     num_labels=label_count,
#     args=model_args,
#     use_cuda=cuda_available,
#     pos_weight=list(1-class_weight),
    # show_running_loss=True,
    # sweep_config=wandb.config,
# )

# # Train the model
# model.train_model(train_df,
#                   verbose=True,
#                   eval_df=eval_df)

# # Evaluate the model
# result, model_outputs, wrong_predictions = model.eval_model(
#     eval_df,
#     verbose=True,
# )

In [None]:
# Make predictions with the model
# predictions, raw_outputs = model.predict(["sanitation and clean drinking water"])

In [None]:
########################################################################################
# modify so it can digest longer text (split training data into paragraphs that can be digested)

# save model
# apply new model in apply script

# make testing script with ROC curves and confusion matrices

# get model embedding
# train classifier for targets