In [1]:
! pip install pytorch_pretrained_bert
! pip install torchmetrics
! pip install -U kaleido

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch_pretrained_bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 5.1 MB/s 
Collecting boto3
  Downloading boto3-1.26.26-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 43.0 MB/s 
Collecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting botocore<1.30.0,>=1.29.26
  Downloading botocore-1.29.26-py3-none-any.whl (10.2 MB)
[K     |████████████████████████████████| 10.2 MB 44.9 MB/s 
[?25hCollecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 4.8 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.13-py2.py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 18.6 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.w

In [2]:
# Mount Google Drive first: the project folder added to sys.path below lives
# under /content/drive, so the local imports only work after the mount.
from google.colab import drive
drive.mount('/content/drive')

import sys
# Put the Capstone folder at the front of the import path; the `utils`,
# `build_model` and `analysis` modules imported next are presumably located
# there (verify if the folder layout changes).
sys.path.insert(0, '/content/drive/MyDrive/Colab Notebooks/Capstone')
from utils import read_conll_file, read_data, filter_tag
from utils import TAG2IDX, IDX2TAG, DATA_DIR, WSJ_DIR, MODEL_DIR, RESULT_DIR, PLOT_TAGS_DIR
from utils import wsj_train_word_lst, wsj_train_tag_lst, wsj_test_word_lst, wsj_test_tag_lst

from build_model import PosDataset, Net, DEVICE, TOKENIZER
# NOTE: importing `eval` here shadows Python's built-in eval() for the rest of
# the notebook.
from build_model import pad, train, eval

from analysis import analysis_output

import os
import re
from collections import Counter
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
# NOTE(review): `tqdm_notebook` is a deprecated alias in recent tqdm releases;
# `from tqdm.auto import tqdm` is the suggested replacement -- confirm before changing.
from tqdm import tqdm_notebook as tqdm

import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer, BertModel
from torchmetrics.functional.classification import multiclass_f1_score, multiclass_precision, multiclass_recall, multiclass_accuracy

# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(0)


Mounted at /content/drive
The number of samples: 30060
The number of tags 48
The number of samples: 1336
The number of tags 45
The number of samples: 1640
The number of tags 45


100%|██████████| 213450/213450 [00:00<00:00, 3659797.03B/s]


<torch._C.Generator at 0x7f4884d7bad0>

In [3]:
# Target domain whose self-learning results are inspected in this notebook.
domain = "newsgroups"

# Both output trees share the same "<base>/Online_nonfixed_self_learning/<domain>"
# layout, so derive the two folders from their respective base directories.
sub_result_dir, sub_plots_tags_dir = (
    os.path.join(base_dir, "Online_nonfixed_self_learning", domain)
    for base_dir in (RESULT_DIR, PLOT_TAGS_DIR)
)

In [4]:
# Hyper-parameters identifying which self-learning run to load.
threshold = 0.02
top_percent = 0.6

# Result files for one run are named like "top0.6-threshold0.02-loop3.csv".
# The negative lookahead keeps threshold=0.02 from also matching a sibling run
# such as "threshold0.025", which a plain substring test would accept.
run_tag_pattern = re.compile(
    re.escape(f"top{top_percent}-threshold{threshold}") + r"(?!\d)"
)

csv_lst = sorted(
    (
        name
        for name in os.listdir(sub_plots_tags_dir)
        # Require a real ".csv" suffix; `"csv" in name` would also accept
        # names such as "x.csv.bak".
        if name.endswith(".csv") and run_tag_pattern.search(name)
    ),
    # Order numerically by the last run of digits in the name (the loop index),
    # so that loop10 sorts after loop9.
    key=lambda name: int(re.findall(r"\d+", name)[-1]),
)
csv_lst

['top0.6-threshold0.02-loop0.csv',
 'top0.6-threshold0.02-loop1.csv',
 'top0.6-threshold0.02-loop2.csv',
 'top0.6-threshold0.02-loop3.csv',
 'top0.6-threshold0.02-loop4.csv',
 'top0.6-threshold0.02-loop5.csv',
 'top0.6-threshold0.02-loop6.csv',
 'top0.6-threshold0.02-loop7.csv',
 'top0.6-threshold0.02-loop8.csv']

In [5]:
# Assemble one wide table: the tag metadata columns from the first CSV plus one
# accuracy column per self-training loop (column label = loop position).
if not csv_lst:
  # Fail here with a clear message instead of leaving all_output_df as None
  # and hitting an obscure TypeError in a later cell.
  raise FileNotFoundError(
      f"No matching per-tag accuracy CSV files found in {sub_plots_tags_dir}")

all_output_df = None

for loopi, name in enumerate(csv_lst):
  csv_file_name = os.path.join(sub_plots_tags_dir, name)
  output_i = pd.read_csv(csv_file_name)
  if all_output_df is None:
    # .copy() makes an independent frame; adding columns to a bare column
    # selection of output_i triggers pandas' SettingWithCopyWarning.
    all_output_df = output_i[["POS_id", "POS_tags", "cnt"]].copy()

  # Assignment aligns on the row index, so this assumes every CSV lists the
  # tags in the same row order -- TODO confirm against the code that writes them.
  all_output_df[loopi] = output_i["acc"]

In [6]:
# Report how many rows were loaded, then order the rows from the largest
# to the smallest "cnt" value.
print(all_output_df.shape[0])
all_output_df = all_output_df.sort_values("cnt", ascending=False)

47


In [7]:
# Bar chart of the "cnt" column for every POS tag, in the current row order
# of all_output_df.
fig_bar = make_subplots()

# Add traces
fig_bar.add_trace(
    go.Bar(
        x=all_output_df["POS_tags"], y=all_output_df["cnt"],
        name="count", opacity=0.5)
)

# Give the figure a title so it is readable on its own.
fig_bar.update_layout(title_text="Tag counts")

# Set x-axis title (was the leftover placeholder "xaxis title")
fig_bar.update_xaxes(title_text="POS tag")

# Set y-axes titles; as the last expression of the cell, the returned figure
# is what the notebook renders.
fig_bar.update_yaxes(title_text="<b>The number of tags</b>", secondary_y=False)

In [8]:
def show_acc_change(range_lst=range(10), all_output_df=None):
  """Plot how accuracy evolves over the self-training loops for selected tags.

  The frame is sorted by descending "cnt"; each row position in `range_lst`
  then becomes one line whose name is that row's "POS_tags" value and whose
  points are the values in every column after the first three.

  Args:
    range_lst: iterable of integer row positions (in the sorted frame) to plot.
    all_output_df: DataFrame with "POS_tags" and "cnt" columns, whose columns
      from the fourth onward hold the per-loop accuracy values. When omitted,
      the notebook-level `all_output_df` is looked up at call time.

  Raises:
    NameError: if no frame is passed and no notebook-level `all_output_df`
      exists.
    IndexError: if a position in `range_lst` is outside the frame.
  """
  if all_output_df is None:
    # Resolve the default at call time. A default of `all_output_df=all_output_df`
    # is evaluated once when the def runs, so it would keep pointing at a stale
    # frame after the table is rebuilt (and fail if the def cell runs first).
    all_output_df = globals().get("all_output_df")
    if all_output_df is None:
      raise NameError(
          "all_output_df is not defined; build it first or pass a frame explicitly")

  # Sort locally so row positions always refer to the cnt-descending order,
  # regardless of the order of the frame that was passed in.
  all_output_df = all_output_df.sort_values(by="cnt", ascending=False)

  # Single-panel figure (no secondary y-axis is created).
  fig = make_subplots()

  for i in range_lst:
    row = all_output_df.iloc[i]
    tag = row["POS_tags"]
    # Skip the three leading metadata columns; the rest are per-loop accuracies.
    acc_lst = row.tolist()[3:]

    fig.add_trace(
        go.Scatter(
            x=list(range(len(acc_lst))), y=acc_lst,
            mode='markers+lines', name=tag)
    )

  # Add figure title
  fig.update_layout(title_text="How accuracy change during self-training")

  # Set x-axis title
  fig.update_xaxes(title_text="Loop")

  # Set y-axes titles
  fig.update_yaxes(title_text="<b>Accuracy</b>", secondary_y=False)

  # Deliberately no return value: returning the figure would make the notebook
  # render it a second time when the call is the last expression of a cell.
  fig.show()

In [9]:
# Accuracy curves for row positions 0-9 of the cnt-descending table.
show_acc_change(range_lst=range(10), all_output_df=all_output_df)

In [10]:
# Accuracy curves for row positions 10-19 of the cnt-descending table.
show_acc_change(range_lst=range(10, 20), all_output_df=all_output_df)

In [11]:
# Accuracy curves for row positions 20-34 of the cnt-descending table.
show_acc_change(range_lst=range(20, 35), all_output_df=all_output_df)

In [12]:
# Accuracy curves for the remaining rows (position 35 through the last row).
show_acc_change(range_lst=range(35, len(all_output_df)), all_output_df=all_output_df)

In [None]:
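# NOTE: disabled draft of the plot now produced by show_acc_change(range(10));
# kept for reference only.
# # Create figure with secondary y-axis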

# fig = make_subplots()

# for i in range(10):
  
#   tag = all_output_df.iloc[i]["POS_tags"]
#   acc_lst = all_output_df.iloc[i].tolist()[3:]

#   fig.add_trace(
#       go.Scatter(
#           x=list(range(len(acc_lst))), y=acc_lst,
#           mode='markers+lines', name=tag)
#   )

# # Add figure title
# fig.update_layout(
#     title_text="How accuracy change during self-training"
# )

# # Set x-axis title
# fig.update_xaxes(title_text="xaxis title")

# # Set y-axes titles
# fig.update_yaxes(title_text="<b>Accuracy</b>", secondary_y=False)

In [None]:
# Disabled alternative: plot every tag's accuracy curve with seaborn

# fig = plt.figure(figsize=(20,6))

# for i in range(len(all_output_df)):
#   tag = all_output_df.iloc[i]["POS_tags"]
#   acc_lst = all_output_df.iloc[i].tolist()[3:]
#   p1 = sns.scatterplot(x=range(len(acc_lst)), y=acc_lst, s=30, legend=False, label=tag)  
#   sns.lineplot(x=range(len(acc_lst)), y=acc_lst)

# # plt.legend(loc='lower left', ncol=3, fontsize=8)

# # Put a legend to the right of the current axis
# plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))