In [1]:
# analyze data
import re
import random
import time
from statistics import mode

from PIL import Image
import numpy as np
import pandas
import torch
import torch.nn as nn
import torchvision
from torchvision import transforms
import torchvision.datasets as datasets

import os
import pandas as pd
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import csv

In [2]:
def process_text(text):
    """
    Process the given text by performing various transformations.

    Args:
        text (str): The input text to be processed.

    Returns:
        str: The processed text after applying the transformations.
    """
    
    # lowercase
    text = text.lower()

    # 数詞を数字に変換
    num_word_to_digit = {
        'zero': '0', 'one': '1', 'two': '2', 'three': '3', 'four': '4',
        'five': '5', 'six': '6', 'seven': '7', 'eight': '8', 'nine': '9',
        'ten': '10'
    }
    for word, digit in num_word_to_digit.items():
        text = text.replace(word, digit)

    # 小数点のピリオドを削除
    text = re.sub(r'(?<!\d)\.(?!\d)', '', text)

    # 冠詞の削除
    text = re.sub(r'\b(a|an|the)\b', '', text)

    # 短縮形のカンマの追加
    contractions = {
        "dont": "don't", "isnt": "isn't", "arent": "aren't", "wont": "won't",
        "cant": "can't", "wouldnt": "wouldn't", "couldnt": "couldn't"
    }
    for contraction, correct in contractions.items():
        text = text.replace(contraction, correct)

    # 句読点をスペースに変換
    text = re.sub(r"[^\w\s':]", ' ', text)

    # 句読点をスペースに変換
    text = re.sub(r'\s+,', ',', text)

    # 連続するスペースを1つに変換
    text = re.sub(r'\s+', ' ', text).strip()

    return text
def process_question(question):
    """
    Process the given question by performing various transformations.

    Args:
        question (str): The input question to be processed.

    Returns:
        str: The processed question after applying the transformations.
    """
    # lowercase
    question = question.lower()
    # remove articles
    question = re.sub(r'\b(a|an|the)\b', '', question)
    # remove punctuation
    question = re.sub(r"[^\w\s':]", ' ', question)
    # remove extra spaces
    question = re.sub(r'\s+', ' ', question).strip()
    return question



In [None]:
class VQADataset(torch.utils.data.Dataset):
    def __init__(self, df_path, image_dir, transform=None, answer=True, class_mapping=None):
        self.transform = transform
        self.image_dir = image_dir
        self.df = pandas.read_json(df_path)
        self.answer = answer
        self.class_mapping = class_mapping
        self.question2idx = {}
        self.answer2idx = {}
        self.idx2question = {}
        self.idx2answer = {}
        times = 0
        for question in self.df["question"]:
            question = process_question(question)
            words = question.split(" ")
            times += 1
            for word in words:
                word = process_question(word)
                if word not in self.question2idx:
                    self.question2idx[word] = len(self.question2idx)
        self.idx2question = {v: k for k, v in self.question2idx.items()}
        if self.answer:
            for answers in self.df["answers"]:
                for answer in answers:
                    word = answer["answer"]
                    word = process_text(word)
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            max_id = max(self.answer2idx.values())
            for answer, class_id in class_mapping.items():
                if answer not in self.answer2idx:
                    max_id += 1
                    self.answer2idx[answer] = max_id
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}

        if self.answer:
            # 回答に含まれる単語を辞書に追加
            for answers in self.df["answers"]:
                for answer in answers:
                    word = answer["answer"]
                    word = process_text(word)
                    if word not in self.answer2idx:
                        self.answer2idx[word] = len(self.answer2idx)
            # self.answer2idxの最大IDを取得
            max_id = max(self.answer2idx.values())
            print(max_id)

            # class_mappingのエントリを追加
            for answer, class_id in class_mapping.items():
                if answer not in self.answer2idx:
                    max_id += 1
                    self.answer2idx[answer] = max_id
            print(max_id)
            self.idx2answer = {v: k for k, v in self.answer2idx.items()}  # 逆変換用の辞書(answer)
    def update_dict(self, dataset):
        """
        検証用データ，テストデータの辞書を訓練データの辞書に更新する．

        Parameters
        ----------
        dataset : Dataset
            訓練データのDataset
        """
        self.question2idx = dataset.question2idx
        self.answer2idx = dataset.answer2idx
        self.idx2question = dataset.idx2question
        self.idx2answer = dataset.idx2answer

    def __getitem__(self, idx):
        """
        対応するidxのデータ（画像，質問，回答）を取得．

        Parameters
        ----------
        idx : int
            取得するデータのインデックス

        Returns
        -------
        image : torch.Tensor  (C, H, W)
            画像データ
        question : torch.Tensor  (vocab_size)
            質問文をone-hot表現に変換したもの
        answers : torch.Tensor  (n_answer)
            10人の回答者の回答のid
        mode_answer_idx : torch.Tensor  (1)
            10人の回答者の回答の中で最頻値の回答のid
        """
        image = Image.open(f"{self.image_dir}/{self.df['image'][idx]}")
        image = self.transform(image)
        question = np.zeros(len(self.idx2question) + 1)  # 未知語用の要素を追加
        question_words = self.df["question"][idx].split(" ")
        for word in question_words:
            word = process_question(word)
            try:
                # トークナイザーを使って単語をIDに変換　To do
                question[self.question2idx[word]] = 1  # one-hot表現に変換
            except KeyError:
                question[-1] = 1  # 未知語
        print(question)

    def __len__(self):
        return len(self.df)

    def get_class_mapping(self):
        return self.class_mapping



In [20]:
df_path="./data/train.json"
data = pandas.read_json(df_path)
data.head()
# data head = answers  data load 
# print(data.head())
for i in range(len(data["answers"])):
    colum = data["answers"][i]
    count = 0
    for j in colum:
        answer_confidence = j["answer_confidence"]
        if answer_confidence == "yes":
            count +=1
    if count == 0:
        print(colum)

answer_confidenceがyesでないものはないことがわかった。

In [22]:
for i in range(len(data["answers"])):
    colum = data["answers"][i]
    count = 0
    for j in colum:
        answer_confidence = j["answer_confidence"]
        if answer_confidence == "yes":
            count +=1
    if count <2:
        print(colum)

[{'answer_confidence': 'no', 'answer': 'red daisies'}, {'answer_confidence': 'maybe', 'answer': 'crepe myrtle'}, {'answer_confidence': 'no', 'answer': 'peony'}, {'answer_confidence': 'maybe', 'answer': 'poppies'}, {'answer_confidence': 'no', 'answer': 'poppy'}, {'answer_confidence': 'yes', 'answer': 'pink'}, {'answer_confidence': 'maybe', 'answer': 'carnations'}, {'answer_confidence': 'no', 'answer': 'red'}, {'answer_confidence': 'maybe', 'answer': 'kalanchoe'}, {'answer_confidence': 'no', 'answer': 'peonies'}]
[{'answer': 'unanswerable', 'answer_confidence': 'no'}, {'answer': 'foreign language', 'answer_confidence': 'maybe'}, {'answer': 'unanswerable', 'answer_confidence': 'no'}, {'answer': 'instructions', 'answer_confidence': 'maybe'}, {'answer': 'french', 'answer_confidence': 'maybe'}, {'answer': 'unanswerable', 'answer_confidence': 'no'}, {'answer': 'unanswerable', 'answer_confidence': 'no'}, {'answer': 'spanish', 'answer_confidence': 'maybe'}, {'answer': 'unanswerable', 'answer_co

In [25]:
df_path="./data/train.json"
data = pandas.read_json(df_path)
for i in range(10):
    colum = data["answers"][i]
    count = 0
    for j in colum:        
        answers = [answer for answer in colum if answer["answer_confidence"] == "yes"]
        print(answers)


[{'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'flat iron beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'steak'}, {'answer_confidence': 'yes', 'answer': 'flat iron beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}]
[{'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'flat iron beef chuck steak'}, {'answer_confidence': 'yes', 'answer': 'beef chuck steak'}, {'answer_confidence

トークナイザーの編集に関して
 

In [26]:
import re

def tokenizer(word_list):
    tokens = []
    for word in word_list:
        # 正規表現を使用して単語を抽出
        split_tokens = re.findall(r'\b\w+\b', word)
        tokens.extend(split_tokens)
    return tokens

# 入力リスト
word_list = ['What', 'color', 'is', 'this', 'bag?', 'It', 'takes', 'a', 'little', 'while.', 'Well,', 'because', 'I', 'just', 'asked', 'it', 'what', 'color.']

# トークナイズ実行
tokens = tokenizer(word_list)

# 結果の表示
print(tokens)

['What', 'color', 'is', 'this', 'bag', 'It', 'takes', 'a', 'little', 'while', 'Well', 'because', 'I', 'just', 'asked', 'it', 'what', 'color']


In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# 入力リスト
word_list = ['What', 'color', 'is', 'this', 'bag?', 'It', 'takes', 'a', 'little', 'while.', 'Well,', 'because', 'I', 'just', 'asked', 'it', 'what', 'color.']

# 単語を結合して1つの文章にする
text = ' '.join(word_list)

# CountVectorizerを使用してベクトル化
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([text])

# 結果の表示
print("Feature names:", vectorizer.get_feature_names_out())
print("Vectorized text:\n", X.toarray()[0])


Feature names: ['asked' 'bag' 'because' 'color' 'is' 'it' 'just' 'little' 'takes' 'this'
 'well' 'what' 'while']
Vectorized text:
 [1 1 1 2 1 2 1 1 1 1 1 2 1]


In [30]:
from transformers import BertTokenizer, BertModel
import torch

# BERTトークナイザーとモデルをロード
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# 入力リスト
word_list = ['What', 'color', 'is', 'this', 'bag?', 'It', 'takes', 'a', 'little', 'while.', 'Well,', 'because', 'I', 'just', 'asked', 'it', 'what', 'color.']

# 単語を結合して1つの文章にする
text = ' '.join(word_list)

# テキストをトークナイズし、BERTモデルに入力できる形式に変換
inputs = tokenizer(text, return_tensors='pt')
print(inputs)
# BERTモデルでテキストを処理
with torch.no_grad():
    outputs = model(**inputs)

# 出力の最初のトークン([CLS]トークン)のベクトルを取得
cls_embedding = outputs.last_hidden_state[0, 0, :]

# 結果の表示
print("CLS token embedding vector:")
print(cls_embedding)


{'input_ids': tensor([[ 101, 2054, 3609, 2003, 2023, 4524, 1029, 2009, 3138, 1037, 2210, 2096,
         1012, 2092, 1010, 2138, 1045, 2074, 2356, 2009, 2054, 3609, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
CLS token embedding vector:
tensor([-2.2819e-01,  1.7407e-01,  1.5334e-01, -3.9105e-01, -5.1130e-01,
        -2.5553e-01,  8.9660e-02,  3.0045e-01,  2.6840e-01, -1.6184e-01,
         3.5439e-02, -1.2198e-01,  1.3160e-01,  2.3521e-01,  2.4242e-02,
         1.7856e-01,  2.0383e-02,  4.0325e-01,  1.2706e-01,  9.7176e-02,
        -1.6011e-02,  2.7018e-01, -2.2777e-01, -7.5980e-02,  2.1322e-01,
         1.3420e-02,  1.3515e-01, -2.0806e-02,  2.4036e-02,  7.8533e-02,
         8.5791e-02, -7.6553e-02, -4.8730e-01, -2.2779e-01,  3.2381e-01,
        -1.9414e-01,  1.3784e-01,  1.8490e-01,  1.1177e-01,  6.4059e-03,
     

In [31]:
class Tokenizer:
    def __init__(self, question2idx):
        self.question2idx = question2idx

    def process_question(self, word):
        # ここに単語を処理するコードを書く
        pass

    def tokenize(self, question_words):
        question = [0] * len(self.question2idx)  # 単語の数だけ0を持つリストを作成
        for word in question_words:
            word = self.process_question(word)
            try:
                question[self.question2idx[word]] = 1  # one-hot表現に変換
            except KeyError:
                question[-1] = 1  # 未知語
        return question


In [32]:
from transformers import BertTokenizer

class BertTokenizerWrapper:
    def __init__(self, pretrained_model_name):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name)

    def tokenize(self, text):
        # BERTのトークナイザーはテキストを直接受け取り、トークンのリストを返します
        tokens = self.tokenizer.tokenize(text)
        # トークンをIDに変換
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        return token_ids

In [34]:
tokenizer = BertTokenizerWrapper('bert-base-uncased')
tokenized_text = tokenizer.tokenize("Hello, world!")
print(tokenized_text)

[7592, 1010, 2088, 999]


In [6]:
import torch
from transformers import BertTokenizer

# トークナイザーの初期化
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# データのトークナイズと固定長へのパディング
texts = ["Example sentence one.", "Example sentence two is a bit longer."]
inputs = tokenizer(texts, padding='max_length', truncation=True, max_length=10, return_tensors='pt')
print(inputs)
# パディング（0）を除去
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
non_padded_indices = attention_mask.nonzero(as_tuple=True)
input_ids = input_ids[non_padded_indices]
attention_mask = attention_mask[non_padded_indices]

# バッチごとの最大長に再度パディング
max_len = attention_mask.sum(dim=0).max().item()
padded_input_ids = torch.nn.functional.pad(input_ids, (0, max_len - input_ids.size(0)))
padded_attention_mask = torch.nn.functional.pad(attention_mask, (0, max_len - attention_mask.size(0)))

# バッチ化
batch = {'input_ids': padded_input_ids, 'attention_mask': padded_attention_mask}
print(batch)


{'input_ids': tensor([[ 101, 2742, 6251, 2028, 1012,  102,    0,    0,    0,    0],
        [ 101, 2742, 6251, 2048, 2003, 1037, 2978, 2936, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([ 101, 2742, 6251, 2028, 1012,  102,  101, 2742, 6251, 2048, 2003, 1037,
        2978, 2936, 1012,  102]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}
