In [None]:
!pip install -U -q transformers datasets pythainlp

In [None]:
!pip install -U -q accelerate

In [31]:
import pandas as pd
import matplotlib.pyplot as plt
import random
from transformers import AutoTokenizer, AutoModelForTokenClassification, CamembertTokenizer, CamembertForSequenceClassification
import re
from pythainlp.tokenize import word_tokenize
import torch
from tqdm import tqdm
import random
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
from transformers import TrainingArguments
from transformers import EarlyStoppingCallback, IntervalStrategy
from sklearn.metrics import accuracy_score, f1_score
from transformers import Trainer

In [32]:
ner_tokenizer = AutoTokenizer.from_pretrained(
    "pythainlp/thainer-corpus-v2-base-model"
)

ner_model = AutoModelForTokenClassification.from_pretrained(
    "pythainlp/thainer-corpus-v2-base-model"
).to("cuda")

In [33]:
def fix_span_error(words,ner):
    _ner = []
    _ner=ner
    _new_tag=[]
    
    for i,j in zip(words,_ner):
        i=ner_tokenizer.decode(i)
        if i.isspace() and j.startswith("B-"):
            j="O"
        if i=='' or i=='<s>' or i=='</s>':
            continue
        if i=="<_>":
            i=" "
        _new_tag.append((i,j))

    return _new_tag

def clean_pipeline(text):
#     text = re.sub("(สามารถ|)(ลง|)ลาย(มือ|)ชื่อ(ของ|)", "", text)
    text = text.replace("สํา", "สำ")
    text = re.sub("(และ|)ประทับตราสำคัญของบริษัท", "", text)
    text = re.sub("(และ|พร้อม|)ประทับตราสำคัญของบริษัท", "", text)
    text = re.sub("(และ|)ประทับตราบริษัท(เป็นสำคัญ|)", "", text)
    text = re.sub("(และ|)ผูกพันบริษัท", "", text)
    
    return text

def get_ner_tag(text: str):
    cut=word_tokenize(text.replace(" ", "<_>"))
    inputs=ner_tokenizer(cut,is_split_into_words=True,return_tensors="pt").to("cuda")

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    # forward pass
    outputs = ner_model(ids, attention_mask=mask)
    logits = outputs[0]

    predictions = torch.argmax(logits, dim=2)
    predicted_token_class = [ner_model.config.id2label[t.item()] for t in predictions[0]]

    ner_tag=fix_span_error(inputs['input_ids'][0],predicted_token_class)
    return ner_tag

def clean_ner_tag(ner_tag):
    # Clean PERSON tag
    new_ner_tag = []
    prev_tag = ner_tag[0][1]

    for i, tag in enumerate(ner_tag):
        new_tag = tag[1]
        if tag[1] == "I-PERSON" and not (prev_tag == "B-PERSON" or prev_tag == "I-PERSON"):
            new_tag = "O"
        if tag[1] == "O" and (prev_tag == "B-PERSON" or prev_tag == "I-PERSON") and i+1 < len(ner_tag):
            if ner_tag[i+1][1] == "I-PERSON":
                new_tag = "I-PERSON"
            
        new_ner_tag.append((tag[0], new_tag))
        prev_tag = new_tag

    return new_ner_tag

def get_symbolic_context(ner_tag) -> str:
    last_context = ""
    prev = "O"
    current = []
    organization_word = []
    
    for tag in new_ner_tag:
        if tag[1][1:] == "ORGANIZATION":
            organization_word.append(tag[1][1:])
            
        if tag[1] == "B-PERSON":
            if prev == "I-PERSON":
                last_context += "</p>"
            last_context += "<p>" + tag[0]
            continue
        
        if tag[1] != prev and prev == "I-PERSON":
            last_context += "</p>"
        last_context += tag[0]
        prev = tag[1]
    
    if prev == "I-PERSON":
        last_context += "</p>"
    
    for organization in organization_word:
        last_context = last_context.replace(organization, "")
    return last_context

def get_clean_context_and_people_list(context):
    context = re.sub(r"</p>(.{0,8})<p>", r"\1", context)
    people_list = re.findall("<p>(.*?)</p>", context)
    context = re.sub("(<p>.*?</p>)", "<p>", context)
    context = re.sub("</p>", "", context)
    
    return context, people_list

def get_name_from_committee(rgno, committee_df):
    sample = committee_df[committee_df['rgno'] == rgno]
    names = []
    
    for idx, row in sample.iterrows():
        names.append(row['title'] + row['fname'] + " " + row['lname'])
    name_text = "".join(names)
    return name_text

def preprocess_context(context, rgno, committee_df):
    name = get_name_from_committee(rgno, committee_df)
    context = re.sub("(กรรมการคนใดคนหนึ่ง|กรรมการอื่นอีก)", name, context)
    return context

In [34]:
df = pd.read_csv("/kaggle/input/legal-act-classification/train.csv")
df['difficultly'] = df['pattern'].apply(lambda x: str(x)[0])

commitee = pd.read_csv("/kaggle/input/legal-act-classification/committee.csv", index_col=0)

In [35]:
df

Unnamed: 0,id,rgno,context,pattern,question,legal_act,condition,answer,difficultly
0,0,1.055290e+11,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,11016,['พศิน บัวขาว'],การทำนิติกรรม สำนักงานตรวจคนเข้าเมือง,,0,1
1,1,1.055290e+11,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,11016,['นภัสกร แซ่เนี้ยว'],การทำนิติกรรม กรมทางหสวง,,0,1
2,2,1.055290e+11,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,11016,['ภูวสิษฏ์ วิภาสชีวิน'],อสังหาริมทรัพย์,,0,1
3,3,1.055290e+11,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,11016,['ธนันทิกานต์ ราชาเดช'],การทำนิติกรรม หน่วยราชการวิสาหกิจ,,0,1
4,4,1.055290e+11,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,11016,['นรากรณ์ ดีเย็น'],อสังหาริมทรัพย์,,0,1
...,...,...,...,...,...,...,...,...,...
4424,4424,1.055260e+11,นายอุกฤษฏ์ ดำแดง หรือนายฐีรชัย พรสุริยะศักดิ์ ...,20010,"['อานนท์ แซ่อึ่ง', 'ฐีรชัย พรสุริยะศักดิ์', 'ธ...",ใบอนุญาตจำหน่ายสุราและจำหน่ายยาสูบ,,1,2
4425,4425,1.055260e+11,นายอุกฤษฏ์ ดำแดง หรือนายฐีรชัย พรสุริยะศักดิ์ ...,20010,"['อานนท์ แซ่อึ่ง', 'อุกฤษฏ์ ดำแดง', 'ธีรพล แสง...",การทำนิติกรรม กรมศุลกากร,,1,2
4426,4426,1.055260e+11,นายอุกฤษฏ์ ดำแดง หรือนายฐีรชัย พรสุริยะศักดิ์ ...,20010,"['อุกฤษฏ์ ดำแดง', 'ฐีรชัย พรสุริยะศักดิ์', 'ธี...",การทำนิติกรรมผูกพันส่วนงานราชการ รัฐวิสาหกิจ,,0,2
4427,4427,1.055260e+11,นายอุกฤษฏ์ ดำแดง หรือนายฐีรชัย พรสุริยะศักดิ์ ...,20010,"['อานนท์ แซ่อึ่ง', 'ฐีรชัย พรสุริยะศักดิ์', 'ธ...",การทำนิติกรรม การนิคมอุตสาหกรรม,,1,2


In [36]:
"""

    Preprocess Training data

"""

train_data_list = []
train_answers_list = []

for idx in tqdm(range(len(df))):
    try: 
        context = df.iloc[idx].context
        rgno = df.iloc[idx].rgno
#         context = preprocess_context(context, rgno, commitee)
        ner_tag = get_ner_tag(context)
        new_ner_tag = clean_ner_tag(ner_tag)
        
    except RuntimeError as E:
        print(E)
        continue
    
    last_context = get_symbolic_context(new_ner_tag)
    last_context, people_list = get_clean_context_and_people_list(last_context)
    
    num_p_tag = last_context.count("<p>")
    inputs = df.iloc[idx].question
    inputs = inputs.replace("[", "").replace("]", "").replace("\'", "")
    inputs = inputs.split(", ")
    
    people_joined = []
    blacklist = []
    for i in range(num_p_tag):
        count = 0
        for name in inputs:
            name = name.replace(" ", "")
            if name in people_list[i].replace(" ", "") and name not in blacklist:
                count += 1
                people_joined.append(name)
                blacklist.append(name)

        _idx = last_context.find("<p>")
        if count == 0:
            last_context = last_context[:_idx] + f" <ไม่มีผู้ใดลงนามได้> " + last_context[_idx+len("<p>"):]
        else:
            last_context = last_context[:_idx] + f" <ลงนาม{count}คน> " + last_context[_idx+len("<p>"):]
    
    non_people = len([name for name in inputs if name.replace(" ", "") not in blacklist])
    
    if non_people != 0:
        last_context += f" <ไม่ใช่กรรมการ {non_people} คน>"
    
    #if not len(people_joined):
    last_context += f" <{len(inputs)}คน>#" + str(df.iloc[idx].legal_act)

    last_context = re.sub(" +", " ", last_context.strip())
    train_data_list.append(last_context)
    train_answers_list.append(df.iloc[idx].answer)

 46%|████▋     | 2057/4429 [00:45<01:27, 27.05it/s]

The expanded size of the tensor (668) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 668].  Tensor sizes: [1, 512]
The expanded size of the tensor (668) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 668].  Tensor sizes: [1, 512]
The expanded size of the tensor (668) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 668].  Tensor sizes: [1, 512]


 49%|████▉     | 2165/4429 [00:49<01:06, 34.12it/s]

The expanded size of the tensor (1150) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 1150].  Tensor sizes: [1, 512]
The expanded size of the tensor (1150) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 1150].  Tensor sizes: [1, 512]
The expanded size of the tensor (1150) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 1150].  Tensor sizes: [1, 512]
The expanded size of the tensor (1150) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 1150].  Tensor sizes: [1, 512]
The expanded size of the tensor (1150) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 1150].  Tensor sizes: [1, 512]
The expanded size of the tensor (1150) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 1150].  Tensor sizes: [1, 512]


 68%|██████▊   | 2996/4429 [01:19<00:41, 34.51it/s]

The expanded size of the tensor (866) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 866].  Tensor sizes: [1, 512]
The expanded size of the tensor (866) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 866].  Tensor sizes: [1, 512]
The expanded size of the tensor (866) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 866].  Tensor sizes: [1, 512]
The expanded size of the tensor (866) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 866].  Tensor sizes: [1, 512]
The expanded size of the tensor (866) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 866].  Tensor sizes: [1, 512]
The expanded size of the tensor (866) must match the existing size (512) at non-singleton dimension 1.  Target sizes: [1, 866].  Tensor sizes: [1, 512]


100%|██████████| 4429/4429 [01:45<00:00, 41.79it/s]


In [37]:
clean_data_list = [text.replace("พรมฝ้าย", "") for text in train_data_list]

In [38]:
train = pd.DataFrame({"input": clean_data_list, "output": train_answers_list})
train.to_csv("train-preprocessv5.csv", index=False)

In [39]:
train = train.drop_duplicates()

In [40]:
train

Unnamed: 0,input,output
0,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,0
1,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,0
2,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,0
3,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,0
5,กรรมการคนใดคนหนึ่งลงลายมือชื่อร่วมกับกรรมการอื...,0
...,...,...
4409,<ลงนาม1คน> กรรมการกลุ่ม 1 <ลงนาม1คน> กรรมการกล...,1
4410,<ไม่มีผู้ใดลงนามได้> กรรมการกลุ่ม 1 <ลงนาม1คน>...,1
4411,<ลงนาม1คน> กรรมการกลุ่ม 1 <ลงนาม1คน> กรรมการกล...,0
4412,<ลงนาม1คน> กรรมการกลุ่ม 1 <ลงนาม1คน> กรรมการกล...,1


In [42]:
tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

In [17]:
train['token'] = train['input'].apply(lambda x: len(tokenizer(x)['input_ids']))

In [19]:
train = train[train['token'] < 270]

## Training

In [44]:
from transformers import XLMRobertaForSequenceClassification

In [45]:
model = XLMRobertaForSequenceClassification.from_pretrained(
    "FacebookAI/xlm-roberta-large")
# tokenizer = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large")

In [46]:
TOKEN = 270

In [47]:
class CustomImageDataset(Dataset):
    def __init__(self, dataframe, tokenizer):
        self.dataframe = dataframe
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        encode = self.tokenizer(
            self.dataframe.iloc[idx]['input'],
            max_length=TOKEN,
            truncation=True,
            padding='max_length'
        )
        label = self.dataframe.iloc[idx]['output']
        encode['labels'] = torch.tensor(label)
        return encode

In [None]:
train

In [48]:
# train = train.drop_duplicates()
# all_train = all_train.drop_duplicates()
train_dataset = CustomImageDataset(train, tokenizer)

In [50]:
# Set Batch Size
batch_size = 4
gradient_accumulation_steps=12
logging_steps = 50 #len(df) // batch_size
num_train_epochs = 6
lr_initial = 3e-5
weight_decay = 1e-3
output_dir = "modelv3"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    learning_rate=lr_initial,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # per_device_eval_batch_size=batch_size,
    weight_decay=weight_decay,
    # evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error"
)

In [51]:
args = TrainingArguments(
   output_dir=output_dir,
   save_total_limit = 5, # Only last 5 models are saved. Older ones are deleted.
)

In [52]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [53]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

In [54]:
trainer.train()



Step,Training Loss
50,0.5403
100,0.4437
150,0.3808


TrainOutput(global_step=180, training_loss=0.4312562624613444, metrics={'train_runtime': 2349.3484, 'train_samples_per_second': 7.465, 'train_steps_per_second': 0.077, 'total_flos': 8479938343829400.0, 'train_loss': 0.4312562624613444, 'epoch': 5.901639344262295})

In [55]:
model.save_pretrained("model")

## Vis

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.manifold import TSNE

dim_reducer = TSNE(n_components=2)

def visualize_layerwise_embeddings(hidden_states,masks,labels,epoch,title,layers_to_visualize):

    !mkdir -p /tmp/plots/{title}
    num_layers = len(layers_to_visualize)
    
    fig = plt.figure(figsize=(24,(num_layers/4)*6)) #each subplot of size 6x6, each row will hold 4 plots
    print(f'Creating {num_layers} subplots')
    ax = [fig.add_subplot(num_layers//4,4,i+1) for i in range(num_layers)]
    
    labels = labels.detach().numpy().reshape(-1)
    for i,layer_i in enumerate(layers_to_visualize):
        layer_embeds = hidden_states[layer_i]
        
        layer_averaged_hidden_states = torch.div(layer_embeds.sum(dim=1),masks.sum(dim=1,keepdim=True))
        layer_dim_reduced_embeds = dim_reducer.fit_transform(layer_averaged_hidden_states.detach().numpy())
        
        df = pd.DataFrame.from_dict({'x':layer_dim_reduced_embeds[:,0],'y':layer_dim_reduced_embeds[:,1],'label':labels})
        
        sns.scatterplot(data=df,x='x',y='y',hue='label',ax=ax[i])

    
    plt.savefig(f'{title}',format='png',pad_inches=0)

In [None]:
model.eval()
model.to("cpu")

In [None]:
random_df = train.sample(frac=0.1)
random_df

In [None]:
# random_df.output.hist()

In [None]:
tokens = tokenizer(
    random_df['input'].tolist(),
    return_tensors="pt",
    max_length=TOKEN,
    truncation=True,
    padding='max_length'
)#.to("cuda")

# tokens

In [None]:
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

model_out = model(input_ids,attention_mask,output_hidden_states=True,return_dict=True)
hidden_states = model_out.hidden_states[1:]

In [None]:
visualize_layerwise_embeddings(
      hidden_states=hidden_states,
      masks=attention_mask,
      labels=torch.tensor(random_df['output'].tolist()),
      epoch=10,
      title='train_data',
      layers_to_visualize=[0, 1, 2, 3, 8, 9, 10, 11]
)

## Test

In [56]:
# model = CamembertForSequenceClassification.from_pretrained(
#     "airesearch/wangchanberta-base-att-spm-uncased")
model.eval()

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [57]:
df = pd.read_csv("/kaggle/input/legal-act-classification/test.csv")
commitee = pd.read_csv("/kaggle/input/legal-act-classification/committee.csv", index_col=0)

In [None]:
df

In [58]:
"""

    Preprocess Training data

"""

test_data_list = []

for idx in tqdm(range(len(df))):
    try: 
        context = df.iloc[idx].context
        rgno = df.iloc[idx].rgno
#         context = preprocess_context(context, rgno, commitee)
        ner_tag = get_ner_tag(context)
        new_ner_tag = clean_ner_tag(ner_tag)
        
    except RuntimeError as E:
        inputs = df.iloc[idx].question
        inputs = inputs.replace("[", "").replace("]", "").replace("\'", "")
        inputs = inputs.split(", ")
        test_data_list.append(f"ลงนาม<{len(inputs)}คน>#" + str(df.iloc[idx].legal_act))
        continue
    
    last_context = get_symbolic_context(new_ner_tag)
    last_context, people_list = get_clean_context_and_people_list(last_context)
    
    num_p_tag = last_context.count("<p>")
    inputs = df.iloc[idx].question
    inputs = inputs.replace("[", "").replace("]", "").replace("\'", "")
    inputs = inputs.split(", ")
    
    people_joined = []
    blacklist = []
    for i in range(num_p_tag):
        count = 0
        for name in inputs:
            name = name.replace(" ", "")
            if name in people_list[i].replace(" ", "") and name not in blacklist:
                count += 1
                people_joined.append(name)
                blacklist.append(name)

        _idx = last_context.find("<p>")
        if count == 0:
            last_context = last_context[:_idx] + f" <ไม่มีผู้ใดลงนามได้> " + last_context[_idx+len("<p>"):]
        else:
            last_context = last_context[:_idx] + f" <ลงนาม{count}คน> " + last_context[_idx+len("<p>"):]
    
    non_people = len([name for name in inputs if name.replace(" ", "") not in blacklist])
    
    if non_people != 0:
        last_context += f" <ไม่ใช่กรรมการ {non_people} คน>"
    
    #if not len(people_joined):
    last_context += f" <{len(inputs)}คน>#" + str(df.iloc[idx].legal_act)

    last_context = re.sub(" +", " ", last_context.strip())
    test_data_list.append(last_context)


100%|██████████| 5835/5835 [02:28<00:00, 39.40it/s]


In [59]:
clean_test_data_list = [text for text in test_data_list]

In [60]:
pd.DataFrame({"input": clean_test_data_list}).to_csv("test-pattern5.csv")

In [61]:
model.eval()
model.to("cuda")

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [62]:
prediction = []
batch = 2
for i in tqdm(range(0, len(clean_test_data_list), batch)):
    tokens = tokenizer(
        clean_test_data_list[i : i + batch],
        return_tensors="pt",
        max_length=TOKEN,
        truncation=True,
        padding='max_length'
    ).to("cuda")

    outputs = model(**tokens)
    preds = torch.argmax(outputs.logits, dim=1)
    prediction.extend(preds.detach().cpu().numpy().tolist())

100%|██████████| 2918/2918 [05:13<00:00,  9.32it/s]


In [64]:
submission = pd.read_csv("/kaggle/input/legal-act-classification/sample_submission.csv")

In [65]:
submission['answer'].iloc[3:] = prediction[3:]

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  submission['answer'].iloc[3:] = prediction[3:]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['answ

In [66]:
submission['answer'].value_counts()

answer
0.0    3414
1.0    2421
Name: count, dtype: int64

In [67]:
submission.to_csv("submission.csv", index=False)