# Baseline

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_addons as tfa
import tensorflow_hub as hub

import transformers
from transformers import AutoTokenizer
transformers.logging.set_verbosity_error()

from glob import glob
from tqdm import tqdm

import argparse
# import wandb
# from wandb.keras import WandbCallback
# wandb.init(project="DACON_235900", name="Baseline")

# Hyper-parameters are declared through argparse; an empty argument list is
# parsed here, so inside the notebook every option takes its default value.
parser = argparse.ArgumentParser(description='Baseline')
for _flag, _default, _type in (
    ('--max_length', 128, int),
    ('--optimizer', "sgd", str),  # sgd or adam
    ('--learning_rate', 0.01, float),
    ('--batch_size', 32, int),
    ('--epochs', 100, int),
    ('--validation_split', 0.2, float),
    ('--seed', 1011, int),
):
    parser.add_argument(_flag, default=_default, type=_type)
args = parser.parse_args([])

# wandb.config.update(args)

# Notebook-wide constants derived from the parsed options.
MAX_LENGTH, BATCH_SIZE, EPOCHS = args.max_length, args.batch_size, args.epochs
VALIDATION_SPLIT, SEED = args.validation_split, args.seed

# lr = tf.keras.optimizers.schedules.CosineDecay(args.learning_rate, decay_steps=1000)
# if args.optimizer == "sgd":
#     optim = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.95)
# elif args.optimizer == "adam":
#     optim = tf.keras.optimizers.Adam(learning_rate=lr)

def set_seeds(seed=SEED):
    """Seed the random number generators used in this notebook.

    Args:
        seed: Integer seed; defaults to the notebook-wide ``SEED``.
    """
    # Python's `random`, NumPy's global RNG and TensorFlow's global seed.
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    # Also record the seed in the PYTHONHASHSEED environment variable.
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seeds()

## Preprocessing

In [2]:
# Load the labelled training pairs and preview the first rows.
train = pd.read_csv("data/sample_train.csv")
train.head()

Unnamed: 0,code1,code2,similar
0,"flag = ""go""\ncnt = 0\nwhile flag == ""go"":\n ...",# Python 3+\n#--------------------------------...,1
1,"b, c = map(int, input().split())\n\nprint(b * c)",import numpy as np\n\nn = int(input())\na = np...,0
2,import numpy as np\nimport sys\nread = sys.std...,"N, M = map(int, input().split())\nif M%2 != 0:...",0
3,"b, c = map(int, input().split())\n\nprint(b * c)","n,m=map(int,input().split())\nh=list(map(int,i...",0
4,s=input()\nt=input()\nans=0\nfor i in range(le...,"import math\na,b,h,m=map(int,input().split())\...",0


In [3]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17970 entries, 0 to 17969
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   code1    17970 non-null  object
 1   code2    17970 non-null  object
 2   similar  17970 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 421.3+ KB


In [4]:
# Class balance of the binary `similar` label.
train["similar"].value_counts()

1    9005
0    8965
Name: similar, dtype: int64

In [5]:
# Load the unlabelled test pairs and preview the first rows.
test = pd.read_csv("data/test.csv")
test.head()

Unnamed: 0,pair_id,code1,code2
0,1,def main():\n s = input()\n if s.count('a') ...,"N,K = map(int,input().split())\nA = list(map(i..."
1,2,"N,K,Q = map(int,input().split())\npoints = [0]...","N, K, Q = map(int,input().split())\n\nif K > Q..."
2,3,from itertools import combinations\nn = int(in...,s = input()\nt = input()\nlength_s = len(s)\nl...
3,4,"a,b=map(int,input().split())\n\nans1=a+b\nans2...","a, b, c, d = map(int,input().split())\n\nif a ..."
4,5,S = input()\nK = int(input())\n\nind = -1\nfor...,"H, W = map(int, input().split())\ngrid = []\nf..."


In [6]:
# Column dtypes and non-null counts of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179700 entries, 0 to 179699
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   pair_id  179700 non-null  int64 
 1   code1    179700 non-null  object
 2   code2    179700 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.1+ MB


In [7]:
# Number of distinct pair ids; compare with the row count to confirm they are unique.
test["pair_id"].nunique()

179700

## EDA

In [8]:
# code_folder = 'data/code'
# problem_folders = os.listdir(code_folder)
# len(problem_folders)

In [9]:
# def preprocess_script(script):
#     '''
#     Simple preprocessing function
#     comments -> removed
#     '    ' -> converted to a tab
#     multiple newlines -> collapsed into one
#     '''
#     with open(script,'r',encoding='utf-8') as file:
#         lines = file.readlines()
#         preproc_lines = []
#         for line in lines:
#             if line.lstrip().startswith('#'):
#                 continue
#             line = line.rstrip()
#             if '#' in line:
#                 line = line[:line.index('#')]
#             line = line.replace('\n','')
#             line = line.replace('    ','\t')
#             if line == '':
#                 continue
#             preproc_lines.append(line)
#         preprocessed_script = '\n'.join(preproc_lines)
#     return preprocessed_script

# preproc_scripts = []
# problem_nums = []

# for problem_folder in tqdm(problem_folders):
#     scripts = os.listdir(os.path.join(code_folder,problem_folder))
#     problem_num = scripts[0].split('_')[0]
#     for script in scripts:
#         script_file = os.path.join(code_folder,problem_folder,script)
#         preprocessed_script = preprocess_script(script_file)

#         preproc_scripts.append(preprocessed_script)
#     problem_nums.extend([problem_num]*len(scripts))

In [10]:
# len(problem_nums)

In [11]:
# df = pd.DataFrame({'code': preproc_scripts,
#                    'problem_num': problem_nums})
# df.head()

In [12]:
# from transformers import AutoTokenizer

# # tokenizer = AutoTokenizer.from_pretrained("microsoft/graphcodebert-base")
# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# df['tokens'] = df['code'].apply(tokenizer.tokenize)
# df['len'] = df['tokens'].apply(len)
# df.describe()

In [13]:
# ndf = df[df['len'] <= 256].reset_index(drop=True)
# ndf.describe()

In [14]:
# ndf['problem_num'].value_counts()

In [15]:
# train_df, valid_df, train_label, valid_label = train_test_split(
#     ndf,
#     ndf['problem_num'],
#     test_size=VALIDATION_SPLIT,
#     random_state=SEED,
#     stratify=ndf['problem_num'],
# )

# train_df = train_df.reset_index(drop=True)
# valid_df = valid_df.reset_index(drop=True)

In [16]:
# from rank_bm25 import BM25Okapi
# from itertools import combinations

# codes = train_df['code'].to_list()
# problems = train_df['problem_num'].unique().tolist()
# problems.sort()

# tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
# bm25 = BM25Okapi(tokenized_corpus)

# total_positive_pairs = []
# total_negative_pairs = []

# for problem in tqdm(problems):
#     solution_codes = train_df[train_df['problem_num'] == problem]['code']
#     positive_pairs = list(combinations(solution_codes.to_list(),2))

#     solution_codes_indices = solution_codes.index.to_list()
#     negative_pairs = []

#     first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
#     negative_code_scores = bm25.get_scores(first_tokenized_code)
#     negative_code_ranking = negative_code_scores.argsort()[::-1] # descending order
#     ranking_idx = 0
    
#     for solution_code in solution_codes:
#         negative_solutions = []
#         while len(negative_solutions) < len(positive_pairs) // len(solution_codes):
#             high_score_idx = negative_code_ranking[ranking_idx]
            
#             if high_score_idx not in solution_codes_indices:
#                 negative_solutions.append(train_df['code'].iloc[high_score_idx])
#             ranking_idx += 1

#         for negative_solution in negative_solutions:
#             negative_pairs.append((solution_code, negative_solution))
    
#     total_positive_pairs.extend(positive_pairs)
#     total_negative_pairs.extend(negative_pairs)

# pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
# pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

# neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
# neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

# pos_label = [1]*len(pos_code1)
# neg_label = [0]*len(neg_code1)

# pos_code1.extend(neg_code1)
# total_code1 = pos_code1
# pos_code2.extend(neg_code2)
# total_code2 = pos_code2
# pos_label.extend(neg_label)
# total_label = pos_label
# pair_data = pd.DataFrame(data={
#     'code1':total_code1,
#     'code2':total_code2,
#     'similar':total_label
# })
# pair_data = pair_data.sample(frac=1).reset_index(drop=True)

# pair_data.to_csv('train_data.csv', index=False)

# pair_data.head()

### Load Data

In [17]:
# pair_data = pd.read_csv('train_data.csv')
# pair_data.info()

## Modelling

In [18]:
# class BaselineModel():
#     def __init__(self, threshold=0.5):
#         super(BaselineModel, self).__init__()
#         self.threshold = threshold 
#         self.vectorizer = CountVectorizer()
#     def fit(self, code1, code2):
#         self.vectorizer.fit(code1)
#         self.vectorizer.fit(code2)
#         print('Done.')
#     def predict_proba(self, code1, code2):
#         code1_vecs = self.vectorizer.transform(code1)
#         code2_vecs = self.vectorizer.transform(code2)
#         preds = []
#         for code1_vec, code2_vec in zip(code1_vecs, code2_vecs):
#             preds.append(cosine_similarity(code1_vec, code2_vec))
#         preds = np.reshape(preds, len(preds))
#         print('Done.')
#         return preds
#     def predict(self, code1, code2):
#         preds = self.predict_proba(code1, code2)
#         preds = np.where(preds>self.threshold, 1, 0)
#         return preds

In [19]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that tokenizes text pairs into BERT inputs, one batch at a time.

    Args:
        sentence_pairs: NumPy array of text pairs; it is indexed with an integer
            index array and each selected row is passed to the tokenizer as a pair.
        labels: Targets aligned with ``sentence_pairs``. May be ``None`` only when
            ``include_targets`` is False. The ``(0, 1)`` default is kept for
            backward compatibility; pass real labels for training/validation.
        batch_size: Number of pairs per batch.
        shuffle: Whether to reshuffle the sample order after every epoch.
        include_targets: If True each batch is ``(inputs, labels)``; otherwise
            only ``inputs`` is returned (inference).

    Raises:
        ValueError: If ``include_targets`` is True but ``labels`` is None.
    """

    def __init__(
        self,
        sentence_pairs,
        labels=(0, 1),
        batch_size=BATCH_SIZE,
        shuffle=True,
        include_targets=True,
    ):
        if include_targets and labels is None:
            raise ValueError("labels must be provided when include_targets=True")
        self.sentence_pairs = sentence_pairs
        # Stored as an array so that a plain list of labels can also be
        # selected with the per-batch index array in __getitem__.
        self.labels = None if labels is None else np.asarray(labels)
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load our BERT Tokenizer to encode the text.
        # We will use bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        # One seeded RNG per generator: the run stays reproducible while each
        # epoch draws a new permutation (re-creating RandomState(SEED) every
        # epoch would re-apply the very same permutation each time).
        self._rng = np.random.RandomState(SEED)
        self.on_epoch_end()

    def __len__(self):
        # Number of full batches per epoch; a trailing partial batch is dropped.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Sample positions belonging to batch `idx`.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        # Every sequence is padded AND truncated to exactly MAX_LENGTH so the
        # batch matches the model's fixed (MAX_LENGTH,) input shape; padding=True
        # would only pad to the longest pair in the batch and never truncate.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=MAX_LENGTH,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        # Convert batch of encoded features to numpy array.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Set to true if data generator is used for training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle indexes after each epoch if shuffle is set to True.
        if self.shuffle:
            self._rng.shuffle(self.indexes)

In [20]:
# --- Inputs: three int32 tensors of length MAX_LENGTH -------------------------
input_ids = layers.Input(name="input_ids", shape=(MAX_LENGTH,), dtype=tf.int32)
# The attention mask tells the model which tokens should be attended to.
attention_masks = layers.Input(
    name="attention_masks", shape=(MAX_LENGTH,), dtype=tf.int32
)
# Token type ids are binary masks identifying the different sequences.
token_type_ids = layers.Input(
    name="token_type_ids", shape=(MAX_LENGTH,), dtype=tf.int32
)

# --- Encoder: pretrained BERT, frozen so its features are reused unchanged ----
bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased")
bert_model.trainable = False
bert_output = bert_model(
    input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids
)
sequence_output = bert_output.last_hidden_state
pooled_output = bert_output.pooler_output

# --- Head: trainable layers that adapt the pretrained features to this data ---
bi_lstm = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(
    sequence_output
)
# Hybrid pooling: average- and max-pool the BiLSTM sequence, then concatenate.
avg_pool = layers.GlobalAveragePooling1D()(bi_lstm)
max_pool = layers.GlobalMaxPooling1D()(bi_lstm)
concat = layers.concatenate([avg_pool, max_pool])
dropout = layers.Dropout(0.3)(concat)
output = layers.Dense(1, activation="sigmoid")(dropout)

model = tf.keras.Model(
    inputs=[input_ids, attention_masks, token_type_ids],
    outputs=output,
)
model.compile(
    loss="binary_crossentropy",
    metrics=["acc"],
    optimizer=tf.keras.optimizers.Adam(),
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_masks (InputLayer)   [(None, 128)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_masks[0][0]',    

In [21]:
# Tokenize both snippets of every pair and record their token counts in the
# `tokens1`/`len1` and `tokens2`/`len2` columns of `train`.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
for _suffix in ("1", "2"):
    train[f"tokens{_suffix}"] = train[f"code{_suffix}"].apply(tokenizer.tokenize)
    train[f"len{_suffix}"] = train[f"tokens{_suffix}"].apply(len)
train.describe()

Unnamed: 0,similar,len1,len2
count,17970.0,17970.0,17970.0
mean,0.501113,159.671508,164.177073
std,0.500013,178.446994,232.938394
min,0.0,10.0,10.0
25%,0.0,64.0,64.0
50%,1.0,109.0,110.0
75%,1.0,202.0,203.0
max,1.0,7483.0,14271.0


In [22]:
# Keep only the pairs in which each snippet is at most 512 tokens long.
_within_limit = (train['len1'] <= 512) & (train['len2'] <= 512)
ndf = train.loc[_within_limit].reset_index(drop=True)
ndf.describe()

Unnamed: 0,similar,len1,len2
count,16916.0,16916.0,16916.0
mean,0.499231,138.070939,138.490305
std,0.500014,103.186325,103.756053
min,0.0,10.0,10.0
25%,0.0,63.0,62.0
50%,0.0,103.0,104.0
75%,1.0,187.0,186.0
max,1.0,512.0,512.0


In [23]:
# Features are the two code columns, the target is the binary `similar` label.
X, y = ndf[["code1", "code2"]], ndf["similar"]

# Stratified hold-out split, reproducible through SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VALIDATION_SPLIT,
    random_state=SEED,
    stratify=y,
)

tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((13532, 2), (3384, 2), (13532,), (3384,))

In [24]:
# X = pair_data[["code1", "code2"]]
# y = pair_data["similar"]

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=VALIDATION_SPLIT, random_state=SEED, stratify=y)

# X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [25]:
def _make_generator(features, targets, shuffle):
    """Wrap a (code1, code2) frame and its labels in a BertSemanticDataGenerator."""
    pairs = features[["code1", "code2"]].values.astype("str")
    return BertSemanticDataGenerator(
        pairs, targets.values, batch_size=BATCH_SIZE, shuffle=shuffle
    )


# Training batches are shuffled; the validation order stays fixed.
train_data = _make_generator(X_train, y_train, shuffle=True)
valid_data = _make_generator(X_val, y_val, shuffle=False)

In [None]:
# Stage 1: train only the head on top of the frozen BERT encoder.

# Destination of the best (lowest val_loss) weights. `checkpoint_path` was
# previously never defined, so this cell raised a NameError. The "_bert" suffix
# keeps it apart from the `load_model/{parser.description}` checkpoint that is
# written further down in the notebook.
os.makedirs("load_model", exist_ok=True)
checkpoint_path = os.path.join("load_model", f"{parser.description}_bert")

callback = [
    # Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
    ),
    tf.keras.callbacks.ModelCheckpoint(
        checkpoint_path,
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True,
    )
]

# The callbacks must be passed explicitly; without `callbacks=` they are
# never invoked and training always runs for the full EPOCHS.
history = model.fit(
    train_data,
    validation_data=valid_data,
    epochs=EPOCHS,
    callbacks=callback,
)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

In [None]:
# Unfreeze the bert_model.
bert_model.trainable = True
# Recompile the model to make the change effective.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-5),
    loss="binary_crossentropy",
    metrics=["acc"],
)
model.summary()

In [None]:
# Continue training with the same generators; `history` is overwritten by
# the history of this run.
history = model.fit(x=train_data, validation_data=valid_data, epochs=EPOCHS)

In [None]:
# def check_similarity(sentence1, sentence2):
#     sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
#     test_data = BertSemanticDataGenerator(
#         sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
#     )

#     proba = model.predict(test_data[0])[0]
#     idx = np.argmax(proba)
#     proba = f"{proba[idx]: .2f}%"
#     pred = labels[idx]
#     return pred, proba

## Training

In [None]:
%%time

from sklearn.feature_extraction.text import HashingVectorizer

class BaselineModel():
    """Cosine-similarity baseline for code-pair similarity.

    Both snippets of a pair are vectorized with the same ``HashingVectorizer``;
    the cosine similarity of the two vectors is the score, and scores above
    ``threshold`` are predicted as similar (1), everything else as 0.
    """

    def __init__(self, threshold=0.5):
        super().__init__()
        self.threshold = threshold
        self.vectorizer = HashingVectorizer()

    def fit(self, code1, code2):
        """Fit the vectorizer on both code columns; prints 'Done.' and returns None."""
        # A single fit over both columns: two consecutive fit() calls would, for a
        # vectorizer that learns a vocabulary, keep only what the second call saw.
        self.vectorizer.fit(list(code1) + list(code2))
        print('Done.')

    @staticmethod
    def _row_sums(matrix):
        """Return the per-row sums of a sparse matrix as a flat float array."""
        return np.asarray(matrix.sum(axis=1), dtype=float).ravel()

    def predict_proba(self, code1, code2):
        """Return the cosine similarity of each (code1[i], code2[i]) pair.

        Args:
            code1, code2: Equally long iterables of source-code strings.

        Returns:
            1-D float array with one score per pair; a pair in which either
            vector is all zeros scores 0.
        """
        code1_vecs = self.vectorizer.transform(code1)
        code2_vecs = self.vectorizer.transform(code2)
        # Row-wise cosine similarity computed on the whole sparse matrices at
        # once instead of calling cosine_similarity() once per pair.
        dots = self._row_sums(code1_vecs.multiply(code2_vecs))
        norms = np.sqrt(self._row_sums(code1_vecs.multiply(code1_vecs))) * np.sqrt(
            self._row_sums(code2_vecs.multiply(code2_vecs))
        )
        preds = np.zeros_like(dots)
        nonzero = norms > 0
        preds[nonzero] = dots[nonzero] / norms[nonzero]
        print('Done.')
        return preds

    def predict(self, code1, code2):
        """Return 1 where the similarity score exceeds ``threshold``, else 0."""
        preds = self.predict_proba(code1, code2)
        preds = np.where(preds>self.threshold, 1, 0)
        return preds
    
# `pair_data` is only produced by the pair-mining cells above, which are
# commented out, so this cell used to fail with a NameError on a fresh kernel.
# Reload the CSV those cells write when it exists; otherwise skip the check.
pair_data_path = 'train_data.csv'
pair_accuracy = None
if os.path.exists(pair_data_path):
    pair_data = pd.read_csv(pair_data_path)
    model = BaselineModel(threshold=0.5)
    model.fit(pair_data['code1'], pair_data['code2'])
    preds = model.predict(pair_data['code1'], pair_data['code2'])
    pair_accuracy = accuracy_score(pair_data["similar"], preds)
else:
    print(f"{pair_data_path} not found; skipping the evaluation on mined pairs.")
pair_accuracy

In [None]:
# Accuracy of the baseline on the full labelled training frame.
model = BaselineModel(threshold=0.5)
model.fit(code1=train['code1'], code2=train['code2'])
preds = model.predict(code1=train['code1'], code2=train['code2'])
accuracy_score(y_true=train["similar"], y_pred=preds)

In [None]:
# Features/target for the baseline, taken from the full (unfiltered) train frame.
# The stratified train/validation split is performed in the next cell; the copy
# that used to live here was a verbatim duplicate, and its shapes tuple was
# never displayed because it was not the last expression of the cell.
X = train[["code1", "code2"]]
y = train["similar"]

X.shape, y.shape

In [None]:
# Stratified hold-out split of the current X / y, reproducible through SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=VALIDATION_SPLIT,
    random_state=SEED,
    stratify=y,
)

tuple(part.shape for part in (X_train, X_val, y_train, y_val))

In [None]:
# Fit on the training split, score on the held-out validation split.
model = BaselineModel(threshold=0.5)
model.fit(code1=X_train['code1'], code2=X_train['code2'])
preds = model.predict(code1=X_val['code1'], code2=X_val['code2'])
accuracy_score(y_true=y_val, y_pred=preds)

In [None]:
# Fit on all labelled pairs, then predict the test pairs.
model = BaselineModel(threshold=0.5)
model.fit(code1=train['code1'], code2=train['code2'])
preds = model.predict(code1=test['code1'], code2=test['code2'])
# Number of test pairs predicted as similar.
preds.sum()

## Inference

In [None]:
# Fill the sample submission's `similar` column with the predictions and write
# it to "<parser.description>.csv".
submission = pd.read_csv("data/sample_submission.csv").assign(similar=preds)
submission.to_csv(f"{parser.description}.csv", index=False)

## etc.

In [None]:
# NOTE(review): `input_shape` is not defined anywhere in the visible notebook;
# it must be set before this cell can run - TODO confirm the intended value.

# One softmax unit per distinct value in `y`.
num_classes = len(np.unique(y))

# Xception backbone without its classification top and without pretrained weights.
base_model = tf.keras.applications.Xception(
    input_shape=input_shape,
    include_top=False,
    weights=None,
)

inp = tf.keras.Input(shape=input_shape)
x = base_model(inp)
# x = se_block(x)
# x = cbam_block(x)
# x = layers.Dropout(0.2)(x)
x = layers.GlobalAveragePooling2D()(x)
oup = layers.Dense(num_classes, activation="softmax")(x)
model = tf.keras.Model(inputs=inp, outputs=oup)

model.summary()

## Training

In [None]:
_AUTOTUNE = tf.data.experimental.AUTOTUNE


def _augment_batch(x, y):
    """Run `augment` on a batch of inputs via tf.py_function; labels pass through."""
    # NOTE(review): `augment` is not defined in the visible notebook - TODO confirm.
    return tf.py_function(augment, [x], [tf.float32])[0], y


# Training pipeline: shuffle over the whole set, batch, augment, prefetch.
train_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))
train_ds = train_ds.shuffle(len(X_train)).batch(BATCH_SIZE)
train_ds = train_ds.map(_augment_batch, num_parallel_calls=_AUTOTUNE)
train_ds = train_ds.prefetch(_AUTOTUNE)

# Validation pipeline: batch and prefetch only.
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val))
val_ds = val_ds.batch(BATCH_SIZE).prefetch(_AUTOTUNE)

In [None]:
# Build the optimizer from the parsed arguments. `optim` was previously
# undefined because its definition near the top of the notebook is commented out.
lr = tf.keras.optimizers.schedules.CosineDecay(args.learning_rate, decay_steps=1000)
if args.optimizer == "sgd":
    optim = tf.keras.optimizers.SGD(learning_rate=lr, momentum=0.95)
elif args.optimizer == "adam":
    optim = tf.keras.optimizers.Adam(learning_rate=lr)
else:
    raise ValueError(f"Unsupported optimizer: {args.optimizer!r} (expected 'sgd' or 'adam')")

model.compile(
    optimizer=optim,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Weights with the lowest val_loss are written to this path.
checkpoint_filepath=f"load_model/{parser.description}"

checkpoint_callback = [
    tf.keras.callbacks.ModelCheckpoint(
        checkpoint_filepath,
        monitor="val_loss",
        save_best_only=True,
        save_weights_only=True
    )
]

# `checkpoint_callback` is already a list, so it is passed as-is; wrapping it
# in another list handed Keras a nested list instead of callback objects.
# WandbCallback is left out because the wandb import at the top of the notebook
# is commented out; append WandbCallback() here when wandb is re-enabled.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=checkpoint_callback,
)

In [None]:
# Per-epoch curves recorded by the last `model.fit` call.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

# One figure per metric: training curve vs. validation curve.
for _train_curve, _val_curve, _train_label, _val_label, _legend_loc, _title in (
    (acc, val_acc, 'Training Accuracy', 'Validation Accuracy',
     'lower right', 'Training and Validation Accuracy'),
    (loss, val_loss, 'Training Loss', 'Validation Loss',
     'upper right', 'Training and Validation Loss'),
):
    plt.plot(_train_curve, label=_train_label)
    plt.plot(_val_curve, label=_val_label)
    plt.legend(loc=_legend_loc)
    plt.title(_title)
    plt.show()

In [None]:
# Load the weights stored at `checkpoint_filepath` into `model`.
model.load_weights(checkpoint_filepath)  