# Tutorial

In [1]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score

import re
from tqdm import tqdm

import argparse

parser = argparse.ArgumentParser(description='Tutorial')
parser.add_argument('--validation_split', default=0.2, type=float)
parser.add_argument('--seed', default=1011, type=int)
args = parser.parse_args('')

VALIDATION_SPLIT=args.validation_split
SEED=args.seed

def set_seeds(seed=SEED):
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

set_seeds()

## Preprocessing

In [2]:
train = pd.read_csv("data/sample_train.csv")
train.head()

Unnamed: 0,code1,code2,similar
0,"flag = ""go""\ncnt = 0\nwhile flag == ""go"":\n ...",# Python 3+\n#--------------------------------...,1
1,"b, c = map(int, input().split())\n\nprint(b * c)",import numpy as np\n\nn = int(input())\na = np...,0
2,import numpy as np\nimport sys\nread = sys.std...,"N, M = map(int, input().split())\nif M%2 != 0:...",0
3,"b, c = map(int, input().split())\n\nprint(b * c)","n,m=map(int,input().split())\nh=list(map(int,i...",0
4,s=input()\nt=input()\nans=0\nfor i in range(le...,"import math\na,b,h,m=map(int,input().split())\...",0


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17970 entries, 0 to 17969
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   code1    17970 non-null  object
 1   code2    17970 non-null  object
 2   similar  17970 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 421.3+ KB


In [4]:
train["similar"].value_counts()

1    9005
0    8965
Name: similar, dtype: int64

In [5]:
test = pd.read_csv("data/test.csv")
test.head()

Unnamed: 0,pair_id,code1,code2
0,1,def main():\n s = input()\n if s.count('a') ...,"N,K = map(int,input().split())\nA = list(map(i..."
1,2,"N,K,Q = map(int,input().split())\npoints = [0]...","N, K, Q = map(int,input().split())\n\nif K > Q..."
2,3,from itertools import combinations\nn = int(in...,s = input()\nt = input()\nlength_s = len(s)\nl...
3,4,"a,b=map(int,input().split())\n\nans1=a+b\nans2...","a, b, c, d = map(int,input().split())\n\nif a ..."
4,5,S = input()\nK = int(input())\n\nind = -1\nfor...,"H, W = map(int, input().split())\ngrid = []\nf..."


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179700 entries, 0 to 179699
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   pair_id  179700 non-null  int64 
 1   code1    179700 non-null  object
 2   code2    179700 non-null  object
dtypes: int64(1), object(2)
memory usage: 4.1+ MB


In [7]:
test["pair_id"].nunique()

179700

## Modelling

In [8]:
class BaselineModel():
    def __init__(self, vectorizer, threshold=0.5):
        super(BaselineModel, self).__init__()
        self.threshold = threshold 
        self.vectorizer = vectorizer
    def fit(self, code1, code2):
        self.vectorizer.fit(code1)
        self.vectorizer.fit(code2)
        print('Done.')
    def predict_proba(self, code1, code2):
        code1_vecs = self.vectorizer.transform(code1)
        code2_vecs = self.vectorizer.transform(code2)
        preds = []
        for code1_vec, code2_vec in zip(code1_vecs, code2_vecs):
            preds.append(cosine_similarity(code1_vec, code2_vec))
        preds = np.reshape(preds, len(preds))
        print('Done.')
        return preds
    def predict(self, code1, code2):
        preds = self.predict_proba(code1, code2)
        preds = np.where(preds>self.threshold, 1, 0)
        return preds

## Training

In [9]:
%%time

print("CountVectorizer")
model = BaselineModel(CountVectorizer(), threshold=0.5)
model.fit(train['code1'], train['code2'])
preds = model.predict(train['code1'], train['code2'])
accuracy_score(train["similar"], preds)

CountVectorizer
Done.
Done.
CPU times: total: 11.4 s
Wall time: 11.4 s


0.6850306065664997

In [10]:
%%time

print("HashingVectorizer")
model = BaselineModel(HashingVectorizer(), threshold=0.5)
model.fit(train['code1'], train['code2'])
preds = model.predict(train['code1'], train['code2'])
accuracy_score(train["similar"], preds)

HashingVectorizer
Done.
Done.
CPU times: total: 44.5 s
Wall time: 44.6 s


0.6815804117974402

## Inference

In [11]:
%%time

model = BaselineModel(CountVectorizer(), threshold=0.5)
model.fit(test['code1'], test['code2'])
preds = model.predict(test['code1'], test['code2'])

Done.
Done.
CPU times: total: 2min 3s
Wall time: 2min 3s


In [12]:
submission = pd.read_csv("data/sample_submission.csv")
submission['similar'] = preds
submission.to_csv(f"{parser.description}.csv", index=False)