In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import string

In [None]:
root_path = "/home/stefan/ioai-prep/kits/essay-gap"

# Dataset

In [3]:
stop_words = set(stopwords.words("english"))
to_remove = stop_words | set(string.punctuation)

lemmatizer = WordNetLemmatizer()

def clean_text(text: str):
    text = text.lower()
    tok = nltk.word_tokenize(text)
    tok = [lemmatizer.lemmatize(t) for t in tok if t not in to_remove]
    return ' '.join(tok)

clean_text("Ana has apples.")

'ana apple'

In [4]:
train_df = pd.read_csv(f"{root_path}/train.csv")

for col in train_df.select_dtypes(include='object'):
    train_df[col] = train_df[col].apply(clean_text)

train_df.head()

Unnamed: 0,sampleID,before,after,opt_0,opt_1,opt_2,opt_3,label
0,0,life cycle christmas tree seed 2-metre 7 ft tr...,one issue farmer face destruction pine tree pe...,remaining development tree greatly depends cli...,belief divinity jesus lead question `` jesus m...,essentially recipe brings together traditional...,matter debate first christmas stamp,0
1,1,slope flatter 25 degree steeper 60 degree typi...,rule thumb slope flat enough hold snow steep e...,1850 book first christmas new england harriet ...,latin america iberian peninsula midnight mass ...,steeper slope occur little 15 49 ft snow-ice,incidence human triggered avalanche normalized...,3
2,2,workplace conduct `` christmas party '' someti...,likewise school tafe vocational training unive...,many people take holiday christmas new year 's...,frequency avalanche form given area known retu...,employ on-the-ground physical measurement tech...,area around basilica begin crowd light firewor...,0
3,3,chronography 354 illuminated manuscript compil...,around 385 feast birth jesus distinct baptism ...,eastern inland region country flat well suited...,sermon delivered antioch december 25 c. 386 jo...,remains one extensive market area united kingd...,cold front leading edge cooler mass air produc...,1
4,4,english personification christmas first record...,character maintained late 18th 19th century ch...,sermon 386 gregory nyssa specifically related ...,first evidence decorated tree associated chris...,following restoration 1660 father christmas 's...,614 persian sassanid empire supported jewish r...,2


In [5]:
def common_words_cnt(text1: str, text2: str):
    words1 = set(text1.split())
    words2 = set(text2.split())
    return len(words1 & words2)

common_words_cnt("Ana has apples.", "Maria wants apples.")

1

# Submission

In [6]:
test_df = pd.read_csv(f"{root_path}/test.csv")

for col in train_df.select_dtypes(include="object"):
    test_df[col] = test_df[col].apply(clean_text)

In [7]:
answers = []

for idx in range(len(test_df)):
    row = test_df.iloc[idx]

    scores = [
        {
            "opt": i,
            "score": common_words_cnt(row["before"], row[f"opt_{i}"])
        } for i in range(4)
    ]

    ans = max(scores, key=lambda x: x["score"])["opt"]
    answers.append(ans)

In [8]:
submission = pd.DataFrame({"sampleID": test_df["sampleID"], "answer": answers})
submission.head()

Unnamed: 0,sampleID,answer
0,100,0
1,101,1
2,102,2
3,103,0
4,104,3


In [9]:
submission.to_csv("submission.csv", index=False)