In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
import pandas as pd
import os
import numpy as np
from gensim.models import KeyedVectors

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
from string import punctuation
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline

from tqdm import tqdm
import classla


### Read data

In [2]:
# Path of the annotated comment corpus. Every column is read with pandas' "string" dtype.
input_file = "./data/corpus.csv"
# Read from the variable above instead of repeating the literal path, so the two cannot drift apart.
corpus = pd.read_csv(input_file, dtype="string")

In [3]:
# Preview the first rows of the loaded corpus.
corpus.head()

In [4]:
# Keep only the rows whose NaturalLanguageID equals the selected language code.
language = "SR"
data = corpus[corpus.NaturalLanguageID == language]

In [5]:
# List the columns available in the language-filtered frame.
data.columns

In [6]:
# data = data[["Comment", "y8", "y6", "y2"]]

Remove the rows whose `y8` label is "ide".

In [7]:
# Drop the rows labelled "ide" in y8. The explicit copy makes `data` an independent frame,
# so the token columns assigned to it in later cells are not written into a slice of
# `corpus` (which is what triggers pandas' SettingWithCopyWarning).
data = data[data.y8 != "ide"].copy()

### Embeddings

In [8]:
# Module-level counters: incremented by get_comment_vector and reset by vectorize.
total_tokens = 0  # tokens seen during the current vectorization pass
total_tokens_without_embedding = 0  # tokens whose embedding lookup failed

In [9]:
def get_comment_vector(embeddings, comment):
    """Return the mean embedding of the tokens in ``comment``.

    Tokens that cannot be looked up in ``embeddings`` are skipped. As a side
    effect the module-level counters ``total_tokens`` and
    ``total_tokens_without_embedding`` are incremented so that the caller can
    report vocabulary coverage.

    Args:
        embeddings: Object indexed by token (``embeddings[token]``) that yields
            a numeric vector.
        comment: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors that were found, as a NumPy array,
        or ``np.nan`` when no token of ``comment`` has an embedding.
    """
    global total_tokens, total_tokens_without_embedding

    vector_sum = None
    num_tokens = 0

    for token in comment:
        total_tokens += 1
        try:
            token_vector = embeddings[token]
        except (KeyError, TypeError):
            # KeyError: token is not in the vocabulary. TypeError: token is not a
            # usable key (e.g. None). Anything else is a real error and must
            # surface instead of being silently counted as "no embedding".
            total_tokens_without_embedding += 1
            continue
        # np.add allocates a new array, so the stored embedding is never mutated.
        vector_sum = token_vector if vector_sum is None else np.add(vector_sum, token_vector)
        num_tokens += 1

    if num_tokens == 0:
        # Scalar NaN marks "no embedding at all"; vectorize() filters on it.
        return np.nan
    return np.asarray(vector_sum) / num_tokens

In [10]:
def vectorize(comments, embeddings):
    """Turn a Series of token lists into a DataFrame of mean embedding vectors.

    Resets the module-level token counters, vectorizes every comment with
    ``get_comment_vector`` and prints how many comments and tokens had no
    embedding.

    Args:
        comments: pandas Series whose values are iterables of tokens.
        embeddings: Token-indexed embedding lookup passed to ``get_comment_vector``.

    Returns:
        A tuple ``(X_vectors, nan_comments_mask)``: the DataFrame with one row per
        comment and one column per vector component, and a boolean Series that is
        True for comments without any embedding (their rows are NaN).
    """
    global total_tokens, total_tokens_without_embedding

    total_tokens = 0
    total_tokens_without_embedding = 0

    X_vectors = comments.apply(lambda comment: get_comment_vector(embeddings, comment))
    # Expand each vector into its own columns; a NaN scalar becomes an all-NaN row.
    X_vectors = X_vectors.apply(pd.Series)

    # A NaN first component identifies comments for which no token had an embedding.
    nan_comments_mask = X_vectors.isnull().iloc[:, 0]
    num_nan_comments = nan_comments_mask.sum()
    print(f"{num_nan_comments} comments have no embeddings")

    # Guard the percentage against a pass in which no token was counted.
    if total_tokens:
        missing_share = total_tokens_without_embedding / total_tokens * 100
    else:
        missing_share = 0.0
    print (f"Total tokens {total_tokens}, out of which {total_tokens_without_embedding} do not have embeddings -- {missing_share:.4}%.")
    return X_vectors, nan_comments_mask


### Evaluate

In [11]:
def write_results(result_file, score_name, score_value):
    """Append one ``score_name, score_value`` row to the CSV at ``result_file``.

    The row is written without header or index and with ``,`` as the decimal
    mark. The parent directory is created when it does not exist yet, because
    opening a file in append mode does not create missing directories.

    Args:
        result_file: Path of the CSV file to append to.
        score_name: Label stored in the first column.
        score_value: Value stored in the second column.
    """
    result_dir = os.path.dirname(result_file)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    pd.DataFrame(
        {"score_name": [score_name],
        "score_value": [score_value]}
    ).to_csv(result_file, mode="a", decimal=",", header=False, index=False)

In [12]:
def make_score_name(score_name, model_name, num_classes):
    """Build the result label ``<score_name>-<model_name>-<num_classes>``."""
    label = "{}-{}-{}".format(score_name, model_name, num_classes)
    return label

In [13]:
def evaluate(data, x_column_name, y_column_name, result_file, score_name, model_name, estimator, hyper_params, embeddings):
    """Score one estimator on one token column with nested cross-validation.

    The token column is vectorized with ``vectorize``; comments without any
    embedding are dropped together with their labels. A ``GridSearchCV`` over
    ``hyper_params`` (inner stratified 10-fold) is then evaluated with
    ``cross_validate`` (outer stratified 10-fold) using macro F1, and the mean
    outer test score is appended to ``result_file``.

    Args:
        data: DataFrame holding the token column and the label column.
        x_column_name: Name of the column with token lists.
        y_column_name: Name of the label column.
        result_file: CSV file the score row is appended to.
        score_name: Prefix of the score label.
        model_name: Short model name used in the score label.
        estimator: scikit-learn estimator to tune.
        hyper_params: Parameter grid passed to ``GridSearchCV``.
        embeddings: Token-indexed embedding lookup.

    Returns:
        The mean macro-F1 over the outer folds.
    """
    print(estimator)

    X = data[x_column_name]
    y = data[y_column_name]

    # Vectorize.
    X, nan_comments_mask = vectorize(X, embeddings)
    X = X[~nan_comments_mask]
    y = y[~nan_comments_mask]

    full_score_name = make_score_name(score_name, model_name, y.nunique())
    print(f"--------Evaluating {full_score_name} --------")

    # Inner loop: hyper-parameter search.
    gs_estimator = GridSearchCV(
        estimator, hyper_params, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)

    # Outer loop: estimate the score of the tuned model.
    scores = cross_validate(
        gs_estimator, X, y, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)
    mean_score = np.mean(scores["test_score"])

    # Use the shared helper instead of repeating its DataFrame/to_csv logic here.
    write_results(result_file, full_score_name, mean_score)

    return mean_score

### Preprocessing

1. Tokenization

In [14]:
# Download the classla resources for 'sr' with type 'nonstandard'.
classla.download('sr', type='nonstandard')

In [15]:
# classla pipeline ('sr', nonstandard) that classla_tokenize calls for every comment.
sr_pipeline = classla.Pipeline("sr", type="nonstandard")

In [16]:
def classla_tokenize(comment):
    """Tokenize ``comment`` with the classla pipeline ``sr_pipeline``.

    Args:
        comment: Text to tokenize.

    Returns:
        List with the text of every word the pipeline produced. If the pipeline
        raises, the failure is printed and the comment is split on single spaces
        instead (best-effort fallback).
    """
    try:
        doc = sr_pipeline(comment)
        return [word.text for word in doc.iter_words()]
    except Exception:
        # Catch pipeline failures only; KeyboardInterrupt/SystemExit must still
        # be able to stop a long tokenization run.
        print(f"SR tokenize ERROR for comment: {comment}")
        return comment.split(" ")

In [17]:
# Register progress_apply on pandas objects, then tokenize every comment with classla.
tqdm.pandas()
data["classla_tokens"] = data["Comment"].progress_apply(classla_tokenize)

In [None]:
# Shared NLTK WhitespaceTokenizer instance used by the helper below.
whitespace_tokenizer = WhitespaceTokenizer()


def my_whitespace_tokenizer(comment):
    """Tokenize ``comment`` with the shared ``WhitespaceTokenizer`` instance."""
    tokens = whitespace_tokenizer.tokenize(comment)
    return tokens


# One token list per comment.
data["whitespace_tokens"] = data["Comment"].apply(my_whitespace_tokenizer)

In [None]:
def my_word_tokenizer(comment):
    """Return every run of two or more word characters found in ``comment``."""
    return re.findall(r"(?u)\b\w\w+\b", comment)

# Regex-based word tokens, one list per comment.
data["word_tokens"] = data["Comment"].apply(my_word_tokenizer)

### Models

In [18]:
# Models to evaluate, keyed by the short model name used in score labels.
# Each value is (estimator, hyper-parameter grid handed to GridSearchCV).
evaluation_params = {
    "svm": (LinearSVC(), {"C": [0.001, 0.01, 0.1, 1, 10]}), 
}

In [None]:
# CSV file to which evaluate() appends the scores of the preprocessing comparison.
result_file = "./results/boe_SR.csv"

### Read embeddings cc_sh_300

In [None]:
# Load the cc.sh.300 vectors with KeyedVectors.load_word2vec_format.
# vectors_name is the prefix used in the score labels written by the following loops.
embedding_file, vectors_name = ("embeddings/cc.sh.300.vec", "cc_sh_300")
print(f"Start reading {embedding_file}.")
embeddings_cc = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

In [None]:
# Compare the three tokenizers for every label granularity and report the best one.
tokenizer_columns = ["classla_tokens", "whitespace_tokens", "word_tokens"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in tokenizer_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_cc,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

2. snake_case/CamelCase/both

In [None]:
def snake_case_tokenize(tokens):
    """Split every token on underscores and drop the empty pieces."""
    return [
        piece
        for token in tokens
        for piece in token.split("_")
        if piece
    ]

In [None]:
# classla tokens additionally split on underscores.
data["snake_classla_tokens"] = data["classla_tokens"].apply(snake_case_tokenize)

In [None]:
def camel_case_tokenize(tokens):
    """Split every token at lower-case -> upper-case transitions.

    A new piece starts whenever an upper-case character directly follows a
    lower-case one (``camelCase`` -> ``camel``, ``Case``); runs of capitals are
    not split (``HTTPServer`` stays whole). Empty tokens are dropped.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of the resulting pieces. If processing fails (for example because a
        token is not a string), the failure is printed and ``tokens`` is
        returned unchanged.
    """
    try:
        output_tokens = []
        for token in tokens:
            if len(token) == 0:
                continue
            pieces = [str(token[0])]
            for c in token[1:]:
                if pieces[-1][-1].islower() and c.isupper():
                    pieces.append(str(c))
                else:
                    pieces[-1] += c

            output_tokens.extend(pieces)

        # Every piece starts with a character, so there is nothing empty to filter out.
        return output_tokens
    except Exception:
        # Best-effort fallback for malformed tokens; interrupts still propagate.
        print("-------------- CAMEL CASE ERROR ------------")
        print(tokens)
        return tokens

In [None]:
# classla tokens additionally split at camelCase boundaries.
data["camel_classla_tokens"] = data["classla_tokens"].apply(camel_case_tokenize)

In [None]:
# Underscore-split tokens that are then also split at camelCase boundaries.
data["snake_camel_classla_tokens"] = data["snake_classla_tokens"].apply(camel_case_tokenize)

In [None]:
# Compare the identifier-splitting variants for every label granularity.
case_split_columns = ["snake_classla_tokens", "camel_classla_tokens", "snake_camel_classla_tokens"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in case_split_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_cc,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

3. Stemming/Lemmatization

In [None]:
# Read the stemmed tokens from CSV; the first row is the header and all columns are read as strings.
stemmed_df = pd.read_csv("data/tokens/SR_classla_stemmed.csv", header=0, dtype="string")

In [None]:
# Give the stemmed rows the same index as `data` so the column assignment in the next cell aligns by label.
# NOTE(review): assumes the CSV holds exactly one row per row of `data`, in the same order — confirm.
stemmed_df.index = data.index

In [None]:
# Turn each space-separated stemmed string into a list of tokens.
data["stemmed_classla_tokens"] = stemmed_df["stemmed_classla_tokens"].apply(lambda comment: comment.split(" "))

In [None]:
# Second classla pipeline, created with tokenize_pretokenized=True; lemma_sr passes it
# the existing tokens joined by spaces.
sr_pretokenized_pipeline = classla.Pipeline(
    "sr", type="nonstandard", tokenize_pretokenized=True)

In [None]:
def lemma_sr(tokens):
    """Lemmatize ``tokens`` with the pipeline ``sr_pretokenized_pipeline``.

    Args:
        tokens: List of string tokens; they are joined with single spaces
            before being passed to the pipeline.

    Returns:
        List with the lemma of every word the pipeline produced. If the
        pipeline raises, the failure is printed and ``tokens`` is returned
        unchanged (best-effort fallback).
    """
    try:
        doc = sr_pretokenized_pipeline(" ".join(tokens))
        return [word.lemma for word in doc.iter_words()]
    except Exception:
        # Catch pipeline failures only; KeyboardInterrupt/SystemExit must still
        # be able to stop a long lemmatization run.
        print(f"Lema SR error for tokens: {tokens}")
        return tokens

In [None]:
# Lemmatize the classla tokens of every comment, with a progress bar.
tqdm.pandas()
data["lema_classla_tokens"] = data["classla_tokens"].progress_apply(lemma_sr)

In [None]:
# Compare stemmed against lemmatized tokens for every label granularity.
normalised_columns = ["stemmed_classla_tokens", "lema_classla_tokens"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in normalised_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_cc,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

4. Lowercase

In [None]:
# Lower-case every classla token.
data["classla_tokens_lower"] = data["classla_tokens"].apply(
    lambda token_list: [item.lower() for item in token_list]
)

In [None]:
# Score the lower-cased classla tokens for every label granularity.
lowercase_columns = ["classla_tokens_lower"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in lowercase_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_cc,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

5. Remove punctuation/numbers/both

In [None]:
tqdm.pandas()

# Patterns for what is replaced by a space before tokenization:
# anything that is neither a word character nor whitespace, and runs of digits.
punctuation_pattern = r"[^\w\s]"
number_pattern = r"[0-9]+"

data["classla_nopunctuation_tokens"] = data["Comment"].progress_apply(
    lambda text: classla_tokenize(re.sub(punctuation_pattern, " ", text))
)
data["classla_nonumbers_tokens"] = data["Comment"].progress_apply(
    lambda text: classla_tokenize(re.sub(number_pattern, " ", text))
)
# Punctuation is removed first, then the digits.
data["classla_nopunctuationnumbers_tokens"] = data["Comment"].progress_apply(
    lambda text: classla_tokenize(
        re.sub(number_pattern, " ", re.sub(punctuation_pattern, " ", text))
    )
)

In [None]:
# Compare the punctuation/number removal variants for every label granularity.
cleaned_columns = [
    "classla_nopunctuation_tokens",
    "classla_nonumbers_tokens",
    "classla_nopunctuationnumbers_tokens",
]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in cleaned_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_cc,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

### Read embeddings cc_sr_300

In [None]:
# Load the cc.sr.300 vectors; vectors_name becomes the score-label prefix for the following loop.
embedding_file, vectors_name = ("embeddings/cc.sr.300.vec", "cc_sr_300")
print(f"Start reading {embedding_file}.")
embeddings_sr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

In [None]:
# Evaluate token columns with the cc_sr_300 vectors. Only the two columns left
# uncommented in the list below are scored; the other entries are disabled.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Try different tokenizers
        scores = {}
        for x_name in [
            #"classla_tokens", "whitespace_tokens", "word_tokens", 
                    # "snake_classla_tokens", "camel_classla_tokens","snake_camel_classla_tokens", 
                    # "stemmed_classla_tokens", "lema_classla_tokens",
                    # "classla_tokens_lower", 
                    # "classla_nopunctuation_tokens", 
                    "classla_nonumbers_tokens", "classla_nopunctuationnumbers_tokens"]:
            score = evaluate(data, x_name, y_name, result_file, vectors_name+"-"+x_name, model_name, estimator, hyper_params, embeddings_sr)
            scores[x_name] = score
        # Report the column with the highest mean score for this model/label pair.
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

### Read embeddings embed.hr-token.ft.sg

In [None]:
# Load the embed.hr-token.ft.sg vectors; vectors_name becomes the score-label prefix for the following loops.
embedding_file, vectors_name = ("embeddings/embed.hr-token.ft.sg.vec", "embed_hr")
print(f"Start reading {embedding_file}.")
embeddings_hr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

In [None]:
# Score every token column created so far with the embed_hr vectors.
hr_columns = [
    "classla_tokens", "whitespace_tokens", "word_tokens",
    "snake_classla_tokens", "camel_classla_tokens", "snake_camel_classla_tokens",
    "stemmed_classla_tokens", "lema_classla_tokens",
    "classla_tokens_lower",
    "classla_nopunctuation_tokens", "classla_nonumbers_tokens", "classla_nopunctuationnumbers_tokens",
]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in hr_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_hr,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

In [None]:
# Lower-case every lemma.
data["lower_lema_classla_tokens"] = data["lema_classla_tokens"].apply(
    lambda lemma_list: [lemma.lower() for lemma in lemma_list]
)

In [None]:
# Score the lower-cased lemmas with the embed_hr vectors.
lower_lemma_columns = ["lower_lema_classla_tokens"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in lower_lemma_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_hr,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

In [None]:
# Switch back to the cc.sr.300 vectors: resets vectors_name and reads the file into embeddings_sr again.
embedding_file, vectors_name = ("embeddings/cc.sr.300.vec", "cc_sr_300")
print(f"Start reading {embedding_file}.")
embeddings_sr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

In [None]:
# Score the lower-cased lemmas with the vectors held in embeddings_sr.
lower_lemma_columns = ["lower_lema_classla_tokens"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in lower_lemma_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_sr,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

In [None]:
# Switch back to the cc.sh.300 vectors: resets vectors_name and reads the file into embeddings_cc again.
embedding_file, vectors_name = ("embeddings/cc.sh.300.vec", "cc_sh_300")
print(f"Start reading {embedding_file}.")
embeddings_cc = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

In [None]:
# Score the lower-cased lemmas with the vectors held in embeddings_cc.
lower_lemma_columns = ["lower_lema_classla_tokens"]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in lower_lemma_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_cc,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

In [None]:
tqdm.pandas()

# Patterns for what is replaced by a space before tokenization and lemmatization:
# anything that is neither a word character nor whitespace, and runs of digits.
punctuation_pattern = r"[^\w\s]"
number_pattern = r"[0-9]+"

data["lema_classla_nopunctuation_tokens"] = data["Comment"].progress_apply(
    lambda text: lemma_sr(classla_tokenize(re.sub(punctuation_pattern, " ", text)))
)
data["lema_classla_nonumbers_tokens"] = data["Comment"].progress_apply(
    lambda text: lemma_sr(classla_tokenize(re.sub(number_pattern, " ", text)))
)
# Punctuation is removed first, then the digits.
data["lema_classla_nopunctuationnumbers_tokens"] = data["Comment"].progress_apply(
    lambda text: lemma_sr(
        classla_tokenize(
            re.sub(number_pattern, " ", re.sub(punctuation_pattern, " ", text))
        )
    )
)

In [None]:
# Add a lower-cased "lower_<column>" twin for each cleaned lemma column.
for source_column in (
    "lema_classla_nopunctuation_tokens",
    "lema_classla_nonumbers_tokens",
    "lema_classla_nopunctuationnumbers_tokens",
):
    data[f"lower_{source_column}"] = data[source_column].apply(
        lambda lemma_list: [lemma.lower() for lemma in lemma_list]
    )

In [None]:
# Evaluate the cleaned lemma columns (original case and lower-cased) with each embedding file.
for (embedding_file, vectors_name) in [("embeddings/embed.hr-token.ft.sg.vec", "embed_hr"), ("embeddings/cc.sr.300.vec", "cc_sr_300"), ("embeddings/cc.sh.300.vec", "cc_sh_300")]:
    print(f"Start reading {embedding_file}.")
    embeddings = KeyedVectors.load_word2vec_format(embedding_file)
    print(f"End reading {embedding_file}.")

    for model_name, (estimator, hyper_params) in evaluation_params.items():
        for y_name in ["y8", "y6", "y2"]:
            # Try different tokenizers
            scores = {}
            # The last entry is the lower-cased *lemma* column. The non-lemma column
            # "lower_classla_nopunctuationnumbers_tokens" does not belong to this
            # comparison and does not exist yet when this cell runs in order.
            for x_name in ["lema_classla_nopunctuation_tokens", "lema_classla_nonumbers_tokens", "lema_classla_nopunctuationnumbers_tokens", 
                            "lower_lema_classla_nopunctuation_tokens", "lower_lema_classla_nonumbers_tokens", "lower_lema_classla_nopunctuationnumbers_tokens"]:
                score = evaluate(data, x_name, y_name, result_file, vectors_name+"-"+x_name, model_name, estimator, hyper_params, embeddings)
                scores[x_name] = score
            # Report the column with the highest mean score for this model/label pair.
            print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

In [None]:
# Add a lower-cased "lower_<column>" twin for each cleaned classla token column.
for source_column in (
    "classla_nopunctuation_tokens",
    "classla_nonumbers_tokens",
    "classla_nopunctuationnumbers_tokens",
):
    data[f"lower_{source_column}"] = data[source_column].apply(
        lambda token_list: [item.lower() for item in token_list]
    )

In [None]:
# Evaluate the lower-cased cleaned classla columns with each embedding file in turn.
embedding_sources = [
    ("embeddings/embed.hr-token.ft.sg.vec", "embed_hr"),
    ("embeddings/cc.sr.300.vec", "cc_sr_300"),
    ("embeddings/cc.sh.300.vec", "cc_sh_300"),
]
lower_cleaned_columns = [
    "lower_classla_nopunctuation_tokens",
    "lower_classla_nonumbers_tokens",
    "lower_classla_nopunctuationnumbers_tokens",
]
for embedding_file, vectors_name in embedding_sources:
    print(f"Start reading {embedding_file}.")
    embeddings = KeyedVectors.load_word2vec_format(embedding_file)
    print(f"End reading {embedding_file}.")

    for model_name, (estimator, hyper_params) in evaluation_params.items():
        for y_name in ("y8", "y6", "y2"):
            scores = {}
            for x_name in lower_cleaned_columns:
                score = evaluate(
                    data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                    model_name, estimator, hyper_params, embeddings,
                )
                scores[x_name] = score
            best_column = max(scores, key=scores.get)
            print(f"{model_name}-{y_name}-best-{best_column}")

In [None]:
# Load the embed.sr-token.ft.sg vectors.
# NOTE(review): this rebinds embeddings_sr, which earlier cells used for the cc.sr.300 vectors.
(embedding_file, vectors_name) = ("embeddings/embed.sr-token.ft.sg.vec", "embed_sr")
print(f"Start reading {embedding_file}.")
embeddings_sr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

In [None]:
# Score every token column with the vectors currently held in embeddings_sr.
all_token_columns = [
    "classla_tokens", "whitespace_tokens", "word_tokens",
    "snake_classla_tokens", "camel_classla_tokens", "snake_camel_classla_tokens",
    "stemmed_classla_tokens", "lema_classla_tokens",
    "classla_tokens_lower", "lower_lema_classla_tokens",
    "lema_classla_nopunctuation_tokens", "lema_classla_nonumbers_tokens", "lema_classla_nopunctuationnumbers_tokens",
    "lower_lema_classla_nopunctuation_tokens", "lower_lema_classla_nonumbers_tokens", "lower_lema_classla_nopunctuationnumbers_tokens",
    "classla_nopunctuation_tokens", "classla_nonumbers_tokens", "classla_nopunctuationnumbers_tokens",
    "lower_classla_nopunctuation_tokens", "lower_classla_nonumbers_tokens", "lower_classla_nopunctuationnumbers_tokens",
]
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in all_token_columns:
            score = evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_sr,
            )
            scores[x_name] = score
        best_column = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_column}")

### Per programming language analysis

Best: Classla tokens, lower

In [19]:
# Lower-cased classla tokens: the input column of the per-programming-language loop below.
data["classla_tokens_lower"] = data["classla_tokens"].apply(lambda tokens: [token.lower() for token in tokens])

In [20]:
# From here on, scores are appended to a separate per-programming-language file.
result_file = "./results/boe_SR_per_language.csv"

embed_sr

In [21]:
# Load the embed.sr-token.ft.sg vectors into embeddings_sr.
(embedding_file, vectors_name) = ("embeddings/embed.sr-token.ft.sg.vec", "embed_sr")
print(f"Start reading {embedding_file}.")
embeddings_sr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

Start reading embeddings/embed.sr-token.ft.sg.vec.
End reading embeddings/embed.sr-token.ft.sg.vec.


embed_hr

In [22]:
# Load the embed.hr-token.ft.sg vectors into embeddings_hr.
embedding_file, vectors_name = ("embeddings/embed.hr-token.ft.sg.vec", "embed_hr")
print(f"Start reading {embedding_file}.")
embeddings_hr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

Start reading embeddings/embed.hr-token.ft.sg.vec.
End reading embeddings/embed.hr-token.ft.sg.vec.


cc_sh_300

In [23]:
# Load the cc.sh.300 vectors into embeddings_cc.
embedding_file, vectors_name = ("embeddings/cc.sh.300.vec", "cc_sh_300")
print(f"Start reading {embedding_file}.")
embeddings_cc = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

Start reading embeddings/cc.sh.300.vec.
End reading embeddings/cc.sh.300.vec.


cc_sr_300

In [24]:
# Load the cc.sr.300 vectors.
# NOTE(review): this rebinds embeddings_sr, which the "embed_sr" cell above assigned to the
# embed.sr-token.ft.sg vectors; after this cell the name refers to the cc.sr.300 vectors only.
embedding_file, vectors_name = ("embeddings/cc.sr.300.vec", "cc_sr_300")
print(f"Start reading {embedding_file}.")
embeddings_sr = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

Start reading embeddings/cc.sr.300.vec.
End reading embeddings/cc.sr.300.vec.


In [None]:
from sklearn.metrics import f1_score

# Leave-one-programming-language-out evaluation: for each language, train on the comments
# of all other languages and report macro F1 on the held-out language.

# `embeddings_sr` was last assigned the cc.sr.300 vectors, so the embed.sr vectors are read
# again under their own name. Pairing `embeddings_sr` with the "embed_sr" label would write
# cc_sr_300 results under the wrong name.
embeddings_embed_sr = KeyedVectors.load_word2vec_format("embeddings/embed.sr-token.ft.sg.vec")

embedding_sets = [
    (embeddings_embed_sr, "embed_sr"),
    (embeddings_hr, "embed_hr"),
    (embeddings_cc, "cc_sh_300"),
    (embeddings_sr, "cc_sr_300"),
]

for embeddings, embedding_name in embedding_sets:
    for lang_name in ['C', 'C++', 'C#', 'Java', 'TypeScript', 'Python', 'SQL']:
        data_train = data[data.ProgrammingLanguageID != lang_name]
        data_test = data[data.ProgrammingLanguageID == lang_name]
        if data_test.empty:
            # An empty held-out split cannot be vectorized or scored.
            print(f"No test comments for {lang_name}, skipping.")
            continue

        # The hyper-parameter grid is ignored here: the estimator is fitted as configured.
        for model_name, (estimator, _) in evaluation_params.items():
            for y_name in ["y8", "y6", "y2"]:
                # Try different cases
                for x_name in ["classla_tokens_lower"]:
                    X = data_train[x_name]
                    y = data_train[y_name]
                    # Vectorize and drop the comments without any embedding.
                    X, nan_comments_mask = vectorize(X, embeddings)
                    X = X[~nan_comments_mask]
                    y = y[~nan_comments_mask]

                    score_name = make_score_name(f"{lang_name}-{embedding_name}-{x_name}", model_name, y.nunique())
                    print("Evaluation ", score_name)

                    pipeline = estimator
                    pipeline.fit(X, y)

                    X_test = data_test[x_name]
                    y_test = data_test[y_name]
                    # Vectorize the held-out language the same way.
                    X_test, nan_comments_mask = vectorize(X_test, embeddings)
                    X_test = X_test[~nan_comments_mask]
                    y_test = y_test[~nan_comments_mask]

                    y_pred = pipeline.predict(X_test)
                    score = f1_score(y_test, y_pred, average="macro")

                    # Same row format as the rest of the notebook, via the shared helper.
                    write_results(result_file, score_name, score)