In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
import pandas as pd
import os
import numpy as np
from gensim.models import KeyedVectors

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
from string import punctuation
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline


### Read data

In [25]:
# Location of the comment corpus; every column is read with pandas' "string" dtype.
input_file = "./data/corpus.csv"
# Read from `input_file` so the path is defined in exactly one place.
corpus = pd.read_csv(input_file, dtype="string")

In [26]:
# Preview the first rows of the loaded corpus.
corpus.head()

In [27]:
# Keep only the rows whose NaturalLanguageID equals the selected language.
# .copy() makes `data` an independent frame: later cells add token columns to
# it, and assigning columns to a bare slice of `corpus` triggers pandas'
# SettingWithCopyWarning.
language = "EN"
data = corpus[corpus.NaturalLanguageID == language].copy()

In [28]:
# List the columns available in the language-filtered frame.
data.columns

In [6]:
# data = data[["Comment", "y8", "y6", "y2"]]

### Embeddings

In [7]:
# Running token counters: get_comment_vector() increments them and
# vectorize() resets both to 0 at the start of every call.
total_tokens = 0
total_tokens_without_embedding = 0

In [8]:
def get_comment_vector(embeddings, comment):
    """Return the mean embedding of the tokens in ``comment``.

    Tokens that are missing from ``embeddings`` are skipped. As a side effect
    the module-level counters ``total_tokens`` and
    ``total_tokens_without_embedding`` are incremented.

    Args:
        embeddings: Object supporting ``embeddings[token]`` that raises
            ``KeyError`` for unknown tokens (a dict works; presumably the
            KeyedVectors loaded in this notebook behave the same -- confirm).
        comment: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors that were found, or ``np.nan``
        when no token of ``comment`` has an embedding.
    """
    global total_tokens, total_tokens_without_embedding

    vector_sum = None
    num_tokens = 0

    for token in comment:
        total_tokens += 1
        try:
            token_vector = embeddings[token]
        except KeyError:
            # Only an out-of-vocabulary lookup is expected here. Any other
            # error (or Ctrl-C) must surface instead of being counted as a
            # missing embedding.
            total_tokens_without_embedding += 1
            continue
        if vector_sum is None:
            vector_sum = token_vector
        else:
            # np.add allocates a new array, so the stored vectors are never mutated.
            vector_sum = np.add(vector_sum, token_vector)
        num_tokens += 1

    if num_tokens == 0:
        return np.nan
    return np.asarray(vector_sum) / num_tokens

In [9]:
def vectorize(comments, embeddings):
    """Turn a Series of token lists into a frame of mean embedding vectors.

    Resets the module-level counters ``total_tokens`` and
    ``total_tokens_without_embedding``, vectorizes every comment with
    ``get_comment_vector`` and prints coverage statistics.

    Args:
        comments: pandas Series whose values are iterables of tokens.
        embeddings: Embedding lookup passed through to ``get_comment_vector``.

    Returns:
        Tuple ``(X_vectors, nan_comments_mask)``. ``X_vectors`` has one row per
        comment (all-NaN when no token had an embedding) and
        ``nan_comments_mask`` is a boolean Series marking those NaN rows.
    """
    global total_tokens, total_tokens_without_embedding

    total_tokens = 0
    total_tokens_without_embedding = 0

    if len(comments) == 0:
        # Nothing to vectorize: return empty, correctly indexed results
        # instead of failing on the column lookup below.
        X_vectors = pd.DataFrame(index=comments.index)
        nan_comments_mask = pd.Series([], index=comments.index, dtype=bool)
    else:
        X_vectors = comments.apply(lambda comment: get_comment_vector(embeddings, comment))
        # Expand each vector into one column per dimension.
        X_vectors = X_vectors.apply(pd.Series)
        # A comment without any embedding yields NaN in its first column.
        nan_comments_mask = X_vectors.isnull().iloc[:, 0]

    num_nan_comments = nan_comments_mask.sum()
    print(f"{num_nan_comments} comments have no embeddings")

    # Guard the percentage: every comment may be empty, leaving total_tokens at 0.
    missing_pct = total_tokens_without_embedding / total_tokens * 100 if total_tokens else 0.0
    print (f"Total tokens {total_tokens}, out of which {total_tokens_without_embedding} do not have embeddings -- {missing_pct:.4}%.")
    return X_vectors, nan_comments_mask


### Evaluate

In [10]:
def write_results(result_file, score_name, score_value):
    """Append one ``score_name, score_value`` row to the CSV ``result_file``.

    The row is written without header or index, with "," as the decimal mark.
    The parent directory is created when missing, so a finished evaluation is
    not lost because the results folder does not exist yet.

    Args:
        result_file: Path of the CSV file to append to.
        score_name: Label stored in the first column.
        score_value: Score stored in the second column.
    """
    result_dir = os.path.dirname(result_file)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    pd.DataFrame(
        {"score_name": [score_name],
        "score_value": [score_value]}
    ).to_csv(result_file, mode="a", decimal=",", header=False, index=False)

In [11]:
def make_score_name(score_name, model_name, num_classes):
    """Build the result label ``<score_name>-<model_name>-<num_classes>``."""
    parts = (score_name, model_name, num_classes)
    return "-".join(format(part) for part in parts)

In [12]:
def evaluate(data, x_column_name, y_column_name, result_file, score_name, model_name, estimator, hyper_params, embeddings):
    """Score ``estimator`` on averaged token embeddings and log the result.

    The token column is vectorized with ``vectorize``; comments without any
    embedding are dropped. A ``GridSearchCV`` over ``hyper_params`` (10-fold,
    macro F1) is then scored with ``cross_validate`` (10-fold, macro F1), and
    the mean test score is appended to ``result_file``.

    Args:
        data: DataFrame holding the token column and the label column.
        x_column_name: Name of the column with token lists.
        y_column_name: Name of the label column.
        result_file: CSV file the score row is appended to.
        score_name: Prefix of the label stored with the score.
        model_name: Model identifier used in the score label.
        estimator: Estimator tuned by the grid search.
        hyper_params: Parameter grid passed to ``GridSearchCV``.
        embeddings: Embedding lookup passed to ``vectorize``.

    Returns:
        Mean macro-F1 over the outer cross-validation folds.
    """
    print(estimator)

    # Vectorize, then keep only the comments with at least one known token.
    X, nan_comments_mask = vectorize(data[x_column_name], embeddings)
    keep_mask = ~nan_comments_mask
    X = X[keep_mask]
    y = data[y_column_name][keep_mask]

    full_score_name = make_score_name(score_name, model_name, y.nunique())
    print(f"--------Evaluating {full_score_name} --------")
    gs_estimator = GridSearchCV(
        estimator, hyper_params, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)

    scores = cross_validate(
        gs_estimator, X, y, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)
    mean_score = np.mean(scores["test_score"])

    # Use the notebook's shared writer instead of repeating its body here.
    write_results(result_file, full_score_name, mean_score)

    return mean_score

### Models

In [35]:
# Models to evaluate: name -> (estimator, hyper-parameter grid).
# The evaluation loops below unpack each entry and pass both parts to evaluate().
evaluation_params = {
    "svm": (LinearSVC(), {"C": [0.001, 0.01, 0.1, 1, 10]}), 
}

In [None]:
# CSV file passed to evaluate() as the destination for score rows.
result_file = "./results/boe_EN.csv"

### Read embeddings wiki_news_300

In [None]:
# Load the wiki-news vectors; `vectors_name` is the label used in score names.
embedding_file = "embeddings/wiki-news-300d-1M.vec"
vectors_name = "wiki_news_300"

print(f"Start reading {embedding_file}.")
embeddings_wiki = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

### Preprocessing

1. Tokenization

In [14]:
# Preview the language-filtered comments before tokenizing them.
data.head()

In [15]:
# import nltk
# nltk.download('punkt')

In [29]:
def my_nltk_tokenizer(comment):
    """Tokenize ``comment`` with NLTK's ``word_tokenize``, with two fallbacks.

    If ``word_tokenize`` fails, it is retried on the comment with leading and
    trailing punctuation stripped. If that fails too, the comment is split on
    single spaces.

    Args:
        comment: Text to tokenize.

    Returns:
        List of tokens.
    """
    try:
        return word_tokenize(comment)
    except Exception:
        # `Exception` keeps the best-effort fallback but lets KeyboardInterrupt
        # and SystemExit stop a long-running apply().
        print("NLTK tokenization exception for", comment)
        try:
            return word_tokenize(comment.strip(punctuation))
        except Exception:
            print("NLTK tokenization without punctuation exception for", comment)
            return comment.split(" ")

# NLTK word tokens of every comment.
data["nltk_tokens"] = data["Comment"].apply(my_nltk_tokenizer)

In [17]:
# One shared tokenizer instance, reused for every comment.
whitespace_tokenizer = WhitespaceTokenizer()


def my_whitespace_tokenizer(comment):
    """Return the tokens produced by the shared ``WhitespaceTokenizer``."""
    tokens = whitespace_tokenizer.tokenize(comment)
    return tokens

# Whitespace-delimited tokens of every comment.
data["whitespace_tokens"] = data["Comment"].apply(my_whitespace_tokenizer)

In [18]:
def my_word_tokenizer(comment):
    """Return every run of two or more word characters found in ``comment``."""
    return re.findall(r"(?u)\b\w\w+\b", comment)

# Regex word tokens (two or more word characters) of every comment.
data["word_tokens"] = data["Comment"].apply(my_word_tokenizer)

In [None]:
# Compare the three tokenizers with embeddings_wiki, for every model and label column.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_wiki,
            )
            for x_name in ("nltk_tokens", "whitespace_tokens", "word_tokens")
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

2. snake_case/CamelCase/both

In [None]:
def snake_case_tokenize(tokens):
    """Split every token on ``_`` and return the non-empty pieces in order."""
    return [piece for token in tokens for piece in token.split("_") if piece]

In [None]:
# NLTK tokens further split on underscores.
data["snake_nltk_tokens"] = data["nltk_tokens"].apply(snake_case_tokenize)

In [None]:
def camel_case_tokenize(tokens):
    """Split every token at each lowercase-to-uppercase boundary.

    ``"camelCase"`` becomes ``["camel", "Case"]``. Runs of capitals such as
    ``"HTTPServer"`` are left intact because a split happens only when an
    uppercase character directly follows a lowercase one. Empty tokens are
    dropped.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of split tokens. If processing fails, the error is reported and
        ``tokens`` is returned unchanged.
    """
    try:
        output_tokens = []
        for token in tokens:
            if len(token) == 0:
                continue
            pieces = [str(token[0])]
            for c in token[1:]:
                # pieces[-1][-1] is always the character just before `c`.
                if pieces[-1][-1].islower() and c.isupper():
                    pieces.append(str(c))
                else:
                    pieces[-1] += c

            output_tokens.extend(pieces)

        return list(filter(None, output_tokens))
    except Exception:
        # Best-effort: keep the input on unexpected token types, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare `except:` would.
        print("-------------- CAMEL CASE ERROR ------------")
        print(tokens)
        return tokens

In [None]:
# NLTK tokens further split at camelCase boundaries.
data["camel_nltk_tokens"] = data["nltk_tokens"].apply(camel_case_tokenize)

In [None]:
# Underscore-split tokens, additionally split at camelCase boundaries.
data["snake_camel_nltk_tokens"] = data["snake_nltk_tokens"].apply(camel_case_tokenize)

In [None]:
# Compare snake_case / camelCase / combined splitting with embeddings_wiki.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_wiki,
            )
            for x_name in ("snake_nltk_tokens", "camel_nltk_tokens", "snake_camel_nltk_tokens")
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

3. Stemming/Lemmatization

In [None]:
# One shared Porter stemmer, reused for every token list.
stemmer = PorterStemmer()

def stem_en(tokens):
    """Return the Porter stem of every token, in order."""
    return list(map(stemmer.stem, tokens))

In [None]:
# Porter-stemmed NLTK tokens.
data["stem_nltk_tokens"] = data["nltk_tokens"].apply(stem_en)

In [None]:
# One shared WordNet lemmatizer, reused for every token list.
lemmatizer = WordNetLemmatizer()
# Sample string; not referenced by any other visible cell.
text = "studies studying cries cry"

def lemma_en(tokens):
    """Return the WordNet lemma of every token, in order."""
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')

In [None]:
# Lemmatized NLTK tokens.
data["lema_nltk_tokens"] = data["nltk_tokens"].apply(lemma_en)

In [None]:
# Compare lemmatized against stemmed tokens with embeddings_wiki.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_wiki,
            )
            for x_name in ("lema_nltk_tokens", "stem_nltk_tokens")
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

4. Lowercase

In [None]:
# Lower-cased copy of the lemmatized NLTK tokens.
data["lema_nltk_lower_tokens"] = data["lema_nltk_tokens"].apply(
    lambda token_list: [tok.lower() for tok in token_list]
)

In [None]:
# Evaluate the lower-cased lemmatized tokens with embeddings_wiki.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_wiki,
            )
            for x_name in ("lema_nltk_lower_tokens",)
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

4. Lowercase, without lemmatization.

In [None]:
# Lower-cased copy of the plain NLTK tokens (no lemmatization).
data["nltk_lower_tokens"] = data["nltk_tokens"].apply(
    lambda token_list: [tok.lower() for tok in token_list]
)

In [None]:
# Evaluate the lower-cased NLTK tokens with embeddings_wiki.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_wiki,
            )
            for x_name in ("nltk_lower_tokens",)
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

5. Remove punctuation/numbers/both

In [None]:
# Lemmatized NLTK tokens of the comment after every character that is neither
# a word character nor whitespace has been replaced by a space.
data["lema_nltk_nopunctuation_tokens"] = data["Comment"].apply(
    lambda raw_text: lemma_en(my_nltk_tokenizer(re.sub(r"[^\w\s]", " ", raw_text)))
)

In [None]:
# Lemmatized NLTK tokens of the comment after every run of ASCII digits has
# been replaced by a space.
data["lema_nltk_nonumbers_tokens"] = data["Comment"].apply(
    lambda raw_text: lemma_en(my_nltk_tokenizer(re.sub(r"[0-9]+", " ", raw_text)))
)

In [None]:
# Lemmatized NLTK tokens of the comment with punctuation removed first and
# digit runs removed second (each replaced by a space).
data["lema_nltk_nopunctuationnumbers_tokens"] = data["Comment"].apply(
    lambda raw_text: lemma_en(
        my_nltk_tokenizer(re.sub(r"[0-9]+", " ", re.sub(r"[^\w\s]", " ", raw_text)))
    )
)

In [None]:
# Compare punctuation / number / combined removal on lemmatized tokens with embeddings_wiki.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_wiki,
            )
            for x_name in (
                "lema_nltk_nopunctuation_tokens",
                "lema_nltk_nonumbers_tokens",
                "lema_nltk_nopunctuationnumbers_tokens",
            )
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

In [None]:
# Look up the vector stored for the "#" token in embeddings_wiki.
embeddings_wiki["#"]

5. Remove punctuation/numbers/both, without lemmatization.

In [None]:
# The "nltk_no*" token columns are only created further down in this notebook
# (in the crawl_300d_2M section), so on a fresh top-to-bottom run they do not
# exist yet and evaluate() would fail with a KeyError. Build any that are
# missing before evaluating.
no_lemma_cleaners = {
    "nltk_nopunctuation_tokens": lambda comment: re.sub(r"[^\w\s]", " ", comment),
    "nltk_nonumbers_tokens": lambda comment: re.sub(r"[0-9]+", " ", comment),
    "nltk_nopunctuationnumbers_tokens": lambda comment: re.sub(r"[0-9]+", " ", re.sub(r"[^\w\s]", " ", comment)),
}
for column_name, clean in no_lemma_cleaners.items():
    if column_name not in data.columns:
        # `clean=clean` binds the current cleaner to this lambda.
        data[column_name] = data["Comment"].apply(
            lambda comment, clean=clean: my_nltk_tokenizer(clean(comment))
        )

# Compare punctuation / number / combined removal (no lemmatization) with embeddings_wiki.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        scores = {}
        for x_name in no_lemma_cleaners:
            score = evaluate(data, x_name, y_name, result_file, vectors_name+"-"+x_name, model_name, estimator, hyper_params, embeddings_wiki)
            scores[x_name] = score
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

### Read embeddings crawl_300d_2M

In [None]:
# Load the crawl vectors; `vectors_name` is the label used in score names.
embedding_file = "embeddings/crawl-300d-2M.vec"
vectors_name = "crawl_300d_2M"

print(f"Start reading {embedding_file}.")
embeddings_crawl = KeyedVectors.load_word2vec_format(embedding_file)
print(f"End reading {embedding_file}.")

1. Tokenization

In [None]:
# Compare the three tokenizers with embeddings_crawl, for every model and label column.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_crawl,
            )
            for x_name in ("nltk_tokens", "whitespace_tokens", "word_tokens")
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

2. snake_case/CamelCase/both


In [None]:
# Compare snake_case / camelCase / combined splitting with embeddings_crawl.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_crawl,
            )
            for x_name in ("snake_nltk_tokens", "camel_nltk_tokens", "snake_camel_nltk_tokens")
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

3. Stemming/Lemmatization

In [None]:
# Compare lemmatized against stemmed tokens with embeddings_crawl.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_crawl,
            )
            for x_name in ("lema_nltk_tokens", "stem_nltk_tokens")
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

4. Lowercase

In [None]:
# Lower-cased copy of the plain NLTK tokens (same column the wiki section builds).
data["nltk_lower_tokens"] = data["nltk_tokens"].apply(
    lambda token_list: [tok.lower() for tok in token_list]
)

In [None]:
# Evaluate the lower-cased NLTK tokens with embeddings_crawl.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_crawl,
            )
            for x_name in ("nltk_lower_tokens",)
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

5. Remove punctuation/numbers/both

In [None]:
# NLTK tokens of the comment after every character that is neither a word
# character nor whitespace has been replaced by a space.
data["nltk_nopunctuation_tokens"] = data["Comment"].apply(
    lambda raw_text: my_nltk_tokenizer(re.sub(r"[^\w\s]", " ", raw_text))
)

In [None]:
# NLTK tokens of the comment after every run of ASCII digits has been
# replaced by a space.
data["nltk_nonumbers_tokens"] = data["Comment"].apply(
    lambda raw_text: my_nltk_tokenizer(re.sub(r"[0-9]+", " ", raw_text))
)

In [None]:
# NLTK tokens of the comment with punctuation removed first and digit runs
# removed second (each replaced by a space).
data["nltk_nopunctuationnumbers_tokens"] = data["Comment"].apply(
    lambda raw_text: my_nltk_tokenizer(
        re.sub(r"[0-9]+", " ", re.sub(r"[^\w\s]", " ", raw_text))
    )
)

In [None]:
# Compare punctuation / number / combined removal (no lemmatization) with embeddings_crawl.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {
            x_name: evaluate(
                data, x_name, y_name, result_file, f"{vectors_name}-{x_name}",
                model_name, estimator, hyper_params, embeddings_crawl,
            )
            for x_name in (
                "nltk_nopunctuation_tokens",
                "nltk_nonumbers_tokens",
                "nltk_nopunctuationnumbers_tokens",
            )
        }
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

### Per programming language analysis

Best: NLTK tokens

In [21]:
# From here on, score rows go to a separate per-programming-language CSV file.
result_file = "./results/boe_EN_per_language.csv"

CRAWL

In [22]:
embedding_file, vectors_name = ("embeddings/crawl-300d-2M.vec", "crawl_300d_2M")
# An earlier section of this notebook loads the same file into
# `embeddings_crawl`. Reuse it when it is still in the kernel instead of
# parsing the whole vector file a second time. Load it only when this section
# is run on its own.
if "embeddings_crawl" in globals():
    print(f"{embedding_file} is already loaded; reusing embeddings_crawl.")
else:
    print(f"Start reading {embedding_file}.")
    embeddings_crawl = KeyedVectors.load_word2vec_format(embedding_file)
    print(f"End reading {embedding_file}.")

Start reading embeddings/crawl-300d-2M.vec.
End reading embeddings/crawl-300d-2M.vec.


WIKI NEWS

In [23]:
embedding_file, vectors_name = ("embeddings/wiki-news-300d-1M.vec", "wiki_news_300")
# An earlier section of this notebook loads the same file into
# `embeddings_wiki`. Reuse it when it is still in the kernel instead of
# parsing the whole vector file a second time. Load it only when this section
# is run on its own.
if "embeddings_wiki" in globals():
    print(f"{embedding_file} is already loaded; reusing embeddings_wiki.")
else:
    print(f"Start reading {embedding_file}.")
    embeddings_wiki = KeyedVectors.load_word2vec_format(embedding_file)
    print(f"End reading {embedding_file}.")

Start reading embeddings/wiki-news-300d-1M.vec.
End reading embeddings/wiki-news-300d-1M.vec.


In [None]:
from sklearn.base import clone
from sklearn.metrics import f1_score

# Leave-one-programming-language-out evaluation: train on every other
# language, test on the held-out one, and append the macro F1 to result_file.
for embeddings, embedding_name in [(embeddings_crawl, "crawl_300d_2M"), (embeddings_wiki, "wiki_news_300")]:
    for lang_name in ['C', 'C++', 'C#', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'Python', 'SQL']:
        data_train = data[data.ProgrammingLanguageID != lang_name]
        data_test = data[data.ProgrammingLanguageID == lang_name]

        # A language without comments (or one that covers the whole corpus)
        # cannot be scored; skip it instead of crashing in vectorize()/predict().
        if data_train.empty or data_test.empty:
            print(f"Skipping {lang_name}: empty train or test split.")
            continue

        # Vectors depend only on (embeddings, language, token column), not on
        # the model or the label, so compute them once per token column.
        vectorized = {}

        for model_name, (estimator, _) in evaluation_params.items():
            for y_name in ["y8", "y6", "y2"]:
                # Try different cases
                for x_name in ["nltk_tokens"]:
                    if x_name not in vectorized:
                        X_train_all, train_nan_mask = vectorize(data_train[x_name], embeddings)
                        X_test_all, test_nan_mask = vectorize(data_test[x_name], embeddings)
                        vectorized[x_name] = (
                            X_train_all[~train_nan_mask], ~train_nan_mask,
                            X_test_all[~test_nan_mask], ~test_nan_mask,
                        )
                    X, train_keep_mask, X_test, test_keep_mask = vectorized[x_name]

                    # Drop the labels of comments that have no embedding.
                    y = data_train[y_name][train_keep_mask]
                    y_test = data_test[y_name][test_keep_mask]

                    score_name = make_score_name(f"{lang_name}-{embedding_name}-{x_name}", model_name, y.nunique())
                    print("Evaluation ", score_name)

                    if X_test.empty:
                        print(f"Skipping {score_name}: no test comment has an embedding.")
                        continue

                    # Fit a fresh copy so the shared estimator stored in
                    # evaluation_params is never left in a fitted state.
                    pipeline = clone(estimator)
                    pipeline.fit(X, y)

                    y_pred = pipeline.predict(X_test)
                    score = f1_score(y_test, y_pred, average="macro")

                    # Use the notebook's shared writer instead of repeating its body here.
                    write_results(result_file, score_name, score)