In [1]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

import pandas as pd
import numpy as np
from string import punctuation
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
import re

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline


### Read data

In [2]:
# Path of the comment corpus. Every column is loaded with pandas' "string"
# dtype, so label columns are strings as well.
input_file = "./data/corpus.csv"
corpus = pd.read_csv(input_file, dtype="string")

In [3]:
# Preview the first rows of the loaded corpus.
corpus.head()

### EN data

In [22]:
# Keep only the rows whose natural language matches `language`.
language = "EN"
# Later cells add token columns to `data`; take an explicit copy so those
# assignments act on an independent frame rather than on a slice of `corpus`
# (which triggers pandas' SettingWithCopyWarning).
data = corpus[corpus.NaturalLanguageID == language].copy()

In [23]:
# List the columns available in the language-filtered frame.
data.columns

In [6]:
# data = data[["Comment", "y8", "y6", "y2"]]

### Write tokens

In [7]:
def write_tokens(name, data, lang=language):
    """Dump ``data`` to ``data/tokens/{lang}_{name}.csv`` (index omitted).

    Parameters
    ----------
    name : str
        Label used in the output file name and in the progress message.
    data : pandas.DataFrame
        Frame to write; its ``shape`` is reported before writing.
    lang : str, optional
        Prefix of the output file name. The default is the value the global
        ``language`` had when this function was defined.

    Notes
    -----
    The target directory is not created here.
    """
    target_csv = f"data/tokens/{lang}_{name}.csv"
    print("Writing {} {} to {}.".format(name, data.shape, target_csv))
    data.to_csv(target_csv, index=False)

### Evaluate

In [8]:
def write_results(result_file, score_name, score_value):
    """Append a single ``score_name, score_value`` row to ``result_file``.

    The row is written in CSV form without header or index, in append mode,
    using ``,`` as the decimal mark.

    Parameters
    ----------
    result_file : str
        Path of the CSV file to append to.
    score_name : str
        Identifier stored in the first column.
    score_value : number
        Score stored in the second column.
    """
    row = pd.DataFrame({"score_name": [score_name], "score_value": [score_value]})
    row.to_csv(result_file, mode="a", decimal=",", header=False, index=False)

In [9]:
def make_score_name(score_name, model_name, num_classes):
    """Build the result identifier ``<score_name>-<model_name>-<num_classes>``."""
    return "{}-{}-{}".format(score_name, model_name, num_classes)

In [10]:
def evaluate(data, x_column_name, y_column_name, result_file, score_name, model_name, estimator, hyper_params):
    """Score ``estimator`` on one feature/label column pair with nested cross-validation.

    A ``GridSearchCV`` over ``hyper_params`` (inner loop) is itself evaluated
    with ``cross_validate`` (outer loop). Both loops use a shuffled 10-fold
    ``StratifiedKFold`` seeded with 42 and macro-averaged F1 as the metric.
    The mean outer test score is appended to ``result_file`` and returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the feature and label columns.
    x_column_name : str
        Column of ``data`` fed to the estimator as ``X``.
    y_column_name : str
        Column of ``data`` used as the target ``y``.
    result_file : str
        CSV file that the score row is appended to.
    score_name : str
        Experiment label; combined with ``model_name`` and the number of
        distinct labels in ``y`` to form the stored score name.
    model_name : str
        Short model identifier used in the score name.
    estimator : estimator or pipeline
        Object handed to ``GridSearchCV``.
    hyper_params : dict
        Parameter grid for ``GridSearchCV``.

    Returns
    -------
    float
        Mean of the outer-fold ``test_score`` values.
    """
    print(estimator)

    X = data[x_column_name]
    y = data[y_column_name]

    full_score_name = make_score_name(score_name, model_name, y.nunique())
    print(f"--------Evaluating {full_score_name} --------")

    # Inner loop: pick the hyper-parameters on each outer training split.
    gs_estimator = GridSearchCV(
        estimator, hyper_params, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)

    # Outer loop: score the tuned estimator on the held-out folds.
    scores = cross_validate(
        gs_estimator, X, y, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)
    mean_score = np.mean(scores["test_score"])

    # Reuse the shared helper instead of repeating its DataFrame/to_csv body.
    write_results(result_file, full_score_name, mean_score)

    return mean_score

### Models

In [11]:
# Values tried for the single tuned hyper-parameter of every model.
_GRID_VALUES = (0.001, 0.01, 0.1, 1, 10)

# model key -> (estimator, parameter grid). The grid keys carry the step-name
# prefix ("linearsvc__", ...) because the estimators are used inside pipelines
# built with make_pipeline.
evaluation_params = dict(
    svm=(LinearSVC(), {"linearsvc__C": list(_GRID_VALUES)}),
    log=(LogisticRegression(max_iter=800), {"logisticregression__C": list(_GRID_VALUES)}),
    mnb=(MultinomialNB(), {"multinomialnb__alpha": list(_GRID_VALUES)}),
)

In [12]:
# CSV file handed to evaluate() by the comparison loops; one score row is appended per run.
result_file = "./results/bow_EN.csv"

1. Tokenization

In [24]:
def my_nltk_tokenizer(comment):
    """Tokenize one comment with NLTK, falling back to simpler strategies on failure.

    Attempts, in order:

    1. ``word_tokenize(comment)``
    2. ``word_tokenize`` on the comment with leading/trailing punctuation stripped
    3. a plain split on single spaces

    Parameters
    ----------
    comment : str
        Raw comment text. A missing value (``None``, ``pd.NA``, NaN) yields an
        empty token list instead of raising in the last fallback.

    Returns
    -------
    list of str
        The tokens produced by the first strategy that succeeds.
    """
    if not isinstance(comment, str) and pd.isna(comment):
        return []

    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit still stop a long-running `.apply`.
    try:
        return word_tokenize(comment)
    except Exception:
        print("NLTK tokenization exception for", comment)

    try:
        return word_tokenize(comment.strip(punctuation))
    except Exception:
        print("NLTK tokenization without punctuation exception for", comment)

    return comment.split(" ")

# NLTK-based token list for every comment.
data["nltk_tokens"] = data["Comment"].apply(my_nltk_tokenizer)

In [14]:
# One shared tokenizer instance, reused for every comment.
whitespace_tokenizer = WhitespaceTokenizer()


def my_whitespace_tokenizer(comment):
    """Tokenize ``comment`` with the shared NLTK ``WhitespaceTokenizer`` instance."""
    tokens = whitespace_tokenizer.tokenize(comment)
    return tokens

# Whitespace-delimited token list for every comment.
data["whitespace_tokens"] = data["Comment"].apply(my_whitespace_tokenizer)

In [15]:
# Compiled once at definition time instead of on every call: runs of two or
# more word characters delimited by word boundaries.
_WORD_TOKEN_PATTERN = re.compile(r"(?u)\b\w\w+\b")


def my_word_tokenizer(comment):
    """Return every run of two or more word characters found in ``comment``.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    list of str
        Matches in order of appearance; single-character words and
        punctuation are dropped.
    """
    return _WORD_TOKEN_PATTERN.findall(comment)

# Regex-based word token list for every comment.
data["word_tokens"] = data["Comment"].apply(my_word_tokenizer)

In [16]:
def dummy_tokenize(tokens):
    """Identity tokenizer: hand back ``tokens`` untouched.

    Passed as ``tokenizer=`` to ``CountVectorizer`` so that columns which
    already hold token lists are not tokenized a second time.
    """
    return tokens

In [None]:
# Compare the three tokenizers: for every model and label column, evaluate each
# token column and print the name of the best-scoring one.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in ("nltk_tokens", "whitespace_tokens", "word_tokens"):
            # The columns already hold token lists, hence the pass-through tokenizer.
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

2. snake_case/CamelCase/both

In [17]:
def snake_case_tokenize(tokens):
    """Split every token on ``_`` and return the non-empty pieces in order."""
    return [piece for token in tokens for piece in token.split("_") if piece]

In [None]:
# NLTK tokens with snake_case identifiers split on underscores.
data["snake_nltk_tokens"] = data["nltk_tokens"].apply(snake_case_tokenize)

In [None]:
def camel_case_tokenize(tokens):
    """Split camelCase tokens at every lower-to-upper case transition.

    A new piece starts whenever an uppercase character directly follows a
    lowercase one, so ``"camelCase"`` becomes ``["camel", "Case"]`` while
    ``"HTTPServer"`` stays whole. Empty tokens are dropped.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to split.

    Returns
    -------
    list of str
        The split tokens in order. If splitting raises an ordinary exception
        (for example because a token is not a string), a diagnostic is printed
        and ``tokens`` is returned unchanged.
    """
    try:
        output_tokens = []
        for token in tokens:
            if len(token) == 0:
                continue
            new_tokens = [str(token[0])]
            for c in token[1:]:
                # Boundary: previous character lowercase, current uppercase.
                if new_tokens[-1][-1].islower() and c.isupper():
                    new_tokens.append(str(c))
                else:
                    new_tokens[-1] += c

            output_tokens.extend(new_tokens)

        return list(filter(None, output_tokens))
    except Exception:
        # Best-effort fallback. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit are not swallowed during a long apply.
        print("-------------- CAMEL CASE ERROR ------------")
        print(tokens)
        return tokens

In [None]:
# NLTK tokens with camelCase identifiers split at case transitions.
data["camel_nltk_tokens"] = data["nltk_tokens"].apply(camel_case_tokenize)

In [None]:
# Both splits combined: camelCase splitting applied to the snake_case-split tokens.
data["snake_camel_nltk_tokens"] = data["snake_nltk_tokens"].apply(camel_case_tokenize)

In [None]:
# Compare identifier-splitting variants against the plain NLTK tokens and print
# the best-scoring column for every model / label column.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in ("nltk_tokens", "snake_nltk_tokens", "camel_nltk_tokens", "snake_camel_nltk_tokens"):
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

3. Stemming/Lemmatization

In [None]:
# One shared Porter stemmer, reused for every token list.
stemmer = PorterStemmer()


def stem_en(tokens):
    """Return the stem of every token, in input order, using the shared ``stemmer``."""
    return list(map(stemmer.stem, tokens))

In [None]:
# Stemmed version of the NLTK tokens.
data["stem_nltk_tokens"] = data["nltk_tokens"].apply(stem_en)

In [None]:
# One shared WordNet lemmatizer, reused for every token list.
lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"  # sample phrase; not referenced by the code visible here


def lemma_en(tokens):
    """Return the lemma of every token, in input order, using the shared ``lemmatizer``."""
    return list(map(lemmatizer.lemmatize, tokens))

In [None]:
# Lemmatized version of the NLTK tokens (the column name keeps its original spelling).
data["lema_nltk_tokens"] = data["nltk_tokens"].apply(lemma_en)

In [None]:
# Compare raw, lemmatized and stemmed tokens and print the best-scoring column
# for every model / label column.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in ("nltk_tokens", "lema_nltk_tokens", "stem_nltk_tokens"):
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

4. Lowercase

In [None]:
# Lowercased copy of every NLTK token list.
data["nltk_tokens_lower"] = data["nltk_tokens"].apply(
    lambda token_list: [tok.lower() for tok in token_list]
)

In [None]:
# Compare case-preserving and lowercased tokens and print the best-scoring
# column for every model / label column.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in ("nltk_tokens", "nltk_tokens_lower"):
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

5. Remove punctuation/numbers/both

In [None]:
# Token columns built from comments in which punctuation, digit runs, or both
# were replaced by a space before NLTK tokenization.
data["nltk_nopunctuation_tokens"] = data["Comment"].apply(
    lambda comment: my_nltk_tokenizer(re.sub(r"[^\w\s]", " ", comment))
)
data["nltk_nonumbers_tokens"] = data["Comment"].apply(
    lambda comment: my_nltk_tokenizer(re.sub(r"[0-9]+", " ", comment))
)
data["nltk_nopunctuationnumbers_tokens"] = data["Comment"].apply(
    # Punctuation is removed first, then digit runs.
    lambda comment: my_nltk_tokenizer(
        re.sub(r"[0-9]+", " ", re.sub(r"[^\w\s]", " ", comment))
    )
)

In [None]:
# Compare the punctuation/number-stripped token columns and print the
# best-scoring one for every model / label column.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ("y8", "y6", "y2"):
        scores = {}
        for x_name in ("nltk_nopunctuation_tokens", "nltk_nonumbers_tokens", "nltk_nopunctuationnumbers_tokens"):
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

6. Unigrams/Bigrams/Trigrams

In [None]:
# Compare n-gram ranges (1, 2) and (1, 3) on the NLTK tokens.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Each variant is stored under its own label. Keying by x_name alone
        # let the (1, 3) score overwrite the (1, 2) one, so "best" could never
        # name a variant.
        scores = {}
        for x_name in ["nltk_tokens"]:
            for ngram_range in [(1, 2), (1, 3)]:
                variant_name = f"({ngram_range[0]}_{ngram_range[1]})" + x_name
                pipeline = make_pipeline(CountVectorizer(lowercase=False, tokenizer=dummy_tokenize, ngram_range=ngram_range),  estimator)
                scores[variant_name] = evaluate(data, x_name, y_name, result_file, variant_name, model_name, pipeline, hyper_params)
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

7. TfIdf

In [None]:
# Compare term-frequency weighting with and without IDF on the NLTK tokens.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Each variant is stored under its own label. Keying by x_name alone
        # let the tf-idf score overwrite the tf one, so "best" could never name
        # a variant.
        scores = {}
        for x_name in ["nltk_tokens"]:
            for prefix, use_idf in [("tf", False), ("tfidf", True)]:
                variant_name = prefix + x_name
                pipeline = make_pipeline(CountVectorizer(lowercase=False, tokenizer=dummy_tokenize), TfidfTransformer(use_idf=use_idf), estimator)
                scores[variant_name] = evaluate(data, x_name, y_name, result_file, variant_name, model_name, pipeline, hyper_params)
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

8. Tfidf with bigrams

In [None]:
# Compare tf and tf-idf weighting on unigram+bigram counts of the NLTK tokens.
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Each variant is stored under its own label. Keying by x_name alone
        # let the tf-idf score overwrite the tf one, so "best" could never name
        # a variant.
        scores = {}
        for x_name in ["nltk_tokens"]:
            for prefix, use_idf in [("(1_2)tf", False), ("(1_2)tfidf", True)]:
                variant_name = prefix + x_name
                pipeline = make_pipeline(CountVectorizer(lowercase=False, tokenizer=dummy_tokenize, ngram_range=(1, 2)), TfidfTransformer(use_idf=use_idf), estimator)
                scores[variant_name] = evaluate(data, x_name, y_name, result_file, variant_name, model_name, pipeline, hyper_params)
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

### Per programming language analysis

Best: NLTK tokens, bigrams

In [26]:
# Scores of the per-programming-language runs below are appended to this file.
# NOTE(review): unlike the earlier "./results/bow_EN.csv", this path has no
# "./results/" prefix - confirm the file is meant to land in the working directory.
result_file = "bow_EN_per_language.csv"

In [29]:
from sklearn.metrics import f1_score


# Leave-one-programming-language-out evaluation: train on the comments of all
# other programming languages, test on the held-out one, and append the
# macro-F1 score to `result_file`.
for lang_name in ['C', 'C++', 'C#', 'Java', 'JavaScript', 'TypeScript', 'PHP', 'Python', 'SQL']:
    data_train = data[data.ProgrammingLanguageID != lang_name]
    data_test = data[data.ProgrammingLanguageID == lang_name]

    if data_test.empty:
        # No held-out rows to predict on, so there is nothing to score.
        print("Skipping", lang_name, "- no rows for this programming language")
        continue

    # The hyper-parameter grid is ignored here: each estimator is fitted with
    # the settings it was constructed with.
    for model_name, (estimator, _) in evaluation_params.items():
        for y_name in ["y8", "y6", "y2"]:
            for x_name in ["nltk_tokens"]:
                X = data_train[x_name]
                y = data_train[y_name]
                score_name = make_score_name(lang_name+"-(1_2)"+x_name, model_name, y.nunique())
                print("Evaluation ", score_name)

                pipeline = make_pipeline(CountVectorizer(lowercase=False, tokenizer=dummy_tokenize, ngram_range=(1, 2)),  estimator)
                pipeline.fit(X, y)

                X_test = data_test[x_name]
                y_test = data_test[y_name]
                # Use the slice computed above rather than re-indexing data_test.
                y_pred = pipeline.predict(X_test)
                score = f1_score(y_test, y_pred, average="macro")

                # Shared helper writes the same row the inline DataFrame/to_csv did.
                write_results(result_file, score_name, score)

Evaluation  C-(1_2)nltk_tokens-svm-8
Evaluation  C-(1_2)nltk_tokens-svm-6
Evaluation  C-(1_2)nltk_tokens-svm-2
Evaluation  C-(1_2)nltk_tokens-log-8
Evaluation  C-(1_2)nltk_tokens-log-6
Evaluation  C-(1_2)nltk_tokens-log-2
Evaluation  C-(1_2)nltk_tokens-mnb-8
Evaluation  C-(1_2)nltk_tokens-mnb-6
Evaluation  C-(1_2)nltk_tokens-mnb-2
Evaluation  C++-(1_2)nltk_tokens-svm-8
Evaluation  C++-(1_2)nltk_tokens-svm-6
Evaluation  C++-(1_2)nltk_tokens-svm-2
Evaluation  C++-(1_2)nltk_tokens-log-8
Evaluation  C++-(1_2)nltk_tokens-log-6
Evaluation  C++-(1_2)nltk_tokens-log-2
Evaluation  C++-(1_2)nltk_tokens-mnb-8
Evaluation  C++-(1_2)nltk_tokens-mnb-6
Evaluation  C++-(1_2)nltk_tokens-mnb-2
Evaluation  C#-(1_2)nltk_tokens-svm-8
Evaluation  C#-(1_2)nltk_tokens-svm-6
Evaluation  C#-(1_2)nltk_tokens-svm-2
Evaluation  C#-(1_2)nltk_tokens-log-8
Evaluation  C#-(1_2)nltk_tokens-log-6
Evaluation  C#-(1_2)nltk_tokens-log-2
Evaluation  C#-(1_2)nltk_tokens-mnb-8
Evaluation  C#-(1_2)nltk_tokens-mnb-6
Evaluation  