In [1]:
import re
from pathlib import Path
from string import punctuation

import classla
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize, WhitespaceTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm


### Read data

In [2]:
# Location of the annotated corpus; every column is read as pandas "string" dtype.
input_file = "./data/corpus.csv"
# Read from the variable above so the path is defined in exactly one place.
corpus = pd.read_csv(input_file, dtype="string")

In [3]:
# Preview the first rows of the loaded corpus.
corpus.head()

### SR data

In [4]:
# Restrict the corpus to rows whose NaturalLanguageID equals the selected language code.
language = "SR"
data = corpus[corpus["NaturalLanguageID"] == language]

In [5]:
# List the available columns of the language-filtered frame.
data.columns

Remove the rows whose `y8` label is `ide`.

In [6]:
# Drop rows labelled "ide" in y8. Take an explicit copy: later cells add token
# columns to `data`, and assigning into a filtered slice of `corpus` triggers
# pandas' SettingWithCopyWarning.
data = data[data.y8 != "ide"].copy()

### Write tokens

In [7]:
def write_tokens(name, data, lang=language):
    """Write a Series of token lists to ``data/tokens/{lang}_{name}.csv``.

    Each token list is joined with single spaces so that one row of the CSV
    holds one comment's tokens.

    Parameters
    ----------
    name : str
        Tokenization name; used in the output file name and in the log line.
    data : pandas.Series
        Series whose values are iterables of string tokens.
    lang : str
        Language code used as the file-name prefix. The default is the value
        of the module-level ``language`` at the time this function is defined.
    """
    output_file_name = f"data/tokens/{lang}_{name}.csv"
    # Create the output directory on first use instead of failing inside to_csv.
    Path(output_file_name).parent.mkdir(parents=True, exist_ok=True)

    print(f"Writing {name} {data.shape} to {output_file_name}.")
    data.apply(" ".join).to_csv(output_file_name, index=False)

### Evaluate

In [8]:
def write_results(result_file, score_name, score_value):
    """Append a single ``score_name, score_value`` row to the CSV ``result_file``.

    The row is written without header or index, and "," is used as the
    decimal separator for numeric values.
    """
    row = pd.DataFrame({"score_name": [score_name], "score_value": [score_value]})
    row.to_csv(result_file, mode="a", decimal=",", header=False, index=False)

In [9]:
def make_score_name(score_name, model_name, num_classes):
    """Build the result label ``<score_name>-<model_name>-<num_classes>``."""
    parts = (score_name, model_name, num_classes)
    return "-".join(str(part) for part in parts)

In [10]:
def evaluate(data, x_column_name, y_column_name, result_file, score_name, model_name, estimator, hyper_params):
    """Score ``estimator`` with nested cross-validation and log the mean macro-F1.

    A ``GridSearchCV`` over ``hyper_params`` (inner loop) is itself evaluated
    with ``cross_validate`` (outer loop). Both loops use a shuffled 10-fold
    ``StratifiedKFold`` with ``random_state=42`` and ``f1_macro`` scoring.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature and target columns.
    x_column_name, y_column_name : str
        Names of the feature column and the target column in ``data``.
    result_file : str
        CSV file that receives one appended ``score_name, score_value`` row.
    score_name, model_name : str
        Combined with the number of distinct target values into the row label.
    estimator
        Estimator (here: a pipeline) handed to ``GridSearchCV``.
    hyper_params : dict
        Parameter grid for ``GridSearchCV``.

    Returns
    -------
    float
        Mean of the outer-loop test scores.
    """
    print(estimator)

    X = data[x_column_name]
    y = data[y_column_name]

    full_score_name = make_score_name(score_name, model_name, y.nunique())
    print(f"--------Evaluating {full_score_name} --------")

    # Inner loop: hyper-parameter selection.
    inner_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    gs_estimator = GridSearchCV(
        estimator, hyper_params, scoring="f1_macro", cv=inner_cv, verbose=0, n_jobs=-1)

    # Outer loop: performance estimate of the tuned estimator.
    outer_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_validate(
        gs_estimator, X, y, scoring="f1_macro", cv=outer_cv, verbose=0, n_jobs=-1)
    mean_score = np.mean(scores["test_score"])

    # Reuse the shared helper so the result-file format is defined in one place.
    write_results(result_file, full_score_name, mean_score)

    return mean_score

### Models

In [11]:
# Each entry maps a short model name to (estimator, grid-search parameter grid).
# The grid keys use the step names that make_pipeline derives from the class names.
regularisation_grid = [0.001, 0.01, 0.1, 1, 10]

evaluation_params = {
    "svm": (
        LinearSVC(),
        {"linearsvc__C": list(regularisation_grid)},
    ),
    "log": (
        LogisticRegression(max_iter=800),
        {"logisticregression__C": list(regularisation_grid)},
    ),
    "mnb": (
        MultinomialNB(),
        {"multinomialnb__alpha": list(regularisation_grid)},
    ),
}

In [12]:
# CSV that the evaluation cells below append their scores to.
result_file = "./results/bow_SR.csv"

1. Tokenization

In [8]:
# Fetch the classla resources for 'sr' of type 'nonstandard' (needed by the pipeline below).
classla.download('sr', type='nonstandard')

In [9]:
# classla pipeline for 'sr' / 'nonstandard'; used by classla_tokenize on raw comment text.
sr_pipeline = classla.Pipeline("sr", type="nonstandard")

In [10]:
def classla_tokenize(comment):
    """Tokenize ``comment`` with the module-level classla ``sr_pipeline``.

    Returns the ``text`` of every word the pipeline yields. If the pipeline
    raises, the failure is reported on stdout and the comment is split on
    single spaces instead (best-effort fallback).
    """
    try:
        doc = sr_pipeline(comment)
        return [word.text for word in doc.iter_words()]
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops a long progress_apply run.
        print(f"SR tokenize ERROR for comment: {comment}")
        return comment.split(" ")

In [None]:
# Register progress_apply on pandas objects, then tokenize every comment with classla.
tqdm.pandas()
data["classla_tokens"] = data["Comment"].progress_apply(classla_tokenize)

In [None]:
# Persist the classla tokens (space-joined) to data/tokens/SR_classla.csv.
write_tokens("classla", data["classla_tokens"], "SR")

In [None]:
# One shared NLTK tokenizer instance that splits on whitespace.
whitespace_tokenizer = WhitespaceTokenizer()


def my_whitespace_tokenizer(comment):
    """Return the tokens NLTK's ``WhitespaceTokenizer`` produces for ``comment``."""
    tokens = whitespace_tokenizer.tokenize(comment)
    return tokens


data["whitespace_tokens"] = data["Comment"].apply(my_whitespace_tokenizer)

In [None]:
# Compiled once at definition time instead of on every call.
# Matches runs of two or more word characters delimited by word boundaries.
_WORD_TOKEN_PATTERN = re.compile(r"(?u)\b\w\w+\b")


def my_word_tokenizer(comment):
    """Return every run of two or more word characters found in ``comment``.

    Single-character words and punctuation are dropped; underscores and digits
    count as word characters.
    """
    return _WORD_TOKEN_PATTERN.findall(comment)

# Regex word tokens for every comment.
data["word_tokens"] = data["Comment"].apply(my_word_tokenizer)

In [12]:
def dummy_tokenize(tokens):
    """Identity tokenizer: hand already-tokenized input back unchanged.

    Passed as ``tokenizer=`` to ``CountVectorizer`` so that the token lists
    stored in the DataFrame are used as-is.
    """
    return tokens

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Score each tokenizer's column with the same bag-of-words pipeline.
        scores = {}
        for x_name in ["classla_tokens", "whitespace_tokens", "word_tokens"]:
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(
                data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

2. snake_case/CamelCase/both

In [None]:
def snake_case_tokenize(tokens):
    """Split every token on "_" and return the non-empty pieces in order."""
    return [piece for token in tokens for piece in token.split("_") if piece]

In [None]:
# snake_case-split variant of the classla tokens.
data["snake_classla_tokens"] = data["classla_tokens"].apply(snake_case_tokenize)

In [None]:
def camel_case_tokenize(tokens):
    """Split every token at each lowercase-to-uppercase transition.

    ``"camelCase"`` becomes ``["camel", "Case"]``; a run of capitals such as
    ``"HTTPServer"`` is left intact because no lowercase letter precedes an
    uppercase one. Empty tokens are dropped.

    If processing fails (for example ``tokens`` is not iterable), the problem
    is reported on stdout and ``tokens`` is returned unchanged.
    """
    try:
        output_tokens = []
        for token in tokens:
            if len(token) == 0:
                continue
            pieces = [str(token[0])]
            for c in token[1:]:
                # Start a new piece only when a lowercase char is followed by an uppercase one.
                if pieces[-1][-1].islower() and c.isupper():
                    pieces.append(str(c))
                else:
                    pieces[-1] += c

            # Every piece holds at least one character, so no empty-string filtering is needed.
            output_tokens.extend(pieces)

        return output_tokens
    except Exception:
        # Catch Exception rather than using a bare except so KeyboardInterrupt/SystemExit propagate.
        print("-------------- CAMEL CASE ERROR ------------")
        print(tokens)
        return tokens

In [None]:
# CamelCase-split variant of the classla tokens.
data["camel_classla_tokens"] = data["classla_tokens"].apply(camel_case_tokenize)

In [None]:
# Apply the CamelCase split on top of the snake_case-split tokens.
data["snake_camel_classla_tokens"] = data["snake_classla_tokens"].apply(camel_case_tokenize)

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Compare the snake_case, CamelCase and combined splitting variants.
        scores = {}
        for x_name in ["snake_classla_tokens", "camel_classla_tokens", "snake_camel_classla_tokens"]:
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(
                data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

3. Stemming/Lemmatization

In [None]:
# Load pre-computed stemmed tokens; the file is not produced anywhere in this notebook.
stemmed_df = pd.read_csv("data/tokens/SR_classla_stemmed.csv", header=0, dtype="string")

In [None]:
# Give the stemmed rows the same index labels as `data` so column assignment aligns.
# NOTE(review): assumes the CSV rows are in the same order as `data` — confirm.
stemmed_df.index = data.index

In [None]:
# Preview only: the joined frame is displayed, not stored.
data.join(stemmed_df).head()

In [None]:
# Turn each space-joined stemmed string back into a list of tokens.
stemmed_strings = stemmed_df["stemmed_classla_tokens"]
data["stemmed_classla_tokens"] = stemmed_strings.apply(lambda joined: joined.split(" "))

In [None]:
# Check that the stemmed token column was added.
data.head()

In [None]:
# Second classla pipeline, configured with tokenize_pretokenized=True; used by lemma_sr,
# which feeds it tokens that have already been produced and joined with spaces.
sr_pretokenized_pipeline = classla.Pipeline(
    "sr", type="nonstandard", tokenize_pretokenized=True)

In [None]:
def lemma_sr(tokens):
    """Lemmatize ``tokens`` with the module-level ``sr_pretokenized_pipeline``.

    The tokens are joined with single spaces, passed through the pipeline, and
    the ``lemma`` of every word it yields is returned. If the pipeline raises,
    the failure is reported on stdout and ``tokens`` is returned unchanged.
    """
    try:
        doc = sr_pretokenized_pipeline(" ".join(tokens))
        return [word.lemma for word in doc.iter_words()]
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) still stops a long progress_apply run.
        print(f"Lema SR error for tokens: {tokens}")
        return tokens

In [None]:
# Register progress_apply, then lemmatize every row's classla tokens.
tqdm.pandas()
data["lema_classla_tokens"] = data["classla_tokens"].progress_apply(lemma_sr)

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Compare lemmatized against stemmed tokens.
        scores = {}
        for x_name in ["lema_classla_tokens", "stemmed_classla_tokens"]:
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(
                data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

4. Lowercase

In [None]:
def lowercase_tokens(tokens):
    """Return a new list holding the lowercased form of every token."""
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered


data["classla_tokens_lower"] = data["classla_tokens"].apply(lowercase_tokens)

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Score the lowercased tokens (only one candidate column here).
        scores = {}
        for x_name in ["classla_tokens_lower"]:
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(
                data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

5. Remove punctuation/numbers/both

In [None]:
tqdm.pandas()

# Anything that is neither a word character nor whitespace counts as punctuation.
punctuation_pattern = re.compile(r"[^\w\s]")
# Runs of ASCII digits.
digits_pattern = re.compile(r"[0-9]+")

# Each variant blanks out the unwanted characters in the raw comment and then re-tokenizes it.
data["classla_nopunctuation_tokens"] = data["Comment"].progress_apply(
    lambda comment: classla_tokenize(punctuation_pattern.sub(" ", comment)))
data["classla_nonumbers_tokens"] = data["Comment"].progress_apply(
    lambda comment: classla_tokenize(digits_pattern.sub(" ", comment)))
data["classla_nopunctuationnumbers_tokens"] = data["Comment"].progress_apply(
    lambda comment: classla_tokenize(digits_pattern.sub(" ", punctuation_pattern.sub(" ", comment))))

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Compare removing punctuation, numbers, or both.
        scores = {}
        for x_name in ["classla_nopunctuation_tokens", "classla_nonumbers_tokens", "classla_nopunctuationnumbers_tokens"]:
            vectorizer = CountVectorizer(lowercase=False, tokenizer=dummy_tokenize)
            pipeline = make_pipeline(vectorizer, estimator)
            scores[x_name] = evaluate(
                data, x_name, y_name, result_file, x_name, model_name, pipeline, hyper_params)
        best_x_name = max(scores, key=scores.get)
        print(f"{model_name}-{y_name}-best-{best_x_name}")

6. Bigrams/Trigrams

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Compare unigram+bigram features against unigram+bigram+trigram features.
        scores = {}
        for x_name in ["classla_tokens"]:
            for ngram_range in [(1, 2), (1, 3)]:
                # Produces the labels "(1_2)<x_name>" and "(1_3)<x_name>" in the result file.
                variant_name = f"({ngram_range[0]}_{ngram_range[1]}){x_name}"
                pipeline = make_pipeline(
                    CountVectorizer(lowercase=False, tokenizer=dummy_tokenize, ngram_range=ngram_range),
                    estimator)
                # Key by variant so each n-gram range keeps its own score;
                # keying by x_name alone let the second run overwrite the first.
                scores[variant_name] = evaluate(
                    data, x_name, y_name, result_file, variant_name, model_name, pipeline, hyper_params)
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

7. Tf/Tfidf

In [None]:
for model_name, (estimator, hyper_params) in evaluation_params.items():
    for y_name in ["y8", "y6", "y2"]:
        # Compare term-frequency weighting against tf-idf weighting.
        scores = {}
        for x_name in ["classla_tokens"]:
            for prefix, use_idf in [("tf_", False), ("tfidf_", True)]:
                variant_name = prefix + x_name
                pipeline = make_pipeline(
                    CountVectorizer(lowercase=False, tokenizer=dummy_tokenize),
                    TfidfTransformer(use_idf=use_idf),
                    estimator)
                # Key by variant so tf and tf-idf each keep their own score;
                # keying by x_name alone let the tf-idf run overwrite the tf run.
                scores[variant_name] = evaluate(
                    data, x_name, y_name, result_file, variant_name, model_name, pipeline, hyper_params)
        print(f"{model_name}-{y_name}-best-{max(scores, key=scores.get)}")

### Per programming language analysis

Best: ReLDi/classla tokens, TF

In [18]:
# From here on, scores are appended to a separate per-programming-language result file.
result_file = "./results/bow_SR_per_language.csv"

In [13]:
# Reload the space-joined classla tokens that write_tokens("classla", ..., "SR") saved,
# so the slow classla tokenization does not have to be re-run.
classla_df = pd.read_csv("data/tokens/SR_classla.csv", header=0, dtype="string")
# Align on data's index labels.
# NOTE(review): assumes the CSV rows are in the same order as `data` — confirm.
classla_df.index = data.index
# The former bare `data.join(classla_df)` was removed: its result was discarded, and it
# raises ValueError (overlapping column, no suffix) once `data` already has "classla_tokens".
data["classla_tokens"] = classla_df["classla_tokens"].apply(lambda comment: comment.split(" "))
data.head()

In [19]:
# Leave-one-programming-language-out evaluation: for each language, tune and fit on all
# other languages' comments and report macro-F1 on the held-out language.
# (f1_score is imported with the other imports at the top of the notebook.)
for lang_name in ['C', 'C++', 'C#', 'Java', 'TypeScript', 'Python', 'SQL']:
    data_train = data[data.ProgrammingLanguageID != lang_name]
    data_test = data[data.ProgrammingLanguageID == lang_name]

    if data_test.empty:
        # No held-out rows for this language, so there is nothing to score.
        print(f"Skipping {lang_name}: no rows to test on.")
        continue

    for model_name, (estimator, hyper_params) in evaluation_params.items():
        for y_name in ["y8", "y6", "y2"]:
            for x_name in ["classla_tokens"]:
                X = data_train[x_name]
                y = data_train[y_name]
                score_name = make_score_name(lang_name+"-tf_"+x_name, model_name, y.nunique())
                print("Evaluation ", score_name)

                pipeline = make_pipeline(
                    CountVectorizer(lowercase=False, tokenizer=dummy_tokenize),
                    TfidfTransformer(use_idf=False),
                    estimator)
                gs_estimator = GridSearchCV(
                    pipeline, hyper_params, scoring="f1_macro",
                    cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42),
                    verbose=0, n_jobs=-1)
                gs_estimator.fit(X, y)

                y_pred = gs_estimator.predict(data_test[x_name])
                score = f1_score(data_test[y_name], y_pred, average="macro")

                # Same row format as every other result in this notebook.
                write_results(result_file, score_name, score)

## Top tokens

The 100 tokens with the largest absolute coefficients for svm and log, and the top 100 tokens for each class for mnb.

In [14]:
# Collect every output column first and build the DataFrame once at the end;
# inserting columns one at a time fragments the frame.
top_token_columns = {}

# model name -> (estimator, its step name inside the pipeline, grid-search parameters)
models = {"svm": (LinearSVC(max_iter=2000), "linearsvc", {"linearsvc__C": [0.001, 0.01, 0.1, 1, 10]}),
          "log": (LogisticRegression(max_iter=800), "logisticregression", {"logisticregression__C": [0.001, 0.01, 0.1, 1, 10]}),
          "mnb": (MultinomialNB(), "multinomialnb", {"multinomialnb__alpha": [0.001, 0.01, 0.1, 1, 10]})}

for model_name, (estimator, model_name_in_pipeline, hyper_params) in models.items():
    for y_name in ["y8", "y6", "y2"]:
        for x_name in ["classla_tokens"]:
            X = data[x_name]
            y = data[y_name]

            pipeline = make_pipeline(CountVectorizer(lowercase=False, tokenizer=dummy_tokenize), TfidfTransformer(use_idf=False), estimator)
            gs_estimator = GridSearchCV(
                pipeline, hyper_params, scoring="f1_macro", cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42), verbose=0, n_jobs=-1)
            gs_estimator.fit(X, y)

            best_pipeline = gs_estimator.best_estimator_
            est = best_pipeline[model_name_in_pipeline]
            # Fetch the vocabulary array once per fit instead of once per selected token.
            feature_names = best_pipeline['countvectorizer'].get_feature_names_out()

            col_name = f"{model_name}_{y_name}"
            if model_name == "mnb":
                # One pair of columns per class: the 100 tokens with the highest log-probability.
                for row, cls in zip(est.feature_log_prob_, est.classes_):
                    top_idx = row.argsort()[::-1][:100]
                    top_token_columns[f"{col_name}_{cls}_W"] = row[top_idx]
                    top_token_columns[f"{col_name}_{cls}_T"] = feature_names[top_idx]
            else:
                # NOTE(review): only the first row of coef_ is inspected. For targets with more
                # than two classes coef_ holds one row per class, so this ranks tokens for the
                # first class only — confirm that is intended.
                coefs = est.coef_[0]
                top_idx = np.absolute(coefs).argsort()[::-1][:100]
                top_token_columns[f"{col_name}_W"] = coefs[top_idx]
                top_token_columns[f"{col_name}_T"] = feature_names[top_idx]

            print(f"Done {col_name}")

top_tokens = pd.DataFrame(top_token_columns)
# NOTE(review): mode="a" appends, so re-running this cell adds a second header + table to the
# same file — delete the file first or switch to mode="w" if that is not wanted.
top_tokens.to_csv("./results/bow_SR_top_tokens.csv", index=False, mode="a")