In [10]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import nltk
import seaborn as sns

from nltk.corpus import stopwords
from pprint import pprint
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sqlalchemy import create_engine
from scipy.stats import describe
from typing import Dict, List, Union

from scripts.stats_utils import calculate_four_moments
from scripts.lda_utils import (
    clean_text, 
    calculate_perplexities, 
    extract_dominant_topics, 
    create_cat_dataframe,
    perform_grid_search,
    generate_wordcloud
)

%matplotlib inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [11]:
# Fetch the NLTK resources used by this notebook, one after another in the
# same order as before.
for resource in ("stopwords", "punkt", "wordnet", "omw-1.4"):
    nltk.download(resource)

# English stop words from the NLTK corpus, collected into a set.
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /home/elang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/elang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/elang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [12]:
import os

# The connection URL is taken from the HF_DATABASE_URL environment variable so
# that real credentials never need to be committed with the notebook. The
# fallback is the original local development URL, kept so existing setups
# continue to work unchanged.
database_url = os.environ.get(
    "HF_DATABASE_URL", "postgresql://root:password@localhost:5434/hf"
)

# `conn` is the object returned by sqlalchemy.create_engine; it is handed to
# pd.read_sql_query by the data-loading cell.
conn = create_engine(database_url)

In [13]:
# Query shared by the three corpora; only the repository list in the WHERE
# clause differs between them.
_COMMIT_QUERY = """SELECT DISTINCT ON (h.commit_hash)  h.commit_message, h.commit_timestamp FROM hf_commits AS h 
    INNER JOIN events AS e ON h.id = e.commit_id 
    INNER JOIN hf_repositories AS r ON r.id = e.repository_id 
    WHERE r.repository_name in ({repos})"""


def _load_commits(repository_names):
    """Return one row per commit hash (message and timestamp) for the given repositories.

    The names are fixed constants of this notebook and are rendered as a quoted,
    comma-separated SQL list before the query is sent through `conn`.
    """
    quoted_names = ", ".join(f"'{name}'" for name in repository_names)
    return pd.read_sql_query(_COMMIT_QUERY.format(repos=quoted_names), conn)


# Hugging Face repositories.
df_hf = _load_commits(("tokenizers", "transformers", "datasets"))

# General software repositories.
df_software = _load_commits(("pyodide", "papercups", "nalgebra"))

# Machine-learning tooling repositories.
df_ml = _load_commits(("tfx", "torchx", "mlflow"))

In [14]:
# Run clean_text over every commit message, overwriting the column in place.
# NOTE(review): re-running this cell applies clean_text to already-cleaned text.
for commits in (df_hf, df_software, df_ml):
    commits["commit_message"] = commits["commit_message"].apply(clean_text)

In [16]:
# scikit-learn documents `stop_words` as 'english', a list, or None, and recent
# releases validate the argument type. The NLTK stop-word set is therefore
# converted once into a sorted (deterministic) list shared by all vectorizers.
stop_word_list = sorted(stop_words)

# One vectorizer per corpus so each learns its own vocabulary, capped at the
# 1000 features selected by max_features.
vect_hf = TfidfVectorizer(stop_words=stop_word_list, max_features=1000)
vect_hf_text = vect_hf.fit_transform(df_hf["commit_message"])

vect_software = TfidfVectorizer(stop_words=stop_word_list, max_features=1000)
vect_software_text = vect_software.fit_transform(df_software["commit_message"])

vect_ml = TfidfVectorizer(stop_words=stop_word_list, max_features=1000)
vect_ml_text = vect_ml.fit_transform(df_ml["commit_message"])

In [17]:
# Feature names learned by each fitted vectorizer, in corpus order
# (Hugging Face, Software, ML).
vocab_hf, vocab_software, vocab_ml = (
    vectorizer.get_feature_names_out()
    for vectorizer in (vect_hf, vect_software, vect_ml)
)

In [18]:
# Hyper-parameter grid shared by all three corpora (Hugging Face, Software, ML).
search_params = {
    "n_components": [2, 3, 4, 5, 10, 15, 20],
    "learning_decay": [0.5, 0.7, 0.9],
}

# All three searches run before anything is reported.
model_hf = perform_grid_search(search_params, vect_hf_text)
model_software = perform_grid_search(search_params, vect_software_text)
model_ml = perform_grid_search(search_params, vect_ml_text)

# Report the winning parameters and score for each corpus.
for label, search in (("HF", model_hf), ("Software", model_software), ("ML", model_ml)):
    print(f"Best {label} Parameters: {search.best_params_}")
    print(f"Best {label} Log Likelihood Score: {search.best_score_}")

KeyboardInterrupt: 

In [None]:
# Perplexity entries for the Hugging Face corpus, ordered by learning decay.
# Each entry is indexed by "param_learning_decay" here and by "perplexity" later.
models_hf = sorted(
    calculate_perplexities(search_params, vect_hf_text),
    key=lambda entry: entry["param_learning_decay"],
)

In [None]:
# Attach the perplexities to the Hugging Face grid-search results.
perplexities = [entry["perplexity"] for entry in models_hf]

# Fixed: this previously read `model.cv_results_`, but no name `model` is
# defined by the cells above, so a fresh-kernel run raised NameError. The
# Hugging Face search object is `model_hf`.
results_hf = pd.DataFrame(model_hf.cv_results_)
results_hf["perplexity"] = perplexities

In [None]:
# Perplexity entries for the Software corpus, ordered by learning decay.
# Fixed: the perplexities were computed on `vect_hf_text` (copy-paste from the
# Hugging Face cell), so the "software" numbers actually described the
# Hugging Face corpus.
models_software = calculate_perplexities(search_params, vect_software_text)
models_software = sorted(models_software, key=lambda x: x["param_learning_decay"])

In [None]:
# Attach the perplexities to the Software grid-search results.
perplexities = [entry["perplexity"] for entry in models_software]

# Fixed: this previously read `model.cv_results_`, but no name `model` is
# defined by the cells above, so a fresh-kernel run raised NameError. The
# Software search object is `model_software`.
results_software = pd.DataFrame(model_software.cv_results_)
results_software["perplexity"] = perplexities

In [None]:
# Perplexity entries for the ML corpus, ordered by learning decay.
models_ml = sorted(
    calculate_perplexities(search_params, vect_ml_text),
    key=lambda entry: entry["param_learning_decay"],
)

In [None]:
# For each corpus: pull the perplexity out of every entry, then build the
# grid-search results frame with that list attached as a "perplexity" column.
# NOTE(review): this relies on the entries being in the same order as the rows
# of cv_results_ -- confirm against calculate_perplexities.
perplexities_software = [entry["perplexity"] for entry in models_software]
results_software = pd.DataFrame(model_software.cv_results_).assign(
    perplexity=perplexities_software
)

perplexities_hf = [entry["perplexity"] for entry in models_hf]
results_hf = pd.DataFrame(model_hf.cv_results_).assign(
    perplexity=perplexities_hf
)

perplexities_ml = [entry["perplexity"] for entry in models_ml]
results_ml = pd.DataFrame(model_ml.cv_results_).assign(
    perplexity=perplexities_ml
)

In [None]:
# Shared styling for the figure: three palette colours, one font size, one padding.
current_palette = sns.color_palette("tab10", 3)
fontsize = 16
pad = 20

fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(18, 8))

# (axes, y column, title, y-axis label) for the left and right panels.
panels = (
    (ax1, "mean_test_score", "LDA Model Selection\nHugging Face (Log Likelihood Scores)", "Mean Test Score"),
    (ax2, "perplexity", "LDA Model Selection\nHugging Face(Perplexity Scores)", "Perplexity"),
)

for axis, metric, title, y_label in panels:
    # One line per learning-decay value across the tested component counts.
    sns.lineplot(
        data=results_hf,
        x="param_n_components",
        y=metric,
        hue="param_learning_decay",
        palette=current_palette,
        marker="o",
        markersize=8,
        linewidth=2,
        ax=axis,
    )
    axis.set_title(title, pad=pad, fontsize=fontsize)
    # NOTE(review): the hue column is param_learning_decay while the legend
    # title reads "Learning Rate" -- confirm the intended wording.
    axis.legend(title="Learning Rate", fancybox=True)
    axis.set_xlabel("Number of Components", fontsize=fontsize, labelpad=pad)
    axis.set_ylabel(y_label, fontsize=fontsize, labelpad=pad)
    axis.tick_params(axis="both", labelsize=fontsize)

fig.savefig("plots/RQ3/RQ3_ModelSelectionHF.png", dpi=400, bbox_inches="tight")

In [None]:
# Shared styling for the figure: three palette colours, one font size, one padding.
current_palette = sns.color_palette("tab10", 3)
fontsize = 16
pad = 20

fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(18, 8))

# (axes, y column, title, y-axis label) for the left and right panels.
panels = (
    (ax1, "mean_test_score", "LDA Model Selection\nSoftware (Log Likelihood Scores)", "Mean Test Score"),
    (ax2, "perplexity", "LDA Model Selection\nSoftware (Perplexity Scores)", "Perplexity"),
)

for axis, metric, title, y_label in panels:
    # One line per learning-decay value across the tested component counts.
    sns.lineplot(
        data=results_software,
        x="param_n_components",
        y=metric,
        hue="param_learning_decay",
        palette=current_palette,
        marker="o",
        markersize=8,
        linewidth=2,
        ax=axis,
    )
    axis.set_title(title, pad=pad, fontsize=fontsize)
    # NOTE(review): the hue column is param_learning_decay while the legend
    # title reads "Learning Rate" -- confirm the intended wording.
    axis.legend(title="Learning Rate", fancybox=True)
    axis.set_xlabel("Number of Components", fontsize=fontsize, labelpad=pad)
    axis.set_ylabel(y_label, fontsize=fontsize, labelpad=pad)
    axis.tick_params(axis="both", labelsize=fontsize)

fig.savefig("plots/RQ3/RQ3_ModelSelectionSoftware.png", dpi=400, bbox_inches="tight")

In [None]:
# Shared styling for the figure: three palette colours, one font size, one padding.
current_palette = sns.color_palette("tab10", 3)
fontsize = 16
pad = 20

fig, (ax1, ax2) = plt.subplots(ncols=2, nrows=1, figsize=(18, 8))

# (axes, y column, title, y-axis label) for the left and right panels.
panels = (
    (ax1, "mean_test_score", "LDA Model Selection\nML (Log Likelihood Scores)", "Mean Test Score"),
    (ax2, "perplexity", "LDA Model Selection\nML(Perplexity Scores)", "Perplexity"),
)

for axis, metric, title, y_label in panels:
    # One line per learning-decay value across the tested component counts.
    sns.lineplot(
        data=results_ml,
        x="param_n_components",
        y=metric,
        hue="param_learning_decay",
        palette=current_palette,
        marker="o",
        markersize=8,
        linewidth=2,
        ax=axis,
    )
    axis.set_title(title, pad=pad, fontsize=fontsize)
    # NOTE(review): the hue column is param_learning_decay while the legend
    # title reads "Learning Rate" -- confirm the intended wording.
    axis.legend(title="Learning Rate", fancybox=True)
    axis.set_xlabel("Number of Components", fontsize=fontsize, labelpad=pad)
    axis.set_ylabel(y_label, fontsize=fontsize, labelpad=pad)
    axis.tick_params(axis="both", labelsize=fontsize)

fig.savefig("plots/RQ3/RQ3_ModelSelectionML.png", dpi=400, bbox_inches="tight")

In [None]:
# Settings for the final Hugging Face LDA model; random_state is fixed at 42.
# NOTE(review): n_components / learning_decay are hard-coded here -- presumably
# taken from the grid search above; confirm they match its best_params_.
lda_settings_hf = dict(
    n_components=2,
    learning_method="online",
    max_iter=10,
    n_jobs=1,
    evaluate_every=1,
    learning_decay=0.9,
    random_state=42,
)
lda_best_model_hf = LatentDirichletAllocation(**lda_settings_hf)

# Fit on the Hugging Face TF-IDF matrix and keep what fit_transform returns.
lda_best_topics_hf = lda_best_model_hf.fit_transform(vect_hf_text)

In [None]:
# Final LDA model for the Software corpus; random_state is fixed at 42.
lda_best_model_software = LatentDirichletAllocation(
    n_components=2,
    learning_method="online",
    max_iter=10,
    n_jobs=1,
    evaluate_every=1,
    learning_decay=0.9,
    random_state=42
)

# Fixed: this previously called `lda_best_model_hf.fit_transform(...)`, which
# refitted the Hugging Face model on software data and left
# `lda_best_model_software` unfitted. Fit the model defined in this cell.
lda_best_topics_software = lda_best_model_software.fit_transform(vect_software_text)

In [None]:
# Final LDA model for the ML corpus; random_state is fixed at 42.
lda_best_model_ml = LatentDirichletAllocation(
    n_components=2,
    learning_method="online",
    max_iter=10,
    n_jobs=1,
    evaluate_every=1,
    learning_decay=0.9,
    random_state=42
)

# Fixed: this previously called `lda_best_model_hf.fit_transform(...)`, which
# refitted the Hugging Face model on ML data and left `lda_best_model_ml`
# unfitted. Fit the model defined in this cell.
lda_best_topics_ml = lda_best_model_ml.fit_transform(vect_ml_text)