In [25]:
import os
from pprint import pprint
from typing import Dict, List, Union

import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk import ne_chunk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.stats import describe
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine

from scripts.issues_utils import merge_issues_comments
from scripts.stats_utils import calculate_four_moments
from scripts.lda_utils import (
    clean_text, 
    calculate_perplexities, 
    extract_dominant_topics, 
    create_cat_dataframe,
    perform_grid_search,
    generate_wordcloud
)

%matplotlib inline
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Fetch the NLTK resources this notebook depends on; nltk.download reports
# packages that are already present as up-to-date instead of re-downloading.
for nltk_resource in ("stopwords", "punkt", "wordnet", "omw-1.4"):
    nltk.download(nltk_resource)

# English stop words as a set (deduplicated, fast membership tests).
english_stopwords = nltk.corpus.stopwords.words("english")
stop_words = set(english_stopwords)

[nltk_data] Downloading package stopwords to /home/elang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/elang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/elang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
# Read the connection URL from the environment so real credentials never have
# to be committed with the notebook. The fallback is the original local
# development database, so existing setups keep working unchanged.
DATABASE_URL = os.environ.get(
    "HF_DATABASE_URL",
    "postgresql://root:password@localhost:5435/hf",
)

# NOTE: despite its name, `conn` holds whatever create_engine returns (a
# SQLAlchemy engine), which is what the pd.read_sql_query calls below are
# given via `con=`.
conn = create_engine(DATABASE_URL)

In [14]:
# Issues joined to their repository row, restricted to one repository_type.
# A single template keeps the three queries identical apart from the filter
# value; previously one query filtered with AND inside the ON clause and
# another spelled the keyword "wHERE". For an INNER JOIN the ON/AND and WHERE
# forms return the same rows, so the result sets are unchanged.
# NOTE(review): SELECT * returns the columns of both joined tables, so any
# column name shared by `issues` and `github_repositories` appears twice in the
# resulting frame — confirm downstream code expects that.
ISSUES_BY_REPOSITORY_TYPE_SQL = """
    SELECT *
    FROM issues AS i
    INNER JOIN github_repositories AS g ON
        i.github_repository_id = g.id
    WHERE
        g.repository_type = {repository_type:d}
"""

# repository_type 1, 2 and 3 feed the "software", "hf" and "ml" frames
# respectively.
df_issues_software = pd.read_sql_query(
    ISSUES_BY_REPOSITORY_TYPE_SQL.format(repository_type=1), con=conn
)

df_issues_hf = pd.read_sql_query(
    ISSUES_BY_REPOSITORY_TYPE_SQL.format(repository_type=2), con=conn
)

df_issues_ml = pd.read_sql_query(
    ISSUES_BY_REPOSITORY_TYPE_SQL.format(repository_type=3), con=conn
)


In [20]:
# Issue comments joined to their parent issue and that issue's repository,
# filtered by repository_type. The template reproduces the original query text
# exactly once the placeholder is filled in.
COMMENTS_BY_REPOSITORY_TYPE_SQL = """
    SELECT * 
    FROM issue_comments AS c
    INNER JOIN issues AS i ON
        i.id = c.issue_id
    INNER JOIN github_repositories AS g ON
        i.github_repository_id = g.id
    WHERE 
        g.repository_type = {repository_type}
"""

# repository_type 1, 2 and 3 feed the "software", "hf" and "ml" frames
# respectively.
software_comments_sql = COMMENTS_BY_REPOSITORY_TYPE_SQL.format(repository_type=1)
df_comments_software = pd.read_sql_query(software_comments_sql, con=conn)

hf_comments_sql = COMMENTS_BY_REPOSITORY_TYPE_SQL.format(repository_type=2)
df_comments_hf = pd.read_sql_query(hf_comments_sql, con=conn)

ml_comments_sql = COMMENTS_BY_REPOSITORY_TYPE_SQL.format(repository_type=3)
df_comments_ml = pd.read_sql_query(ml_comments_sql, con=conn)

In [33]:
# Combine each group's issues frame with its comments frame via the project
# helper, in the order software, hf, ml.
docs_software, docs_hf, docs_ml = (
    merge_issues_comments(issues_frame, comments_frame)
    for issues_frame, comments_frame in (
        (df_issues_software, df_comments_software),
        (df_issues_hf, df_comments_hf),
        (df_issues_ml, df_comments_ml),
    )
)

In [34]:
# scikit-learn validates `stop_words` as "english", a list or None, so the
# NLTK stop-word set is converted to a list first (sorted, so the argument is
# deterministic across runs). Passing the set directly is rejected by releases
# that enforce parameter validation.
stop_word_list = sorted(stop_words)

# One vectorizer per corpus: each is fitted independently and therefore learns
# its own vocabulary, capped at 1000 features.
vect_software = TfidfVectorizer(stop_words=stop_word_list, max_features=1000)
vect_hf = TfidfVectorizer(stop_words=stop_word_list, max_features=1000)
vect_ml = TfidfVectorizer(stop_words=stop_word_list, max_features=1000)

# TF-IDF matrices built from the "document" column of each merged frame.
vect_text_software = vect_software.fit_transform(docs_software["document"])
vect_text_hf = vect_hf.fit_transform(docs_hf["document"])
vect_text_ml = vect_ml.fit_transform(docs_ml["document"])

In [None]:
# Feature names (the learned vocabulary) of each fitted vectorizer, in the
# order software, hf, ml.
vocab_software, vocab_hf, vocab_ml = (
    fitted_vectorizer.get_feature_names_out()
    for fitted_vectorizer in (vect_software, vect_hf, vect_ml)
)

In [None]:
# Hyper-parameter grid shared by all three corpora: 7 topic counts x 3
# learning-decay values.
search_params = {"n_components": [2, 3, 4, 5, 10, 15, 20], "learning_decay": [0.5, 0.7, 0.9]}

# NOTE(review): perform_grid_search is a project helper; the attributes read
# from its result in this notebook (best_score_, cv_results_) match a fitted
# scikit-learn search object — confirm against scripts/lda_utils.
model_software = perform_grid_search(search_params, vect_text_software)
model_hf = perform_grid_search(search_params, vect_text_hf)
model_ml = perform_grid_search(search_params, vect_text_ml)

# The fitted attribute is `best_params_` (plural); `best_param_` does not exist
# on scikit-learn search objects. Each line reports the model named in its
# label (the Software score previously printed the HF model's value, and the HF
# score was labelled "Software").
print(f"Best Parameters Software: {model_software.best_params_}")
print(f"Best Log Likelihood Software: {model_software.best_score_}")

print(f"Best Parameters HF: {model_hf.best_params_}")
print(f"Best Log Likelihood HF: {model_hf.best_score_}")

print(f"Best Parameters ML: {model_ml.best_params_}")
print(f"Best Log Likelihood ML: {model_ml.best_score_}")

In [None]:
# Run the project's perplexity helper over the same parameter grid for each
# TF-IDF matrix, in the order software, hf, ml.
models_software, models_hf, models_ml = (
    calculate_perplexities(search_params, tfidf_matrix)
    for tfidf_matrix in (vect_text_software, vect_text_hf, vect_text_ml)
)

In [None]:
# Pull the "perplexity" entry out of every record returned by
# calculate_perplexities.
perplexities_software = [record["perplexity"] for record in models_software]
perplexities_hf = [record["perplexity"] for record in models_hf]
perplexities_ml = [record["perplexity"] for record in models_ml]

# Grid-search results as a frame, with the perplexities appended as an extra
# column.
# NOTE(review): this assumes calculate_perplexities yields one record per grid
# point in the same order as cv_results_ — confirm in scripts/lda_utils. A
# length mismatch raises ValueError; an ordering mismatch would go unnoticed.
results_software = pd.DataFrame(model_software.cv_results_).assign(
    perplexity=perplexities_software
)
results_hf = pd.DataFrame(model_hf.cv_results_).assign(
    perplexity=perplexities_hf
)
results_ml = pd.DataFrame(model_ml.cv_results_).assign(
    perplexity=perplexities_ml
)

In [None]:
current_palette = sns.color_palette("tab10", 3)
fontsize = 16
pad = 20


fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(18, 8))