# Histograms of similarities and transformed similarities

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from consts import MID_CALC_PATH, DATA_PATH
import os

# Raw and transformed similarity tables are read from MID_CALC_PATH.
# The file names are listed in the same order as the variables they are
# unpacked into, so each table is still read from its own CSV.
(
    code_questions_similarity,
    code_questions_transformed_similarity,
    open_questions_similarity,
    open_questions_transformed_similarity,
) = (
    pd.read_csv(os.path.join(MID_CALC_PATH, csv_name))
    for csv_name in (
        "code_questions_similarity.csv",
        "code_questions_transformed_similarity.csv",
        "open_questions_similarity.csv",
        "open_questions_transformed_similarity.csv",
    )
)

# Heuristic-score tables are read from DATA_PATH.
code_questions_heuristics, open_questions_heuristics = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ("top_code_questions.csv", "top_open_questions.csv")
)

In [0]:
import numpy as np

# Pool of every code-question score: raw similarity, transformed similarity
# and heuristic score, in that order.
all_code_data = np.concatenate([
    code_questions_similarity["similarity"],
    code_questions_transformed_similarity["similarity"],
    code_questions_heuristics["heuristic_score"],
])

# Same pooling for the open questions.
all_open_data = np.concatenate([
    open_questions_similarity["similarity"],
    open_questions_transformed_similarity["similarity"],
    open_questions_heuristics["heuristic_score"],
])

# Every score from both question types (code scores first, then open scores).
all_data = np.concatenate([all_code_data, all_open_data])

# Global extremes over all scores; the plotting cells pass these to plt.xlim
# so that every figure uses the same x-axis range.
min_value = all_data.min()
max_value = all_data.max()

print("Min value:", min_value)
print("Max value:", max_value)

# Histogram of code questions similarities (transformed and not)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pool the raw and transformed code-question similarities so that both
# histograms are drawn on one shared set of 20 bin edges.
all_code_data_similarities = np.concatenate([
    code_questions_similarity["similarity"],
    code_questions_transformed_similarity["similarity"],
])

bins = np.histogram_bin_edges(all_code_data_similarities, bins=20)  # Use the same bins for both

plt.figure(figsize=(10, 6))

# Plot both histograms with shared bins.
# stat='frequency' is set explicitly so the y-axis uses the same statistic as
# the open-questions similarity histogram and the heuristic-score histograms,
# which all pass stat='frequency' under the same "Frequency" axis label.
# Without it seaborn falls back to raw counts and the figures are not
# directly comparable.
sns.histplot(code_questions_similarity["similarity"], bins=bins, kde=True, stat='frequency', color="blue", alpha=0.5, label="Code Questions Similarity")
sns.histplot(code_questions_transformed_similarity["similarity"], bins=bins, kde=True, stat='frequency', color="red", alpha=0.5, label="Transformed Code Questions Similarity")

plt.xlabel("Similarity Score", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.title("Histogram of Code Questions Similarity Scores", fontsize=14)
plt.legend()
# min_value / max_value are computed in an earlier cell over all scores, so
# every figure shares one x-axis range.
plt.xlim(min_value, max_value)
plt.show()

# Histogram of open questions similarities (transformed and not)

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Pool the raw and transformed open-question similarities so that one set of
# 20 bin edges can be shared by both histograms.
all_open_data_similarities = np.concatenate([
    open_questions_similarity["similarity"],
    open_questions_transformed_similarity["similarity"],
])
bins = np.histogram_bin_edges(all_open_data_similarities, bins=20)

plt.figure(figsize=(10, 6))

# Draw the raw scores first, then the transformed ones, on the same axes.
for scores, colour, legend_label in (
    (open_questions_similarity["similarity"], "blue", "Open Questions Similarity"),
    (open_questions_transformed_similarity["similarity"], "red", "Transformed Open Questions Similarity"),
):
    sns.histplot(
        scores,
        bins=bins,
        kde=True,
        stat='frequency',
        color=colour,
        alpha=0.5,
        label=legend_label,
    )

plt.title("Histogram of Open Questions Similarity Scores", fontsize=14)
plt.xlabel("Similarity Score", fontsize=12)
plt.ylabel("Frequency", fontsize=12)
plt.legend()
# Same x-axis range as the other figures (limits come from an earlier cell).
plt.xlim(min_value, max_value)
plt.show()

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from consts import DATA_PATH
import os

# Read the code- and open-question heuristic tables from DATA_PATH, in that
# order, and bind them to their respective names.
code_questions_heuristics, open_questions_heuristics = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ("top_code_questions.csv", "top_open_questions.csv")
)

def historgram(df_pandas, col, title, *, score_column="heuristic_score", bins=20):
    """Show a frequency histogram, with a KDE overlay, of one score column.

    Args:
        df_pandas: Table holding the scores; it is indexed with
            ``score_column`` (assumed to be a pandas DataFrame, as the
            parameter name suggests).
        col: Colour passed to ``sns.histplot`` for the bars and KDE curve.
        title: Text inserted into the figure title,
            ``"Histogram of {title} Heuristic Scores"``.
        score_column: Name of the column to plot. Defaults to
            ``"heuristic_score"``, the column this function always used.
        bins: Bin specification forwarded to ``sns.histplot``. Defaults to 20.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Look the column up before opening a figure, so that a missing column
    # fails immediately instead of leaving an empty figure behind.
    scores = df_pandas[score_column]

    # Plot the histogram
    plt.figure(figsize=(10, 6))
    sns.histplot(scores, bins=bins, kde=True, stat='frequency', color=col)

    # Labels and title
    plt.xlabel("Average Job Similarity Score", fontsize=12)
    plt.ylabel("Frequency", fontsize=12)
    plt.title(f"Histogram of {title} Heuristic Scores", fontsize=14)

    # Show the plot
    plt.show()

# One heuristic-score histogram per question type, code questions first.
for heuristics_table, question_kind in (
    (code_questions_heuristics, 'Code Questions'),
    (open_questions_heuristics, 'Open Questions'),
):
    historgram(heuristics_table, 'deepskyblue', question_kind)


# Code data similarities & heuristic histograms

In [0]:
# One set of 20 bin edges computed over all three code-question score columns.
code_bins = np.histogram_bin_edges(all_code_data, bins=20)

plt.figure(figsize=(10, 6))

# Options common to every layer: shared bins, KDE overlay, density
# normalisation and half-transparent bars so overlapping layers stay visible.
code_layer_options = dict(bins=code_bins, kde=True, stat="density", alpha=0.5)

sns.histplot(
    code_questions_similarity["similarity"],
    color="blue",
    label="Code Questions Similarity",
    **code_layer_options,
)
sns.histplot(
    code_questions_transformed_similarity["similarity"],
    color="purple",
    label="Transformed Code Questions Similarity",
    **code_layer_options,
)
sns.histplot(
    code_questions_heuristics["heuristic_score"],
    color="deepskyblue",
    label="Heuristic Score Code Questions ",
    **code_layer_options,
)

# Title, axis labels and legend.
plt.title("Histogram of Code Questions Similarity & Heuristic Scores", fontsize=14)
plt.xlabel("Score", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.legend()

# Same x-axis range as the other figures (limits come from an earlier cell).
plt.xlim(min_value, max_value)

plt.show()

# Open data similarities & heuristic histograms

In [0]:
# Combined density histogram for the open questions.

# One set of 20 bin edges computed over all three open-question score columns.
open_bins = np.histogram_bin_edges(all_open_data, bins=20)

plt.figure(figsize=(10, 6))

# Each entry is (scores, bar colour, legend label); layers are drawn in order.
open_layers = (
    (open_questions_similarity["similarity"], "blue", "Open Questions Similarity"),
    (open_questions_transformed_similarity["similarity"], "purple", "Transformed Open Questions Similarity"),
    (open_questions_heuristics["heuristic_score"], "deepskyblue", "Heuristic Score Open Questions"),
)
for layer_scores, layer_colour, layer_label in open_layers:
    sns.histplot(
        layer_scores,
        bins=open_bins,
        kde=True,
        stat="density",
        color=layer_colour,
        alpha=0.5,
        label=layer_label,
    )

# Title, axis labels and legend.
plt.title("Histogram of Open Questions Similarity & Heuristic Scores", fontsize=14)
plt.xlabel("Score", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.legend()

# Same x-axis range as the other figures (limits come from an earlier cell).
plt.xlim(min_value, max_value)

plt.show()


In [0]:
import pandas as pd
import os
from consts import DATA_PATH

# Load both question tables from DATA_PATH and drop fully duplicated rows.
code_questions = pd.read_csv(os.path.join(DATA_PATH, "top_code_questions.csv")).drop_duplicates()
open_questions = pd.read_csv(os.path.join(DATA_PATH, "top_open_questions.csv")).drop_duplicates()

# pandas DataFrames have no .display() method, so the original
# `code_questions.display()` raised AttributeError. Use the notebook's
# display() helper instead; the notebook environment provides it without an
# import, and it renders a pandas DataFrame as a table.
display(code_questions)