<div class='alert alert-box alert-info'>
    <h1>Results</h1>
    <p>The code cells below will generate a <b>.csv</b> file with the metrics of the organizations in <b>codesamples.csv</b>.</p>
</div>

In [25]:
%pip install pandas matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
def calculate_language_usage(dataframe):
    language_usage = {}

    for repo in dataframe.itertuples():
        repo_languages = repo.langs_percentage
        total_lines = repo.total_lines

        for language, percentage in repo_languages.items():
            lines = total_lines * (float(percentage.strip('%')) / 100)
            language_usage[language] = language_usage.get(language, 0) + lines

    total_lines_of_code = sum(language_usage.values())

    formatted_languages = sorted(
        [(language, f'{(lines / total_lines_of_code):.2%}', int(lines)) for language, lines in language_usage.items()],
        key=lambda item: float(item[1].strip('%')),
        reverse=True
    )

    file_path = f"results/languages_usage.csv"
    dataframeLanguages = pd.DataFrame(formatted_languages, columns=["Language", "Usage Percentage", "Lines"])
    dataframeLanguages.to_csv(file_path, index=False)
    return dataframeLanguages


In [28]:
def generate_statistics(dataframe):
    numeric_metrics = dataframe[METRICS].select_dtypes(include='number')
    
    statistics = numeric_metrics.describe().transpose()
    
    statistics['median'] = numeric_metrics.median()
    statistics['mode'] = numeric_metrics.mode().iloc[0]
    
    file_path = f"results/statistics.csv"
    statistics.to_csv(file_path)
    
    return statistics

In [29]:
def plot_metrics_distribution(metricts):
    plt.figure(figsize=(16, 12))

    for i, metric in enumerate(metricts):
        plt.subplot(3, 3, i + 1)
        sns.histplot(dataframe[metric], kde=True, bins=30, color='blue', alpha=0.6)
        plt.axvline(dataframe[metric].mean(), color='r', linestyle='dashed', linewidth=1, label='Mean')
        plt.axvline(dataframe[metric].median(), color='g', linestyle='dashed', linewidth=1, label='Median')
        plt.title(f'Distribution of {metric}')
        plt.xlabel(metric)
        plt.ylabel('Frequency')
        plt.legend()

    plt.tight_layout()
    plt.savefig('results/distribution_metrics.png')
    plt.show()

In [30]:
def plot_metrics_boxplot(metricts):
    numeric_metrics = dataframe[metricts].select_dtypes(include='number')
    
    plt.figure(figsize=(16, 12))
    for i, metric in enumerate(numeric_metrics.columns):
        plt.subplot(3, 3, i + 1)
        sns.boxplot(x=numeric_metrics[metric], color='lightblue')
        plt.title(f'Boxplot of {metric}')
        plt.xlabel(metric)
    
    plt.tight_layout()
    plt.savefig('results/boxplot_metrics.png')
    plt.show()

In [31]:
def plot_metrics_statistics(metricts):
    desc_stats = dataframe[metricts].describe().T[['mean', '50%', 'std']]
    desc_stats.columns = ['Mean', 'Median', 'Standard Deviation']
    desc_stats.plot(kind='bar', figsize=(12, 6))
    plt.title('Descriptive Statistics of Repository Metrics')
    plt.ylabel('Values')
    plt.xticks(rotation=0)
    plt.savefig('results/metrics_statistics.png')
    plt.show()

In [32]:
METRICS = [
    'size', 
    'stargazers_count', 
    'watchers_count', 
    'forks_count',
    'subscribers_count', 
    'open_issues_count',
    'network_count',
    'total_lines'
]

dataframe = pd.read_csv('codesamples.csv')

In [33]:
dataframeLanguages = calculate_language_usage(dataframe)
dataframeLanguages

In [34]:
dataframe_statistics = generate_statistics(dataframe)
dataframe_statistics

KeyError: "['open_issues_count', 'network_count'] not in index"

In [None]:
plot_metrics_distribution(METRICS)
plot_metrics_boxplot(METRICS)
plot_metrics_statistics(METRICS)