Let's start by importing the necessary libraries.

In [1]:
%pip install PyGithub python-dotenv pandas tqdm cachetools scikit-learn nbformat
%pip install --upgrade nbformat

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


From the libraries `Github`, `pandas`, `dotenv`, `os` and `sklearn`, we will import the necessary functions to download the dataset, manipulate the data, and train the model.

In [2]:
from github import Github
import pandas as pd
from dotenv import load_dotenv
from os import getenv
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split

Let's set up the github account and define our main methods.

In [3]:
load_dotenv(override=True)
g = Github(getenv('GITHUB_TOKEN'), per_page=100)

In [4]:
def get_commit_messages(commits: pd.DataFrame, org: str, repo: str) -> pd.DataFrame:
    repo = g.get_organization(org).get_repo(repo)
    
    messages = []
    
    for commit in commits['id']:
        message = repo.get_commit(str(commit)).commit.message
        messages.append(message)
        
    return pd.DataFrame(messages, columns=['message'])

In [5]:
def get_pull_reqs_related_to_commits(commits: pd.DataFrame, org: str, repo: str) -> pd.DataFrame:
    repo = g.get_organization(org).get_repo(repo)
    pull_reqs = repo.get_pulls(state='all')
    
    data = []
    
    for commit in commits['id']:
        for pull_req in pull_reqs:
            if pull_req.merge_commit_sha == commit:
                data.append({
                    'pull_req_id': pull_req.number,
                    'pull_req_title': pull_req.title,
                    'pull_req_body': pull_req.body
                })

    return pd.DataFrame(data, columns=['pull_req_id', 'pull_req_title', 'pull_req_body'])

In [6]:
def get_directories(commits: pd.DataFrame, org: str, repo: str) -> pd.DataFrame:
    commit_dirs = []
    repo = g.get_organization(org).get_repo(repo)
    
    for commit in commits['id']:
        directories = set()
        for file in repo.get_commit(commit).files:
            file_path = file.filename
            if '/' in file_path:
                directory = '/'.join(file_path.split('/')[:-1])
                directories.add(directory)
            else:
                directories.add('.')
        commit_dirs.append(' '.join(directories))
    
    return pd.DataFrame(commit_dirs, columns=['dirs'])

Let's extract the data from all the specified repos and save them in a single dataframe cointaing their id, message, directory, opinion and pull_request infos.

In [7]:
repos = [('spring-guides', 'gs-accessing-data-jpa'), ('Azure-Samples', 'java-native-telemetry'),
        ('aws-samples', 'amazon-ivs-player-web-sample'), 
        ('aws-samples', 'aws-marketplace-serverless-saas-integration')]

classified_commits = pd.DataFrame(columns=['id', 'message', 'dirs', 'pull_req_id', 
                                           'pull_req_title', 'pull_req_body', 'opinion'])

for org, repo in repos:
    repo_data = pd.read_csv(f'{repo}.csv')
    
    messages_df = get_commit_messages(repo_data, org, repo)
    directories_df = get_directories(repo_data, org, repo)
    pull_reqs_df = get_pull_reqs_related_to_commits(repo_data, org, repo)
    
    combined_df = pd.concat([repo_data['id'], messages_df, directories_df, pull_reqs_df, repo_data['opinion']], axis=1)
    classified_commits = pd.concat([classified_commits, combined_df], ignore_index=True)

classified_commits.to_csv('classified_commits.csv', index=False)

  classified_commits = pd.concat([classified_commits, combined_df], ignore_index=True)


<div class="alert alert-block alert-info">
<b>Message training:</b> We will use the message of the commit as the main feature to train the model.
</div>

In [8]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(classified_commits['message'])

model = MultinomialNB()

model.fit(X, classified_commits['opinion'])

# Predict the opinions for all commit messages at once
classified_commits['predicted_opinion'] = model.predict(X)

# Calculate the accuracy
msg_accuracy = accuracy_score(classified_commits['opinion'], classified_commits['predicted_opinion'])

print(f'Accuracy = {msg_accuracy}')

Accuracy = 0.95


<div class="alert alert-block alert-info">
<b>Directory training:</b> We will use the file directories of the commit as the main feature to train the model.
</div>

In [9]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(classified_commits['dirs'])

model = MultinomialNB()

model.fit(X, classified_commits['opinion'])

# Predict the opinions for all commit dirs at once
classified_commits['predicted_opinion'] = model.predict(X)

# Calculate the accuracy
dirs_accuracy = accuracy_score(classified_commits['opinion'], classified_commits['predicted_opinion'])

print(f'Accuracy = {dirs_accuracy}')

Accuracy = 0.55


<div class="alert alert-block alert-info">
<b>Mixed training:</b> We will use the message and the file directories of the commit as the main features to train the model.
</div>

In [10]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(classified_commits['message'] + ' ' + classified_commits['dirs'])

model = MultinomialNB()

model.fit(X, classified_commits['opinion'])

# Predict the opinions for all commit messages at once
classified_commits['predicted_opinion'] = model.predict(X)

# Calculate the accuracy
combined_accuracy = accuracy_score(classified_commits['opinion'], classified_commits['predicted_opinion'])

print(f'Combined Accuracy = {combined_accuracy}')

the difference between the two accuracies is: 0.3999999999999999
Combined Accuracy = 0.8285714285714286


In [14]:
import plotly.express as px

# Create a DataFrame with the accuracy values

accuracy_df = pd.DataFrame({
    'Feature': ['Message', 'Dirs', 'Combined'],
    'Accuracy': [msg_accuracy, dirs_accuracy, combined_accuracy]
})

# Create a bar plot
fig = px.bar(
    accuracy_df, 
    x='Feature', 
    y='Accuracy', 
    text='Accuracy', 
    title='Accuracy of Different Features',
    labels={'Accuracy': 'Accuracy (%)'}
)

fig.show()

<div class="alert alert-block alert-info">
<b>Split Training:</b> We will use the opinion and dirs of the commit as the target to train the model.

</div>

In [11]:
class_counts = classified_commits['opinion'].value_counts()
filtered_classes = class_counts[class_counts >= 2].index
filtered_commits = classified_commits[classified_commits['opinion'].isin(filtered_classes)]

In [12]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(filtered_commits['message'] + ' ' + filtered_commits['dirs'])

# Use both 'opinion' and 'dirs' as labels in y
y = filtered_commits[['opinion', 'dirs']]

# Split the dataset into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.3, 
    random_state=42, 
    stratify=y['opinion']  # Stratify based on the 'opinion' column
)


# Create and train the multi-output model
multi_output_model = MultiOutputClassifier(MultinomialNB())
multi_output_model.fit(X_train, y_train)

# Predict the opinions and dirs for the test set
y_pred = multi_output_model.predict(X_test)

# Calculate accuracy for each label (opinion and dirs) separately
opinion_accuracy = accuracy_score(y_test['opinion'], y_pred[:, 0])
dirs_accuracy = accuracy_score(y_test['dirs'], y_pred[:, 1])

# Calculate combined accuracy (optional)
combined_accuracy = (opinion_accuracy + dirs_accuracy) / 2

print(f'Opinion Accuracy = {opinion_accuracy}')
print(f'Dirs Accuracy = {dirs_accuracy}')
print(f'Combined Accuracy = {combined_accuracy}')

Opinion Accuracy = 0.4523809523809524
Dirs Accuracy = 0.30952380952380953
Combined Accuracy = 0.38095238095238093
