In [1]:
%pip install PyGithub python-dotenv pandas tqdm cachetools scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from github import Github
import pandas as pd
from dotenv import load_dotenv
from os import getenv
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load variables from a .env file into the process environment; override=True
# lets values from .env replace variables that are already set.
load_dotenv(override=True)
# Module-level GitHub client that the helper functions in this notebook read as `g`.
# getenv() returns None when GITHUB_TOKEN is unset.
# NOTE(review): presumably the client then runs unauthenticated with a much lower
# rate limit — confirm against the installed PyGithub version.
g = Github(getenv('GITHUB_TOKEN'), per_page=100)

In [4]:
def get_commit_messages(commits: pd.DataFrame, org: str, repo: str) -> pd.DataFrame:
    """Look up the commit message of every row in ``commits``.

    Args:
        commits: Frame whose first column is read as the commit identifier;
            its string form is passed to ``get_commit``.
        org: Organization login used to resolve the repository.
        repo: Repository name inside ``org``.

    Returns:
        A one-column frame (``message``) with one row per input row, in input
        order, on a fresh default index.
    """
    repository = g.get_organization(org).get_repo(repo)

    # itertuples() yields (Index, col0, col1, ...), so position 1 is the first column.
    messages = [
        repository.get_commit(str(row[1])).commit.message
        for row in commits.itertuples()
    ]

    return pd.DataFrame(messages, columns=['message'])

In [5]:
def get_pull_reqs_related_to_commits(commits: pd.DataFrame, org: str, repo: str) -> pd.DataFrame:
    """Find, for every commit, the pull request whose merge commit it is.

    Args:
        commits: Frame whose first column is compared against each pull
            request's ``merge_commit_sha``.
        org: Organization login used to resolve the repository.
        repo: Repository name inside ``org``.

    Returns:
        A frame with columns ``pull_req_id``, ``pull_req_title`` and
        ``pull_req_body`` holding exactly one row per row of ``commits`` and
        sharing ``commits.index``. Commits without a matching pull request get
        missing values. Because the index matches, the result can be attached
        to ``commits`` with ``pd.concat(..., axis=1)`` without shifting pull
        request data onto unrelated commits.

        If several pull requests report the same merge commit, the first one
        returned by the API is kept.
    """
    repository = g.get_organization(org).get_repo(repo)

    # Walk the pull-request listing once and index it by merge commit, instead
    # of rescanning the whole listing for every commit.
    pull_req_by_sha = {}
    for pull_req in repository.get_pulls(state='all'):
        merge_sha = pull_req.merge_commit_sha
        if merge_sha is not None:
            pull_req_by_sha.setdefault(merge_sha, pull_req)

    columns = ['pull_req_id', 'pull_req_title', 'pull_req_body']

    rows = []
    # itertuples() yields (Index, col0, col1, ...), so position 1 is the first column.
    for commit in commits.itertuples():
        match = pull_req_by_sha.get(commit[1])
        if match is None:
            # Keep a placeholder row so the output stays aligned with `commits`.
            rows.append(dict.fromkeys(columns))
        else:
            rows.append({
                'pull_req_id': match.number,
                'pull_req_title': match.title,
                'pull_req_body': match.body,
            })

    return pd.DataFrame(rows, columns=columns, index=commits.index)

In [6]:
def get_directories(commits: pd.DataFrame, org: str, repo: str) -> list:
    """List the directories touched by each commit as one space-joined string.

    Args:
        commits: Frame whose first column is read as the commit identifier;
            its string form is passed to ``get_commit``.
        org: Organization login used to resolve the repository.
        repo: Repository name inside ``org``.

    Returns:
        A list of strings with exactly one entry per row of ``commits``, in
        row order, so it can be assigned directly as a new column. Each entry
        holds the distinct parent directories of the commit's files in first-seen
        order, separated by spaces. A file whose path has no ``/`` contributes
        the token ``'empty'``. A commit that reports no files yields ``''``
        rather than being skipped, which keeps the list aligned with ``commits``.
    """
    repository = g.get_organization(org).get_repo(repo)

    commit_dirs = []
    # itertuples() yields (Index, col0, col1, ...), so position 1 is the first column.
    for commit in commits.itertuples():
        directories = []
        for changed_file in repository.get_commit(str(commit[1])).files:
            directory, separator, _ = changed_file.filename.rpartition('/')
            if not separator:
                # No '/' in the path: record the placeholder token.
                # NOTE(review): unlike real directories, 'empty' is appended once
                # per such file (not de-duplicated); kept as in the original so
                # the derived token counts do not change.
                directories.append('empty')
            elif directory not in directories:
                directories.append(directory)
        # Always emit an entry, even for a commit with no files, so that
        # len(result) == len(commits).
        commit_dirs.append(' '.join(directories))
    return commit_dirs

In [7]:
# Labelled commits, read from the CSV next to the notebook.
classified_commits = pd.read_csv('java-native-telemetry.csv')

# One space-joined directory string per commit, stored as the `dirs` column.
classified_commits = classified_commits.assign(
    dirs=get_directories(classified_commits, 'Azure-Samples', 'java-native-telemetry')
)

# Pull-request metadata for the same commits.
pull_req_data = get_pull_reqs_related_to_commits(
    classified_commits, 'Azure-Samples', 'java-native-telemetry'
)

# concat(axis=1) lines rows up by index label, so each pull-request row is attached
# to the commit row carrying the same label.
classified_commits = pd.concat(
    [
        classified_commits,
        pull_req_data.loc[:, ['pull_req_id', 'pull_req_title', 'pull_req_body']],
    ],
    axis=1,
)

# Persist the enriched table without the index column.
classified_commits.to_csv('java-native-telemetry_full.csv', index=False)

In [8]:
# Bag-of-words counts built from the commit messages.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(classified_commits['message'])

# Multinomial naive Bayes fitted on the labelled opinions; fit() returns the estimator.
model = MultinomialNB().fit(X, classified_commits['opinion'])

# Score every commit message in one predict() call.
# NOTE(review): predictions are made on the same X the model was fitted on, so the
# number printed below is accuracy on the training rows, not on held-out data.
classified_commits['predicted_opinion'] = model.predict(X)

# Fraction of rows whose predicted label equals the recorded one.
msg_accuracy = accuracy_score(
    y_true=classified_commits['opinion'],
    y_pred=classified_commits['predicted_opinion'],
)

print(f'Accuracy = {msg_accuracy}')

Accuracy = 0.8857142857142857


In [9]:
# Bag-of-words counts built from the per-commit directory strings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(classified_commits['dirs'])

# Multinomial naive Bayes fitted on the labelled opinions; fit() returns the estimator.
model = MultinomialNB().fit(X, classified_commits['opinion'])

# Score every commit's directory string in one predict() call; this overwrites any
# existing `predicted_opinion` column.
# NOTE(review): predictions are made on the same X the model was fitted on, so the
# number printed below is accuracy on the training rows, not on held-out data.
classified_commits['predicted_opinion'] = model.predict(X)

# Fraction of rows whose predicted label equals the recorded one.
dirs_accuracy = accuracy_score(
    y_true=classified_commits['opinion'],
    y_pred=classified_commits['predicted_opinion'],
)

print(f'Accuracy = {dirs_accuracy}')

Accuracy = 0.5142857142857142


In [10]:
# Gap between the message-only and directory-only scores computed earlier.
print('the difference between the two accuracies is:', msg_accuracy - dirs_accuracy)

# Combine both features: each document is the commit message followed by a space
# and the commit's directory string.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(
    classified_commits['message'] + ' ' + classified_commits['dirs']
)

# Multinomial naive Bayes fitted on the labelled opinions; fit() returns the estimator.
model = MultinomialNB().fit(X, classified_commits['opinion'])

# Score every combined document in one predict() call; this overwrites any existing
# `predicted_opinion` column.
# NOTE(review): predictions are made on the same X the model was fitted on, so the
# number printed below is accuracy on the training rows, not on held-out data.
classified_commits['predicted_opinion'] = model.predict(X)

# Fraction of rows whose predicted label equals the recorded one.
combined_accuracy = accuracy_score(
    y_true=classified_commits['opinion'],
    y_pred=classified_commits['predicted_opinion'],
)

print(f'Combined Accuracy = {combined_accuracy}')

the difference between the two accuracies is: 0.37142857142857144
Combined Accuracy = 0.5714285714285714
