# Data Collection

## GitHub credentials

A private access token is necessary to make use of less restrictive API limits.

In [1]:
from github import RateLimitExceededException, Github

# Providing access token
access_token = ""
g = Github(login_or_token=access_token)

# Confirm your login is successful
user = g.get_user()
print(f"Authenticated as: {user.login}")

Authenticated as: AbelvdTil


## Data files

Path to output files.

In [2]:
import os
STEP1_HCLREPOS =   os.path.join("data2", "step1-hcl-repositories.txt")
STEP2_TFREPOS =    os.path.join("data2", "step2-tf-repositories.txt")
STEP2_404REPOS =   os.path.join("data2", "step2-404-repositories.txt")
STEP3_KWCOMMITS =  os.path.join("data2", "step3-keyword-commits.json")
STEP3_ERRORREPOS = os.path.join("data2", "step3-error-repositories.txt")
STEP4_TFCOMMITS =  os.path.join("data2", "step4-tf-commits.json")
STEP5_TFISSUES =  os.path.join("data2", "step5-tf-issues.json")
STEP5_TFISSUES_ERROR =  os.path.join("data2", "step5-tf-issues-error-list.json")

## General Steps

The steps to collect the data used for both RQ1 (commits) and RQ2 (issues).

### Step 1 - Recover GitHub repositories containing HCL IaC

For each day from 2014, query the GitHub search API for repositories that use HCL as language.
Some dates queried do not exist, an exception is caught to avoid interruptions.

Every repository is saved in '`data/step1-hcl-repositories.txt`' so no progress is lost in case of interruptions.

In [5]:
import time

script_urls = []
for year in range(2014, 2023):
    for month in range(1, 13):
        print(f"Scraping month {month} of year {year}")
        for day in range(1, 32):
            # Formatting compatible with search parameters
            date = f"{year}-{month:02d}-{day:02d}"
            try:
                time.sleep(2)  # sleep to reset API search limit
                repos = g.search_repositories(query=f"created:{date} language:HCL")
                for repo in repos:
                    time.sleep(0.2)  # sleep to reset API core limit
                    # URLs are added to a txt file to avoid data loss
                    with open(STEP1_HCLREPOS, "a") as file:
                        file.write(f"{repo.clone_url}\n")
                    script_urls.append(repo.clone_url)
            except RateLimitExceededException:
                print("Rate Limit Exception reached!")
            except Exception as e:
                print(e)
                # These are impossible dates (31-2-2022)
                print(f"Skipping: {date}")

Scraping month 1 of year 2014


KeyboardInterrupt: 

In [None]:
# Number of HCL repositories obtained
print(len(script_urls))

### Step 2 - Filter repositories with Terraform files

Read the repositories from the previous step.

In [3]:
# read urls from the file and strip the '\n'
gitUrls_file = open(STEP1_HCLREPOS, "r")
repo_links = gitUrls_file.readlines()
repo_links = [repo.strip() for repo in repo_links]

Scan the content of each repository looking for files with extension '`.tf`' and '`.tf.json`' (i.e., Terraform artifact files).

Suitable repositories are saved in '`data/step2-tf-repositories.txt`'.

Repositories that are not reachable for any reason are saved in '`data/step2-404-repositories.txt`'.

In [17]:
def current_milli_time():
    return round(time.time() * 1000)

lastTime = current_milli_time()

counter = 0
terraform_keywords = ['.tf', '.tf.json']
terraform_relevant_repos = []
for repo_url in repo_links:
    if counter % 100 == 0:
        print(f'Got to {counter}')
    try:
        # check if last operation was faster than 750 milliseconds, 
        # if so wait until at least 750 milliseconds since last api call.
        if (lastTime + 750 > current_milli_time()):
            time.sleep((lastTime + 750 - current_milli_time()) / 1000)  # sleep for API search limit
        lastTime = current_milli_time()
        
        split_list = repo_url.split("/")
        actual_url = split_list[3]+ '/' + split_list[4]
        repo = g.get_repo(actual_url.split('.git')[0])
        contents = repo.get_contents('')
        while contents:
            file_content = contents.pop(0)
            if file_content.type == "dir":
                contents.extend(repo.get_contents(file_content.path))
            else:
                if file_content.name is not None and any(key in file_content.name.lower() for key in terraform_keywords):
                    terraform_relevant_repos.append(repo_url)
                    with open(STEP2_TFREPOS, "a") as file:
                        file.write(f"{repo_url}\n")
                    break
        counter += 1
    except RateLimitExceededException:
        print("Rate Limit Exception reached!")
    except Exception as e:
        print(f"{e}\n{repo_url}")
        with open(STEP2_404REPOS, "a") as file:
            file.write(f"{repo_url}\n")

Got to 0
Got to 100
Got to 200
Got to 300
Got to 400
Got to 500
Got to 600
404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest/repos/repos#get-a-repository"}
https://github.com/davidjyeo/az_multi_region.git


Request GET /repos/aidapsibr/budget-az-network failed with 403: Forbidden
Setting next backoff to 707.450359s


Got to 700
Got to 800
Got to 900
Got to 1000
Got to 1100
Got to 1200
Got to 1300
404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest"}
https://github.com/FM1337/.github.git
Got to 1400
Got to 1500
Got to 1600
Got to 1700
Got to 1800
Got to 1900


Request GET /repos/ssoogur/TerraformExecute failed with 403: Forbidden
Setting next backoff to 1338.896339s


Got to 2000
Got to 2100
404 {"message": "Not Found", "documentation_url": "https://docs.github.com/rest"}
https://github.com/LandmakTechnology/.github-workflows-terra.git
Got to 2200
Got to 2300
Got to 2400
Got to 2500


Request GET /repos/RoshanFathima/FinalProjectGroup13/contents/prod failed with 403: Forbidden
Setting next backoff to 1369.540823s


KeyboardInterrupt: 

### Step 3 - Extract commits with cost-related keywords

Read the previously filtered Terraform repositories. 

Then style a keyword list meant to be used in the commit message filtering phase.

In [18]:
# read urls from the file and strip the '\n'
all_repos = open(STEP2_TFREPOS, "r")
repo_links = [repo.strip() for repo in all_repos.readlines()]
cost_keywords = ["cheap", "expens", "cost", "efficient", "bill", "pay"]
print(len(repo_links))

189512


Using PyDriller, traverse the commits of each repository.

For every commit containing one or more keywords in its message, extract **commit id**, **message**, **date** and **list of modified files**.

The final list of extracted commits is saved as JSON in '`data/step3-keyword-commits.json`'.

If an error occur while trying to access a commit, the repository URL is saved in '`data/step3-error-repositories.txt`'.

> NOTES:
> - To foster privacy, we only save the information needed for the study

In [4]:
import json
from pydriller import Repository

relevant_repos = []
count = 0
for repo in repo_links:
    commits = []

    if count % 100 == 0:
        print("Got to {}".format(count))

    try:
        # For each commit in the repository
        for commit in Repository(repo).traverse_commits():
            # If any of the keyword appear in the commit message
            if commit.msg is not None and any(key in commit.msg.lower() for key in cost_keywords):
                changed_files = []
                # Save the modified files
                for file in commit.modified_files:
                    changed_files.append(file.filename)
                commit_dic = {"id": commit.hash, 
                              "msg":commit.msg, 
                              "date":str(commit.author_date),
                              "modified_files": changed_files}
                commits.append(commit_dic)
        repo_dic = {"name":repo, "commits":commits}

        # Mark the repository as relevant if it has any relevant commits
        if len(commits) != 0:
            relevant_repos.append(repo_dic)
    except Exception as e:
        # so that we document what errors can happen when accessing commits
        print(f"{e}\n{repo}")
        with open(STEP3_ERRORREPOS, "a") as file:
            file.write(f"{repo}\n")
    count = count + 1

output = {"no_of_repos":len(relevant_repos) ,"repositories": relevant_repos}
with open(STEP3_KWCOMMITS, "w") as outfile:
    json.dump(output, outfile)

NameError: name 'repo_links' is not defined

## RQ1 - Commit Collection

From the remaning tasks for RQ1, the only automation was the selection of commits that modify Terraform files.

That means, the removal of commits from forks, the filtering by relevance and the coding were manual steps and are not covered in this script.

The final result (i.e., set of units of analysis pertaining to commits) can be found in the file '`dataset.json`'.

### Filter commits that modify Terraform files 

Refines the previous JSON file so that only commits that modify '`.tf`' and '`.tf.json`' files are taken into consideration.

The final list of filtered commits is saved as JSON in '`data/step4-tf-commits.json`'.

In [5]:
# Opening JSON file
terraform_output = open(STEP3_KWCOMMITS)
selected_repos = json.load(terraform_output)

filtered_repos = []
terraform_keywords = ['.tf', '.tf.json']
print(len(selected_repos["repositories"]))

for repo in selected_repos["repositories"]:
    relevant_commits = []
    flag = False
    for commit in repo["commits"]:
        for mod_file in commit["modified_files"]:
            if mod_file is not None and any(key in mod_file for key in terraform_keywords):
                relevant_commits.append(commit)
                flag = True
                break

    if flag:
        # new_commit_repo = {"name":repo["name"], "commits":relevant_commits}
        repo["commits"] = relevant_commits
        filtered_repos.append(repo)



print(f"Identified {len(filtered_repos)}")

output = {"no_of_repos":len(filtered_repos) ,"repositories": filtered_repos}
with open(STEP4_TFCOMMITS, "w") as outfile:
    json.dump(output, outfile)

2945
Identified 1485


In [15]:
with open(STEP4_TFCOMMITS, "r") as input1:
    relevant_repos = json.load(input1)

commits = []
for repo in relevant_repos["repositories"]:
    for com in repo["commits"]:
        commits.append(com["id"])
print(len(commits))
print(len(set(commits)))


3682
2993


## RQ2 - Issue Collection

From the remaning tasks for RQ2, the only automation was the collection of issues that contain one or more keywords.

That means, the filtering by relevance and the coding were manual steps and are not covered in this script.

The final result (i.e., set of units of analysis pertaining to issues) can be found in the file '`dataset.json`'.

### Extracting issues from repositories with keywords

Take the repository URLs from all the commits that contain at least one of our keywords (i.e., from '`data/step3-keyword-commits.json`').

Use Perceval to extract any issue that contains a cost-related keyword in either the title, body or comments.

This process is time-consuming. If the GitHub API limit is reached, a proper waiting time is set by calculating how long it takes to reset the limits.

The final list of collected issues is saved as JSON in '`step5-tf-issues.json`'.

If an error occur while trying to extract an issue, the repository URL is saved in '`step5-tf-issues-error-list.json`' for re-assessment and re-download (if applicable).

> NOTES:
> - To foster privacy, we only save the information needed for the study
> - The output file provided in this repository has been anonymized and manually inspected to replace usernames in text with '`@user`'.

In [None]:
import time
import json
import calendar
from perceval.backends.core.github import GitHub as pGithub
from github import Github
from pathlib import Path

# Opening JSON file
terraform_output = open(STEP3_KWCOMMITS)
selected_repos = json.load(terraform_output)

# Get list of repositories that contain commits with related keywords (i.e., step 3 output)
terraform_keyworded_urls = [entry['name'] for entry in selected_repos['repositories']]

# Extract issues from the repositories
relevant_repos = []
count = 0
for repo in terraform_keyworded_urls:
    print(repo)
    time.sleep(2)
    count += 1
    try:
        if count % 50 == 0:
            time.sleep(120)
            print(f"At: {count}")

        # Extracting owner username and repository name from the URL
        owner = repo.split('/')[3]
        repository = ".".join(repo.split('/')[4].split('.')[:-1])
        fetched = pGithub(owner=owner, repository=repository, api_token=[access_token])
        issue_list = []

        for item in fetched.fetch():
            time.sleep(1)
            # do not save pull requests
            if 'pull_request' in item_data:
                continue
            
            # initialize all entries in case of empty fields (to prevent errors)
            item_data = item['data']
            title_flag = False
            body_flag = False
            comment_flag = False

            # If title, body or comments contain any of the keywords, then the issue is relevant
            if 'title' in item_data and item_data['title'] is not None and any(key in item_data['title'] for key in cost_keywords):
                title_flag = True
            elif 'body' in item_data and item_data['body'] is not None and any(key in item_data['body'] for key in cost_keywords):
                    body_flag = True
            elif 'comments_data' in item_data and 'comments_data' is not None:
                for comment in item_data['comments_data']:
                    if 'body' in comment and comment['body'] is not None and any(key in comment['body'] for key in cost_keywords):
                        comment_flag = True
                        break

            time.sleep(1)

            if title_flag or body_flag or comment_flag:
                print("-adding elements to dictionary")
                issue_dict = {
                'title': '' if 'title' not in item_data else item_data['title'],
                'html_url': None if 'html_url' not in item_data else item_data['html_url'],
                'body': '' if 'body' not in item_data else item_data['body'],
                'comments_data': [] if 'comments_data' not in item_data else item_data['comments_data']
                }
                issue_dict['comments_data'] = [c['body'] for c in issue_dict['comments_data'] if c.get('body')]
                issue_list.append(issue_dict)

        repo_dic = {"name":repo, "issues":issue_list}
        if len(issue_list) > 0:
            relevant_repos.append(repo_dic)

    except Exception as e:
        print(f"Repo: {repo} failed")
        print(e)
        core_rate_limit = g.get_rate_limit().core
        reset_timestamp = calendar.timegm(core_rate_limit.reset.timetuple())
        sleep_time = reset_timestamp - calendar.timegm(time.gmtime()) + 5  # add 5 seconds to be sure the rate limit has been reset
        time.sleep(sleep_time)
        with open(STEP5_TFISSUES_ERROR, "a") as file:
            file.write(f"{repo}\n")


output = {"no_of_repos":len(relevant_repos) ,"repositories": relevant_repos}
with open(STEP5_TFISSUES, "w") as outfile:
    json.dump(output, outfile)