# Fetching Data

This notebook pulls all the necessary information from the GitHub repositories and formats it as JSON with the following fields:
- Name of Issue
- ID of the Issue
- Name of the Repo
- ID of the Repo
- Code Changes

In [17]:
# Parameters for the Notebook
# Set these to the values you need before running the entire notebook.
# NOTE(review): in the cells visible here, `github_url` and `first_run` are only passed through
# to functions that do not read them yet, and `path_for_course` is not used at all.

github_url: str = ""                # URL of the repo/issue to process; only relevant when manual_issue is True
manual_issue: bool = False          # set to True to process one specific URL instead of searching for issues
first_run: bool = False             # set to False once you have already run the entire notebook once
path_for_course: str = ""           # full path of the directory in which the course folder should be created

# Set-Up of the Notebook

import pandas as pd
import numpy as np
import os
from github import Github
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()
# os.getenv returns None when GITHUB_TOKEN is not set.
GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
# NOTE(review): presumably a None token yields an unauthenticated, heavily rate-limited client — confirm.
GITHUB_OBJECT = Github(GITHUB_TOKEN)




### Creating the Database

We want to select the proper issues from all the top repositories to create the courses from. Thus we are looking for issues that match all the following criteria:

- Is from a JS/TS repository
- Is a closed issue
- Is a good first issue
- Only has one pull request
- Is an issue that only makes code changes

The reasoning behind these criteria is twofold: on the one hand, we want relevant coding issues to give to students who are working with open source software for the first time; on the other hand, restricting the selection this way makes it easier for us to generate all the exercises.

In [3]:
# Search GitHub for repositories with the "react" topic, ordered by star count (descending).
# NOTE(review): two `language:` qualifiers are given — confirm GitHub search treats them as
# "JavaScript OR TypeScript" as intended.
reactRepos = GITHUB_OBJECT.search_repositories(
    query="topic:react language:JavaScript language:TypeScript",
    sort="stars",
    order="desc",
)

In [34]:
def select_issues(manual_issue: bool, repos, issue_url: str, max_repos: int = 100):
    """
    Collect closed "good first issue" issues that are suitable for a coding course.

    For each of the first ``max_repos`` repositories in ``repos``, every closed issue
    labelled "good first issue" is kept unless it
    - is itself a pull request, or
    - has a title that hints at documentation/style work (see ``filtered_words``).

    The resulting table is also written to ``data/selected_issues.csv``.

    Parameters:
        manual_issue: When True, the automatic selection is skipped entirely.
        repos: Iterable of repository objects exposing
            ``get_issues(state=..., labels=...)``.
        issue_url: URL of a manually chosen issue; currently unused.
        max_repos: Maximum number of repositories to scan (``None`` scans all).

    Returns:
        A DataFrame with the columns ``name`` (issue title), ``repo`` (repository
        object) and ``issue`` (one-element list holding the issue object), or
        ``None`` when ``manual_issue`` is True.
    """
    from itertools import islice

    if manual_issue:
        # The manual-issue path is not implemented yet; callers receive None.
        return None

    # Matched as case-insensitive substrings of the title, so "doc" also covers
    # "Docs", "DOCUMENT", "documentation", ...
    filtered_words = ("doc", "style", "readme")
    issue_list = []

    # islice stops after max_repos items without requesting a further one.
    for repo in islice(repos, max_repos):
        issues = repo.get_issues(state="closed", labels=["good first issue"])
        for issue in issues:
            # Skip entries that are pull requests rather than plain issues.
            if issue.pull_request is not None:
                continue

            # `continue`, not `break`: one filtered title must not discard the
            # remaining issues of the same repository.
            title = issue.title.casefold()
            if any(word in title for word in filtered_words):
                continue

            issue_list.append({"name": issue.title, "repo": repo, "issue": [issue]})

    # Explicit columns keep the schema stable even when nothing was selected.
    issue_dataframe = pd.DataFrame(issue_list, columns=["name", "repo", "issue"])

    # to_csv does not create missing directories.
    os.makedirs("data", exist_ok=True)
    issue_dataframe.to_csv(path_or_buf="data/selected_issues.csv")

    return issue_dataframe

# Run the selection over the React repository search results
# (select_issues also writes the result to data/selected_issues.csv).
selected_issues = select_issues(manual_issue, reactRepos, github_url)

1 0
1 1
1 2
1 3
1 4
1 5
1 6
1 7
1 8
1 9
1 10
1 11
1 12
1 13
1 14
1 15
1 16
1 17
1 18
1 19
1 20
2 0
2 1
3 0
3 1
4 0
6 0
6 1
6 2
8 0
8 1
8 2
8 3
8 4
8 5
8 6
8 7
8 8
8 9
9 0
9 1
9 2
9 3
9 4
9 5
9 6
9 7
9 8
9 9
9 10
9 11
9 12
9 15
9 16
9 17
9 18
9 19
9 20
9 21
9 22
9 23
9 24
9 25
9 26
9 27
9 28
9 29
9 30
9 31
9 32
9 33
9 34
9 35
9 36
9 37
9 38
9 39
9 40
9 41
9 42
9 43
9 44
11 0
14 0
14 1
16 0
16 1
16 2
16 3
16 4
17 0
17 1
17 2
18 0
18 1
20 0
20 1
20 2
20 3
20 4
20 5
20 6
20 7
20 8
20 9
20 10
20 11
20 12
20 13
20 14
20 15
20 16
20 17
20 18
20 19
20 20
20 21
20 22
23 0
23 1
23 2
23 3
23 4
23 5
23 6
23 7
23 8
23 9
23 10
23 11
23 12
23 13
27 0
27 1
27 2
27 3
27 4
27 5
27 6
27 7
27 8
27 9
27 10
27 11
27 12
27 13
27 14
28 0
28 1
28 2
28 3
28 4
28 5
28 6
31 0
31 1
31 2
31 3
31 4
31 5
31 6
31 7
31 8
31 9
31 10
31 11
31 12
31 13
31 14
31 15
31 16
31 17
31 18
31 19
31 20
31 21
31 22
31 23
31 24
31 25
31 26
31 27
31 28
31 29
31 30
31 31
31 32
31 33
31 34
31 35
31 36
31 37
31 38
31 39
31 40
31 41
31 4

In [36]:
def _linked_pull_request(event):
    """Return the pull request that cross-references an issue via ``event``, or None."""
    if event.event != "cross-referenced":
        return None
    # Some timeline events carry no source; treat those as "no pull request".
    source_issue = getattr(event.source, "issue", None)
    if source_issue is None or not source_issue.pull_request:
        return None
    return source_issue


def process_issues(first_run: bool, manual_issue: bool, issues, max_issues: int = 100):
    """
    Keep only the selected issues that are linked to exactly one pull request.

    The timeline of every issue is scanned for "cross-referenced" events whose
    source is a pull request. Issues with exactly one such event are kept and
    formatted according to our schema:
    {
        repo:   # Repository the issue belongs to
        title:  # The title of the issue
        issue:  # The issue itself
        pr:     # The list of events in the pull request
        a:      # Title of the issue the events belong to
    }
    The result is also written to ``data/filtered_issues.csv``.

    Parameters:
        first_run: Currently unused.
        manual_issue: When True, the automatic processing is skipped entirely.
        issues: DataFrame with the columns ``name``, ``repo`` and ``issue``
            (a one-element list holding an object with ``get_timeline()``).
        max_issues: Stop once this many issues have been collected.

    Returns:
        A DataFrame following the schema above, or ``None`` when
        ``manual_issue`` is True.
    """
    if manual_issue:
        # The manual-issue path is not implemented yet; callers receive None.
        return None

    filtered_issues = []

    for _, row in issues.iterrows():
        list_pr = []

        # Look for the cross-referenced pull request of the issue
        for event in row["issue"][0].get_timeline():
            pull_request = _linked_pull_request(event)
            if pull_request is None:
                continue
            print("Got in!")
            list_pr.append({"pr": pull_request, "event": event})
            # More than one linked pull request disqualifies the issue, so
            # there is no need to read the rest of its timeline.
            if len(list_pr) > 1:
                break

        if len(list_pr) != 1:
            continue

        filtered_issues.append({
            "repo": row["repo"],
            "title": row["name"],
            "issue": row["issue"],
            "pr": list_pr,
            # Title of the issue, taken from the current row.
            "a": [{"a": row["name"]}],
        })
        print(len(filtered_issues))

        if len(filtered_issues) >= max_issues:
            break

    # Explicit columns keep the schema stable even when nothing qualified.
    filtered_issues_dataframe = pd.DataFrame(
        filtered_issues, columns=["repo", "title", "issue", "pr", "a"]
    )

    # to_csv does not create missing directories.
    os.makedirs("data", exist_ok=True)
    filtered_issues_dataframe.to_csv(path_or_buf="data/filtered_issues.csv")

    return filtered_issues_dataframe

# Reduce the selected issues to those linked to exactly one pull request.
# NOTE(review): `True` is passed literally instead of the notebook-level `first_run`
# parameter — confirm this is intended.
filtered_issues = process_issues(True, manual_issue, selected_issues)

In [None]:
def get_coding_changes(first_run: bool, manual_issue: bool, data):
    """
    Build the table of code changes for the filtered issues.

    This is still a stub: no changes are extracted yet, so the returned table
    is empty.

    Parameters:
        first_run: Currently unused.
        manual_issue: When True, the automatic processing is skipped entirely.
        data: The filtered issues to extract code changes for; currently unused.

    Returns:
        A DataFrame built from the collected rows (empty for now), or ``None``
        when ``manual_issue`` is True.
    """
    if manual_issue:
        # The manual-issue path is not implemented yet; callers receive None.
        return None

    # TODO: fill with one entry per issue in `data` once extraction is implemented.
    updated_data = []

    # Returning a freshly built frame avoids referencing a name that only
    # exists after this function has already returned once.
    return pd.DataFrame(updated_data)

# Build the code-changes table for the filtered issues (get_coding_changes is still a stub).
updated_data_dataframe = get_coding_changes(first_run, manual_issue, filtered_issues)

### Creating the Courses' File Structure

JetBrains Academy uses a specific file structure, and a course can be generated simply by following it. The key parts of this structure are the YAML files that define the role of the folder they are placed in. A folder can be one of three types, some of which have subtypes:

- **Course:** The folder is encompassing the entire course. In there we define what the content of the course is and what the additional files are.
- **Lessons:** 
    - *Lesson*: Normal lessons where each task is independent from each other
    - *Guided Project*: Lesson with the `type: framework` tag, propagating the file changes made by the user from one task to the other
- **Tasks**:
    - *(task types still to be documented)*

### Create Course Input Format

The input to the `create_course` function is a JSON file with the following structure:

```text
{
    "title": "Course Title",
    "lessons": [
        {
            "title": "Lesson Title",
            "type": "edu or nothing",
            "tasks": [
                {
                    "title": "Task Title",
                    "type": "type of the task",
                    "description": "Generated Description of the Task"
                }
            ]
        }
    ],
    "additional content"
}
```

In [None]:
def create_course(course_data):
    """
    This function takes a JSON file as an input, and uses this input to create the entire folder structure
    for the JetBrains Academy course
    """

    