# GitHub Issues Retriever

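This notebook demonstrates how to retrieve issues from a specific GitHub repository using the GitHub Search API. The search query used below is `is:issue is:closed linked:pr`; to restrict the results to bug reports, add the `label:bug` qualifier to the query in `fetch_github_issues` (giving `is:issue is:closed label:bug linked:pr`).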

In [None]:
!pip install langchain
!pip install openpyxl
!pip install openai
!pip install langchain-openai

In [9]:
# Import necessary libraries
import os
import re
from typing import Dict, Any, Set, Tuple

import requests
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser
from langchain_openai import ChatOpenAI
from openpyxl import Workbook, load_workbook

In [None]:
# Notebook configuration.
# Credentials are read from environment variables so they are never stored in
# the notebook; each value is None when the corresponding variable is unset.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')
GITHUB_API_URL = 'https://api.github.com'
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
# Excel workbook in which analyzed issues and visited pages are recorded.
ANALYZED_ISSUES_FILE = 'analyzed_issues.xlsx'

In [11]:
class AnalyzedIssuesStorage:
    """Excel-backed record of analyzed issues and visited search-result pages.

    The workbook at ``filename`` holds two sheets: ``Analyzed Issues`` (one row
    per saved issue) and ``Visited Pages`` (one row per owner/repo/page).
    Both sheets are mirrored in memory so lookups never touch the disk.

    Attributes:
        filename: Path of the workbook.
        analyzed_issues: ``{owner: {repo: [issue_id_as_str, ...]}}``.
        visited_pages: ``{owner: {repo: {page_number, ...}}}``.
    """

    _ISSUES_SHEET = 'Analyzed Issues'
    _ISSUES_HEADER = ['issue_id', 'repo_owner', 'repo_name', 'issue_description', 'issue_prompt',
                      'issue_url', 'issue_state', 'issue_files', 'issue_pr_url']
    _PAGES_SHEET = 'Visited Pages'
    _PAGES_HEADER = ['repo_owner', 'repo_name', 'page_number']

    def __init__(self, filename=ANALYZED_ISSUES_FILE):
        """Load the workbook at ``filename``, creating it if it does not exist."""
        self.filename = filename
        self.analyzed_issues = {}
        self.visited_pages = {}
        self._load()

    def _issues_sheet(self, workbook):
        """Return the issues sheet, looked up by name when possible.

        Falling back to ``workbook.active`` keeps workbooks whose first sheet
        was renamed readable; looking up by name first avoids reading the
        wrong sheet when the file was last saved with another sheet selected.
        """
        if self._ISSUES_SHEET in workbook.sheetnames:
            return workbook[self._ISSUES_SHEET]
        return workbook.active

    def _pages_sheet(self, workbook):
        """Return the 'Visited Pages' sheet, creating it (with header) if absent."""
        if self._PAGES_SHEET in workbook.sheetnames:
            return workbook[self._PAGES_SHEET]
        pages_sheet = workbook.create_sheet(self._PAGES_SHEET)
        pages_sheet.append(self._PAGES_HEADER)
        return pages_sheet

    def _load(self):
        """Populate the in-memory indexes from the workbook.

        A missing file is created with just the issues header. Any error while
        reading is printed and leaves both indexes empty.
        """
        if not os.path.exists(self.filename):
            workbook = Workbook()
            sheet = workbook.active
            sheet.title = self._ISSUES_SHEET
            sheet.append(self._ISSUES_HEADER)
            workbook.save(self.filename)
            return

        try:
            workbook = load_workbook(self.filename)

            analyzed = {}
            for row in self._issues_sheet(workbook).iter_rows(min_row=2, values_only=True):
                if not row[0]:
                    continue  # blank row
                issue_id, owner, repo = row[0], row[1], row[2]
                analyzed.setdefault(owner, {}).setdefault(repo, []).append(str(issue_id))

            pages_sheet_missing = self._PAGES_SHEET not in workbook.sheetnames
            pages_sheet = self._pages_sheet(workbook)
            if pages_sheet_missing:
                # Persist the freshly created sheet right away.
                workbook.save(self.filename)

            visited = {}
            for row in pages_sheet.iter_rows(min_row=2, values_only=True):
                if not row[0]:
                    continue  # blank row
                owner, repo, page = row[0], row[1], row[2]
                visited.setdefault(owner, {}).setdefault(repo, set()).add(int(page))
        except Exception as e:
            print(f"Error reading {self.filename}: {str(e)}")
            self.analyzed_issues = {}
            self.visited_pages = {}
            return

        self.analyzed_issues = analyzed
        self.visited_pages = visited

    def save_issue(self, owner: str, repo: str, issue_id: int, description: str, prompt: str, url: str, pr_url: str = "void"):
        """Append an issue row to the workbook and index it in memory.

        Args:
            owner: Repository owner.
            repo: Repository name.
            issue_id: Issue number (stored as a string).
            description: Text for the ``issue_description`` column.
            prompt: Text for the ``issue_prompt`` column.
            url: Text for the ``issue_url`` column.
            pr_url: Text for the ``issue_pr_url`` column.

        Returns:
            True if the row was written; False if the issue was already
            recorded or the workbook could not be updated.
        """
        if self.is_analyzed(owner, repo, issue_id):
            return False

        try:
            workbook = load_workbook(self.filename)
            self._issues_sheet(workbook).append([
                str(issue_id),
                owner,
                repo,
                description,
                prompt,
                url,
                "REVIEW",  # issue_state
                "",        # issue_files
                pr_url
            ])
            workbook.save(self.filename)
        except Exception as e:
            print(f"Error saving to {self.filename}: {str(e)}")
            return False

        # Index only after the write succeeded, so a failed save can be
        # retried instead of being rejected as a duplicate.
        self.analyzed_issues.setdefault(owner, {}).setdefault(repo, []).append(str(issue_id))
        return True

    def is_analyzed(self, owner: str, repo: str, issue_id: int) -> bool:
        """Return True if the issue is already present in the in-memory index."""
        return str(issue_id) in self.analyzed_issues.get(owner, {}).get(repo, [])

    def mark_page_as_visited(self, owner: str, repo: str, page: int):
        """Record a result page as visited, in the workbook and in memory.

        Does nothing if the page is already recorded. A write error is printed
        and leaves the page unrecorded so the call can be repeated.
        """
        if self.is_page_visited(owner, repo, page):
            return

        try:
            workbook = load_workbook(self.filename)
            self._pages_sheet(workbook).append([owner, repo, page])
            workbook.save(self.filename)
        except Exception as e:
            print(f"Error saving visited page to {self.filename}: {str(e)}")
            return

        self.visited_pages.setdefault(owner, {}).setdefault(repo, set()).add(page)

    def is_page_visited(self, owner: str, repo: str, page: int) -> bool:
        """Return True if the page is recorded as visited for this repository."""
        return page in self.visited_pages.get(owner, {}).get(repo, set())

    def get_visited_pages(self, owner: str, repo: str) -> Set[int]:
        """Return the set of visited page numbers (empty if none are recorded)."""
        return self.visited_pages.get(owner, {}).get(repo, set())

# Shared storage instance used by analyze_github_issues. Constructing it loads
# the workbook at ANALYZED_ISSUES_FILE, creating the file first if it is missing.
storage = AnalyzedIssuesStorage()

In [12]:
def fetch_github_issues(owner: str, repo: str, page: int = 1, per_page: int = 10, timeout: float = 30.0):
    """Fetch one page of closed issues with a linked PR via the GitHub search API.

    The search query is ``repo:{owner}/{repo} is:issue is:closed linked:pr``.

    Args:
        owner: Repository owner.
        repo: Repository name.
        page: 1-based page of search results to request.
        per_page: Number of results per page.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the search response.

    Raises:
        ValueError: If GITHUB_TOKEN is not set.
        requests.exceptions.RequestException: On network errors, timeouts, or
            a non-success HTTP status.
    """
    if not GITHUB_TOKEN:
        raise ValueError("GITHUB_TOKEN environment variable not set")

    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'
    }
    params = {
        'q': f'repo:{owner}/{repo} is:issue is:closed linked:pr',
        'page': page,
        'per_page': per_page
    }
    response = requests.get(
        f'{GITHUB_API_URL}/search/issues',
        headers=headers,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

In [13]:
def generate_issue_description_and_prompt(issue_data: Dict[str, Any], max_body_chars: int = 1000) -> Tuple[str, str]:
    """Ask the LLM for a short description of an issue and a prompt to solve it.

    Args:
        issue_data: Issue dictionary; only its 'title' and 'body' keys are read.
        max_body_chars: The issue body is truncated to this many characters
            (followed by "...") before being sent to the model.

    Returns:
        A ``(description, prompt)`` tuple. If anything fails while creating the
        model client, calling it, or parsing its answer, the error is printed
        and a generic pair built from the issue title is returned instead.
    """
    # Missing or null fields are treated as empty strings.
    title = issue_data.get('title') or ''
    body = issue_data.get('body', '') or ''
    if len(body) > max_body_chars:
        body = body[:max_body_chars] + "..."
    
    system_prompt = """
    You are an expert in analyzing technical issues in code repositories. 
    Your task is to analyze GitHub issues and provide:

    1. A concise description (maximum 5 lines) summarizing the problem stated in the issue.
    2. A well-structured prompt (maximum 10 lines) that could be used to request a solution to this issue from a language model. This prompt should request: identification of the files where the problem is located and a concise explanation of the error.
        

    Respond in JSON format with the keys "description" and "prompt".
    """

    user_prompt = f"""
    Analyze the following GitHub issue:

    Title: {title}

    Description:
    {body}

    Generate:
    1. A concise description (maximum 5 lines) summarizing the problem.
    2. An optimized prompt (maximum 10 lines) to request a solution for this issue.
    """
    
    try:
        # The client is created inside the try block so that configuration
        # problems (e.g. a missing API key) use the same fallback as a failed
        # request instead of aborting the caller.
        llm = ChatOpenAI(model="gpt-4o-mini", temperature=0, api_key=OPENAI_API_KEY)
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt)
        ]
        output_parser = JsonOutputParser()

        response = llm.invoke(messages)
        parsed_response = output_parser.invoke(response.content)
        return parsed_response.get("description", ""), parsed_response.get("prompt", "")
    except Exception as e:
        print(f"Error generating description and prompt: {str(e)}")
        description = f"Issue related to: {title}"
        prompt = f"¿How to solve the problem '{title}' in a GitHub repository"
        return description, prompt

In [14]:
# Seconds to wait on each GitHub request made while resolving a linked PR.
_LINKED_PR_TIMEOUT_SECONDS = 30
# Keywords that must precede "#<number>" in a PR body for it to count as closing the issue.
_CLOSING_KEYWORDS = ("closes", "fixes", "resolves")


def _mentions_issue(text, issue_number, keywords=()) -> bool:
    """Return True if ``text`` contains ``#<issue_number>`` as a complete number.

    With ``keywords``, the reference must directly follow one of them
    (case-insensitive). The trailing ``(?!\\d)`` stops issue 12 from matching
    a reference to #123.
    """
    pattern = rf"#{re.escape(str(issue_number))}(?!\d)"
    if keywords:
        alternatives = "|".join(re.escape(word) for word in keywords)
        pattern = rf"\b(?:{alternatives})\s+{pattern}"
    return re.search(pattern, text or "", re.IGNORECASE) is not None


def _source_pull_request(event):
    """Return the source issue dict of a timeline event if it is a pull request, else None."""
    source = event.get('source') or {}
    source_issue = source.get('issue') or {}
    # NOTE(review): GitHub may report source.type as "issue" even for pull
    # requests, in which case this check never matches - confirm against the
    # timeline API documentation.
    if source.get('type') == 'pull_request' and source_issue.get('pull_request'):
        return source_issue
    return None


def _first_pr_from_search(query, headers):
    """Run an issue search and return the first hit's html_url, or None if there is no hit."""
    response = requests.get(
        f'{GITHUB_API_URL}/search/issues',
        headers=headers,
        params={'q': query},
        timeout=_LINKED_PR_TIMEOUT_SECONDS,
    )
    if response.status_code != 200:
        return None
    result = response.json()
    items = result.get('items') or []
    if result.get('total_count', 0) > 0 and items:
        return items[0].get('html_url', "void")
    return None


def get_linked_pr_url(issue_data: Dict[str, Any], owner: str, repo: str, headers: Dict[str, str]) -> str:
    """
    Gets the URL of the first pull request linked to an issue.

    Lookup order:
      1. Timeline 'connected' events, and 'cross-referenced' events whose PR
         body contains "closes/fixes/resolves #<issue>".
      2. Timeline 'closed' events carrying a commit whose message mentions
         the issue; the PR is then searched for by commit SHA.
      3. A PR search for "Closes #<issue>", then "Fixes", then "Resolves".

    Only the first page of the timeline response is inspected.

    Args:
        issue_data: Issue data obtained from the GitHub API
        owner: Repository owner
        repo: Repository name
        headers: Headers for GitHub API requests

    Returns:
        URL of the first linked PR, or "void" if none is found, if the item is
        itself a pull request, if it has no number, or if any request fails
        (the error is printed).
    """
    issue_number = issue_data.get('number')

    if issue_number is None or "pull_request" in issue_data:
        return "void"

    try:
        timeline_url = f'{GITHUB_API_URL}/repos/{owner}/{repo}/issues/{issue_number}/timeline'
        timeline_headers = dict(headers)
        timeline_headers['Accept'] = 'application/vnd.github.mockingbird-preview+json'

        response = requests.get(timeline_url, headers=timeline_headers, timeout=_LINKED_PR_TIMEOUT_SECONDS)
        if response.status_code == 200:
            timeline = response.json()

            # Pass 1: PRs connected to, or cross-referencing, the issue.
            for event in timeline:
                event_kind = event.get('event')
                if event_kind not in ('connected', 'cross-referenced'):
                    continue
                pr_issue = _source_pull_request(event)
                if pr_issue is None:
                    continue
                if event_kind == 'connected':
                    return pr_issue.get('html_url', "void")

                # Cross-reference: accept it only if the PR body explicitly closes this issue.
                pr_api_url = f'{GITHUB_API_URL}/repos/{owner}/{repo}/pulls/{pr_issue.get("number")}'
                pr_response = requests.get(pr_api_url, headers=headers, timeout=_LINKED_PR_TIMEOUT_SECONDS)
                if pr_response.status_code == 200:
                    pr_body = pr_response.json().get('body', '') or ''
                    if _mentions_issue(pr_body, issue_number, _CLOSING_KEYWORDS):
                        return pr_issue.get('html_url', "void")

            # Pass 2: the issue was closed by a commit; find the PR containing it.
            for event in timeline:
                if event.get('event') != 'closed' or not event.get('commit_id'):
                    continue
                commit_sha = event.get('commit_id')
                commit_url = f'{GITHUB_API_URL}/repos/{owner}/{repo}/commits/{commit_sha}'
                commit_response = requests.get(commit_url, headers=headers, timeout=_LINKED_PR_TIMEOUT_SECONDS)
                if commit_response.status_code != 200:
                    continue
                commit_message = commit_response.json().get('commit', {}).get('message', '')
                if _mentions_issue(commit_message, issue_number):
                    found = _first_pr_from_search(f'repo:{owner}/{repo} is:pr {commit_sha}', headers)
                    if found is not None:
                        return found

        # Pass 3: fall back to searching PR text for a closing phrase.
        for keyword in ("Closes", "Fixes", "Resolves"):
            found = _first_pr_from_search(f'repo:{owner}/{repo} is:pr "{keyword} #{issue_number}"', headers)
            if found is not None:
                return found

    except Exception as e:
        print(f"Error getting linked PR for issue #{issue_number}: {str(e)}")

    return "void"

In [15]:
def analyze_github_issues(owner: str, repo: str, target_issues: int, per_page: int = 10,
                          max_consecutive_errors: int = 5) -> int:
    """
    Main function to analyze GitHub issues.

    Walks the search-result pages of the repository, skipping pages already
    recorded as visited and issues already stored. For every issue with a
    resolvable linked PR, a description and prompt are generated and the issue
    is saved. A page is recorded as visited only once all of its issues have
    been handled, so an interrupted page is retried on the next run.

    Args:
        owner: Repository owner
        repo: Repository name
        target_issues: Target number of issues to analyze
        per_page: Number of issues to retrieve per page
        max_consecutive_errors: Stop after this many pages in a row fail

    Returns:
        Number of issues analyzed

    Raises:
        ValueError: If GITHUB_TOKEN is not set.
    """
    if not GITHUB_TOKEN:
        raise ValueError("GITHUB_TOKEN environment variable not set")

    headers = {
        'Authorization': f'token {GITHUB_TOKEN}',
        'Accept': 'application/vnd.github.v3+json'
    }

    analyzed_count = 0
    consecutive_errors = 0
    # Start from the first page; visited pages are skipped in memory below, so
    # pages that failed on an earlier run are retried rather than jumped over.
    current_page = 1

    while analyzed_count < target_issues:
        if storage.is_page_visited(owner, repo, current_page):
            current_page += 1
            continue

        try:
            print(f"Fetching page {current_page} for {owner}/{repo}...")
            response_data = fetch_github_issues(owner, repo, current_page, per_page)
            issues = response_data.get('items', [])

            if not issues:
                print(f"No more issues found for {owner}/{repo}")
                break

            page_completed = True
            for position, issue in enumerate(issues):
                issue_number = issue.get('number')
                if not issue_number or storage.is_analyzed(owner, repo, issue_number):
                    continue

                linked_pr_url = get_linked_pr_url(issue, owner, repo, headers)

                if linked_pr_url == "void":
                    print(f"Skipping issue #{issue_number} - No linked PR found")
                    continue
                print(f"Linked PR URL for issue #{issue_number}: {linked_pr_url}")

                # Generated only for issues that will be saved, so skipped
                # issues do not cost an LLM call.
                description, prompt = generate_issue_description_and_prompt(issue)

                url = issue.get('html_url', '')
                saved = storage.save_issue(owner, repo, issue_number, description, prompt, url, linked_pr_url)

                if saved:
                    analyzed_count += 1
                    print(f"Analyzed issue #{issue_number} - Total: {analyzed_count}/{target_issues}")

                if analyzed_count >= target_issues:
                    # Issues after this position have not been looked at yet.
                    page_completed = position == len(issues) - 1
                    break

            if page_completed:
                storage.mark_page_as_visited(owner, repo, current_page)

            consecutive_errors = 0
            current_page += 1

        except Exception as e:
            print(f"Error processing page {current_page}: {str(e)}")
            consecutive_errors += 1
            current_page += 1

            # If we've had too many errors in a row, better stop
            if consecutive_errors >= max_consecutive_errors:
                print("Too many errors, stopping")
                break

    return analyzed_count

In [16]:
def main(owner: str = 'reactjs', repo: str = 'react-rails', target_issues: int = 10, per_page: int = 2):
    """Run the issue analysis for one repository and print a summary.

    Args:
        owner: Repository owner.
        repo: Repository name.
        target_issues: Number of issues to analyze before stopping.
        per_page: Number of search results requested per page.

    Prints an explanatory message and returns early when GITHUB_TOKEN is not
    set. Request errors raised by the analysis are printed, not propagated.
    """
    try:
        if not GITHUB_TOKEN:
            print("ERROR: GITHUB_TOKEN environment variable not set")
            print("Please set it with: export GITHUB_TOKEN=your_token_here")
            return

        # Run the analysis
        analyzed_count = analyze_github_issues(owner, repo, target_issues, per_page)
        print(f"Successfully analyzed {analyzed_count} issues from {owner}/{repo}")

    except requests.exceptions.RequestException as e:
        print(f'Error: {str(e)}')

# Run the main function
if __name__ == "__main__":
    main()

Fetching page 1 for reactjs/react-rails...
Linked PR URL for issue #1311: https://github.com/reactjs/react-rails/pull/1312
Analyzed issue #1311 - Total: 1/10
Linked PR URL for issue #1276: https://github.com/reactjs/react-rails/pull/1278
Analyzed issue #1276 - Total: 2/10
Fetching page 2 for reactjs/react-rails...
Linked PR URL for issue #1258: https://github.com/reactjs/react-rails/pull/1269
Analyzed issue #1258 - Total: 3/10
Linked PR URL for issue #1249: https://github.com/reactjs/react-rails/pull/1268
Analyzed issue #1249 - Total: 4/10
Fetching page 3 for reactjs/react-rails...
Linked PR URL for issue #1211: https://github.com/reactjs/react-rails/pull/1214
Analyzed issue #1211 - Total: 5/10
Linked PR URL for issue #1195: https://github.com/reactjs/react-rails/pull/1218
Analyzed issue #1195 - Total: 6/10
Fetching page 4 for reactjs/react-rails...
Linked PR URL for issue #1193: https://github.com/reactjs/react-rails/pull/1198
Analyzed issue #1193 - Total: 7/10
Linked PR URL for issue