<a href="https://colab.research.google.com/github/Farooqbasha008/Github-Repository-Analyer---An-Automated-Github-analysis-tool/blob/main/Github_Automated_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests
!pip install PyGithub
!pip install pygments
!pip install transformers
!pip install openai
!pip install textwrap

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyGithub
  Downloading PyGithub-1.58.2-py3-none-any.whl (312 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.5/312.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting deprecated (from PyGithub)
  Downloading Deprecated-1.2.14-py2.py3-none-any.whl (9.6 kB)
Collecting pyjwt[crypto]>=2.4.0 (from PyGithub)
  Downloading PyJWT-2.7.0-py3-none-any.whl (22 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyjwt, deprecated, pynacl, PyGithub
Successfully installed PyGithub-1.58.2 deprecated-1.2.14

# 2

In [20]:
import re
import requests
from github import Github
from github.GithubException import GithubException, UnknownObjectException
import nbformat
from pygments import lex
from pygments.lexers.python import PythonLexer
from pygments.token import Token
import openai
from collections import defaultdict
import textwrap
from IPython.display import display, Markdown , HTML


# Size limit shared by the helpers below: preprocess_code() keeps at most this
# many lexer tokens per source text, and fetch_code_snippets() uses the same
# number as the character width of each chunk it produces.
MAX_TOKENS = 4096

# In-memory cache keyed by chunk text, holding the complexity score already
# computed for that text so identical chunks are evaluated only once.
evaluation_cache = {}


def get_user_repositories(github_user_url):
    """Look up the repositories of the account named in a GitHub URL.

    Args:
        github_user_url: Text containing ``github.com/<username>``.

    Returns:
        Whatever ``get_repos()`` yields for that user on a ``Github`` client
        created without credentials.

    Raises:
        ValueError: If no ``github.com/<username>`` fragment is present.
    """
    found = re.search(r"github\.com/([A-Za-z0-9_-]+)", github_user_url)
    if not found:
        raise ValueError("Invalid GitHub URL")

    # The first capture group is the account name that follows the host.
    account = found.group(1)

    client = Github()
    return client.get_user(account).get_repos()


def preprocess_code(code):
    """Condense Python source into a compact, space-separated token string.

    The source is lexed with pygments' ``PythonLexer``. Comment tokens and
    whitespace-only tokens are dropped, token texts longer than 20 characters
    are cut to 17 characters plus ``"..."``, and the token list is capped at
    ``MAX_TOKENS`` entries (a trailing ``"..."`` marks the cut).

    Comments are removed by token type rather than by a ``#.*`` regex, so a
    ``#`` inside a string literal (a colour code, a URL fragment, ...) is
    left intact.

    Args:
        code: Python source text.

    Returns:
        The retained token texts joined by single spaces.
    """
    tokenized_code = []

    for token_type, token_value in lex(code, PythonLexer()):
        # Token.Comment covers every comment subtype the lexer emits.
        if token_type in Token.Comment:
            continue

        # Skip tokens that are empty or contain only whitespace.
        if not token_value.strip():
            continue

        # Shorten long token texts so one token cannot dominate the output.
        if len(token_value) > 20:
            token_value = token_value[:17] + "..."

        tokenized_code.append(token_value)

    # Cap the number of tokens and flag the truncation with an ellipsis.
    if len(tokenized_code) > MAX_TOKENS:
        tokenized_code = tokenized_code[:MAX_TOKENS]
        tokenized_code.append("...")

    return " ".join(tokenized_code)


def fetch_code_snippets(repository):
    """Collect preprocessed code from a repository's top-level files, in chunks.

    Only entries returned by ``repository.get_contents("")`` whose path ends in
    ``.py`` or ``.ipynb`` are considered. Each file is downloaded, run through
    ``preprocess_code`` (notebooks: every code cell separately), and the
    results are concatenated. The combined text is then sliced into pieces of
    at most ``MAX_TOKENS`` characters.

    Args:
        repository: Object exposing ``name`` and ``get_contents(path)``; each
            returned entry must expose ``path`` and ``download_url``.

    Returns:
        A list of ``(repository name, "chunk_<n>", chunk text, chunk length)``
        tuples, or an empty list when no code was found or the contents
        lookup raised ``UnknownObjectException``.
    """
    code_snippets = []

    try:
        contents = repository.get_contents("")

        combined_code = ""

        for content_file in contents:
            path = content_file.path
            if not path.endswith((".py", ".ipynb")):
                continue

            code_url = content_file.download_url
            if not code_url:
                # Entries without a raw download link cannot be fetched.
                continue

            # A timeout keeps one stalled download from hanging the whole run.
            response = requests.get(code_url, timeout=30)
            if not response.ok:
                # Do not score an HTTP error page as if it were source code.
                continue

            if path.endswith(".py"):
                combined_code += preprocess_code(response.text) + " "
            else:
                # Convert to notebook format 4 so ``nb.cells`` exists even for
                # notebooks saved in an older format.
                nb = nbformat.reads(response.text, as_version=4)
                for cell in nb.cells:
                    if cell.cell_type == "code":
                        combined_code += preprocess_code(cell.source) + " "

        if not combined_code:
            # Nothing to evaluate in this repository.
            return []

        # Slice the combined text into fixed-width chunks, numbered from 1.
        for index, start in enumerate(range(0, len(combined_code), MAX_TOKENS), start=1):
            chunk = combined_code[start:start + MAX_TOKENS]
            code_snippets.append((repository.name, f"chunk_{index}", chunk, len(chunk)))

    except UnknownObjectException:
        # Skip repositories whose contents cannot be listed.
        return []

    return code_snippets


def evaluate_code_complexity(code):
    """Ask the completion model for a complexity score and parse it.

    The prompt embeds ``code`` followed by ten evaluation criteria and asks
    for a score between 0.1 and 1. The request goes to the
    ``text-davinci-003`` engine through ``openai.Completion.create``.

    Args:
        code: Preprocessed code text to be rated.

    Returns:
        The first number that appears in the model's reply, as a float.

    Raises:
        ValueError: If the reply contains no number.
    """
    prompt = (
        f"Code: {code}\n\nEvaluate the complexity of the code snippet based on the following criteria:\n\n"
        "1. Code Length: Longer code tends to be more complex as it may contain more logic, branches, and dependencies.\n"
        "2. Control Flow: The complexity increases with the presence of nested loops, conditional statements, and complex branching logic.\n"
        "3. Function and Class Complexity: Functions or methods with a high number of lines, parameters, or local variables can be harder to understand and maintain.\n"
        "4. Code Duplication: Repeated code blocks increase complexity and make maintenance more difficult. Identifying and removing code duplication is essential.\n"
        "5. Code Coupling: High coupling, where modules or components depend heavily on each other, increases complexity. Lower coupling and better modularization lead to simpler code.\n"
        "6. Code Dependencies: Complex dependencies between modules or libraries can make code harder to understand, test, and maintain. Minimizing dependencies and using clear interfaces can help manage complexity.\n"
        "7. Code Documentation: The availability and quality of comments, documentation, and inline explanations impact code complexity. Well-documented code is generally easier to comprehend.\n"
        "8. Naming Conventions: Meaningful and consistent naming of variables, functions, and classes improves code readability and reduces complexity.\n"
        "9. Error Handling: Proper error handling, exception handling, and defensive programming techniques can help manage complexity when dealing with unexpected scenarios.\n"
        "10. Code Readability: Clear formatting, indentation, and code style guidelines contribute to code readability and reduce complexity.\n\n"
        "Please provide a complexity score between 0.1 and 1 for the code snippet based on the provided criteria."
    )

    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        max_tokens=128,
        n=1,
        stop=None,
        temperature=0.7,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )

    complexity_score_text = response.choices[0].text.strip()

    # Match the whole number ("0.7", ".7" or "1"). A capturing group inside
    # re.findall would return only the fractional part, which is an empty
    # string for integer replies.
    score_match = re.search(r"\d*\.\d+|\d+", complexity_score_text)
    if score_match is None:
        raise ValueError(
            f"No complexity score found in model response: {complexity_score_text!r}"
        )

    return float(score_match.group(0))


def evaluate_repository_complexity(repository):
    """Average the complexity scores of all code chunks in a repository.

    Chunks come from ``fetch_code_snippets``. Each chunk's text is scored with
    ``evaluate_code_complexity`` unless ``evaluation_cache`` already holds a
    score for that exact text; new scores are stored in the cache.

    Args:
        repository: Repository object accepted by ``fetch_code_snippets``.

    Returns:
        The mean of the chunk scores, or ``0`` when there are no chunks.
    """
    scores = []

    for entry in fetch_code_snippets(repository):
        chunk_text = entry[2]  # third field of the snippet tuple is the code

        # Only call the model for text that has not been scored before.
        if chunk_text not in evaluation_cache:
            evaluation_cache[chunk_text] = evaluate_code_complexity(chunk_text)

        scores.append(evaluation_cache[chunk_text])

    if not scores:
        return 0

    return sum(scores) / len(scores)


def analyze_github_user(github_user_url):
    """Score a user's repositories and report the most complex one.

    Repositories whose ``size`` is 0 are skipped. Every other repository is
    scored with ``evaluate_repository_complexity``; the scores are printed in
    descending order. For the top repository a short justification is
    requested from the ``text-davinci-003`` engine and printed, and a Markdown
    link to the repository is displayed.

    The link uses the repository's own ``html_url`` rather than joining the
    user-supplied URL with the repository name, so a trailing slash or a
    missing scheme in the input cannot produce a broken link.

    Args:
        github_user_url: GitHub profile URL accepted by
            ``get_user_repositories``.

    Returns:
        None. Results are printed and displayed.
    """
    repository_scores = {}
    repository_urls = {}

    for repository in get_user_repositories(github_user_url):
        if repository.size == 0:
            print(f"Skipping empty repository: {repository.name}")
            continue

        repository_name = repository.name
        repository_scores[repository_name] = evaluate_repository_complexity(repository)
        repository_urls[repository_name] = repository.html_url

    # Arrange the repositories in descending order of complexity.
    sorted_repositories = sorted(repository_scores.items(), key=lambda x: x[1], reverse=True)

    print("\nRepository Complexity Scores:")
    for repo, score in sorted_repositories:
        print(f"Repository: {repo}, Complexity Score: {score}")

    if not sorted_repositories:
        print("No repositories with code snippets found.")
        return

    most_complex_repo = sorted_repositories[0]
    print("\nMost Complex Repository:")
    print(f"Repository: {most_complex_repo[0]}, Complexity Score: {most_complex_repo[1]}")

    # Ask the model to justify the selection of the most complex repository.
    justification_prompt = f"Justify the selection of the most complex repository: {most_complex_repo[0]}."
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=justification_prompt,
        max_tokens=128,
        n=1,
        stop=None,
        temperature=0.7,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    justification = response.choices[0].text.strip()

    formatted_justification = "\n".join(textwrap.wrap(justification, width=125))
    print("\nJustification by the GPT:")
    print(formatted_justification)

    # Canonical page URL as reported by the API for the winning repository.
    most_complex_repo_url = repository_urls[most_complex_repo[0]]

    display(Markdown(f"[Click here to go to the Most Complex Repository of the user]({most_complex_repo_url})"))


# Usage
from getpass import getpass  # needed only by this entry-point snippet

github_user_url = input("Enter the GitHub user URL: ").strip()
# Read the key without echoing it, so it does not end up in the saved
# notebook output.
openai.api_key = getpass("Enter your OpenAI API key: ")

analyze_github_user(github_user_url)


SyntaxError: ignored

In [21]:

# HTML structure for the user interface.
# The template is an f-string that interpolates no Python values, so every
# literal brace needed by the embedded JavaScript is doubled ({{ and }}) and
# JavaScript template placeholders are written as ${{...}}.
# NOTE(review): analyzeUser() requests `/analyze?github_url=...` and expects a
# JSON reply; no handler for that endpoint is defined in the visible part of
# this notebook — confirm where it is served. The OpenAI key field is read
# into `openaiApiKey` but is not included in the request.
html = f'''
<!DOCTYPE html>
<html>
<head>
  <link rel="stylesheet" type="text/css" href="styles.css">
</head>
<body>
  <div class="container">
    <h1 class="title">GitHub Repository Analyzer</h1>
    <div class="input-field">
      <label for="github-url">Enter the GitHub user's URL:</label>
      <input id="github-url" type="text">
    </div>
    <div class="input-field">
      <label for="openai-api-key">Enter your OpenAI API key:</label>
      <input id="openai-api-key" type="text">
    </div>
    <button class="button" onclick="analyzeUser()">Analyze</button>
    <div class="results">
      <h2>Repository Complexity Scores:</h2>
      <ul id="complexity-scores"></ul>
      <h2>Most Complex Repository:</h2>
      <p id="most-complex-repo"></p>
      <h2>Justification by the GPT:</h2>
      <p id="justification"></p>
      <h2>Most Complex Repository URL:</h2>
      <p id="repo-url"></p>
    </div>
  </div>

  <script>
    function analyzeUser() {{
      var githubUrl = document.getElementById('github-url').value;
      var openaiApiKey = document.getElementById('openai-api-key').value;
      fetch(`/analyze?github_url=${{encodeURIComponent(githubUrl)}}`)
        .then(response => response.json())
        .then(data => displayResults(data));
    }}

    function displayResults(data) {{
      var scoresList = document.getElementById('complexity-scores');
      scoresList.innerHTML = '';
      data.repository_scores.forEach(score => {{
        var listItem = document.createElement('li');
        listItem.innerText = `Repository: ${{score.repository}}, Complexity Score: ${{score.score}}`;
        scoresList.appendChild(listItem);
      }});

      var mostComplexRepo = document.getElementById('most-complex-repo');
      mostComplexRepo.innerText = `Repository: ${{data.most_complex_repository.repository}}, Complexity Score: ${{data.most_complex_repository.score}}`;

      var justification = document.getElementById('justification');
      justification.innerText = data.justification;

      var repoUrl = document.getElementById('repo-url');
      repoUrl.innerHTML = `<a href="${{data.most_complex_repository_url}}" target="_blank">${{data.most_complex_repository_url}}</a>`;
    }}
  </script>
</body>
</html>
'''

# Render the template inline as this cell's rich output.
display(HTML(html))
