In [41]:
import requests
import csv
import os
import shutil
import git
import ast
import re
import subprocess
import pandas as pd


Column order of a repository row:
___________
'Name', 'Description', 'URL', 'Stars', 'Forks', 'Lines of Code'

READ AND WRITE CSV


In [42]:
def save_to_csv(repositories, filename):
    """Write one CSV row per repository to ``filename``.

    The file is overwritten. It starts with a header row and has the columns
    ``name``, ``html_url`` and ``lines_of_code``; the last column holds
    whatever ``get_lines_of_code`` returns for the repository.

    Args:
        repositories: Iterable of mappings providing ``"name"`` and
            ``"html_url"`` (plus whatever ``get_lines_of_code`` reads).
        filename: Path of the CSV file to create.
    """
    columns = ["name", "html_url", "lines_of_code"]

    with open(filename, mode="w", newline="", encoding="utf-8") as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        out.writeheader()

        for entry in repositories:
            record = {
                "name": entry["name"],
                "html_url": entry["html_url"],
                "lines_of_code": get_lines_of_code(entry),
            }
            out.writerow(record)

def load_from_csv(filename):
    """Read repositories back from a CSV file with a header row.

    The file is opened with ``newline=""`` so that the ``csv`` module, not the
    text layer, decides how line endings are handled; otherwise newlines
    embedded in quoted fields are altered while reading.

    Args:
        filename: Path of a UTF-8 CSV file with at least the columns
            ``name``, ``html_url`` and ``lines_of_code``.

    Returns:
        A list of dicts with the keys ``"name"``, ``"html_url"`` and
        ``"lines_of_code"`` (the latter converted to ``int``), in file order.

    Raises:
        KeyError: If one of the three columns is missing.
        ValueError: If a ``lines_of_code`` cell is not an integer literal.
    """
    with open(filename, mode="r", newline="", encoding="utf-8") as file:
        return [
            {
                "name": row["name"],
                "html_url": row["html_url"],
                "lines_of_code": int(row["lines_of_code"]),
            }
            for row in csv.DictReader(file)
        ]

def get_lines_of_code(repo):
    """Return the value stored under the ``"size"`` key of ``repo``.

    NOTE(review): callers treat this number as a line count. When ``repo`` is
    a GitHub API repository object, ``size`` is presumably the repository size
    reported by the API rather than a count of lines - confirm the units.

    Raises:
        KeyError: If ``repo`` has no ``"size"`` entry.
    """
    size = repo["size"]
    return size

Search GitHub

In [43]:
def search_repositories(query, min_lines_of_code, token=None, timeout=30):
    """Search GitHub repositories and keep those above a size threshold.

    A single request is sent to the repository search endpoint, sorted by
    stars in descending order. No paging parameters are sent, so only the
    items contained in that one response are considered.

    Args:
        query: Search expression passed as the ``q`` parameter.
        min_lines_of_code: Repositories are kept only when
            ``get_lines_of_code(repo)`` is strictly greater than this value.
        token: Optional token sent as a ``Bearer`` authorization header.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without it a stalled connection blocks forever.

    Returns:
        The filtered list of repository dicts taken from the response's
        ``"items"``; an empty list when the status code is not 200 (the
        status code is printed in that case).
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    # Set up the GitHub search API URL
    search_url = "https://api.github.com/search/repositories"
    params = {"q": query, "sort": "stars", "order": "desc"}

    # Make the API request; the timeout keeps a hung connection from
    # freezing the notebook indefinitely.
    response = requests.get(search_url, params=params, headers=headers, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    return [
        repo
        for repo in response.json()["items"]
        if get_lines_of_code(repo) > min_lines_of_code
    ]

Cloning and word search


In [44]:

def count_classes_with_keyword(file_path, keyword):
    """Count class definitions in a Python file whose name contains ``keyword``.

    The file is read as UTF-8 and parsed with ``ast``. Every ``class``
    statement found while walking the tree is considered, whatever its
    nesting depth, and the name test is a case-insensitive substring match.

    Args:
        file_path: Path of the Python source file to inspect.
        keyword: Text looked for inside each class name.

    Returns:
        The number of matching class definitions.
    """
    with open(file_path, 'r', encoding='utf-8') as source_file:
        source_text = source_file.read()

    module_ast = ast.parse(source_text)

    return sum(
        1
        for candidate in ast.walk(module_ast)
        if isinstance(candidate, ast.ClassDef)
        and keyword.lower() in candidate.name.lower()
    )

def clone_and_analyze_repositories(repositories, keyword):
    """Clone each repository and count classes whose name contains ``keyword``.

    Every repository is cloned with GitPython into ``repos/<name>`` and all
    ``.py`` files below that directory are passed to
    ``count_classes_with_keyword``. The clone is left on disk afterwards.

    Args:
        repositories: Iterable of mappings describing a repository. Both the
            ``'Name'``/``'URL'`` keys and the ``'name'``/``'html_url'`` keys
            used by the other helpers in this notebook are accepted.
        keyword: Text looked for inside class names.

    Returns:
        A list of ``{'Name': ..., 'ClassesWithKeyword': ...}`` dicts, one per
        repository that could be cloned and walked. Repositories whose clone
        or traversal fails are reported on stdout and left out.

    Raises:
        KeyError: If a repository provides neither spelling of a required key.
    """
    result = []

    for repo in repositories:
        # The rest of the notebook produces lower-case GitHub API keys, so
        # fall back to those when the capitalised ones are absent.
        repo_name = repo['Name'] if 'Name' in repo else repo['name']
        repo_url = repo['URL'] if 'URL' in repo else repo['html_url']
        repo_path = f"repos/{repo_name}"

        try:
            # Clone the repository
            git.Repo.clone_from(repo_url, repo_path)

            # Analyze the code
            total_classes_with_keyword = 0

            for root, _dirs, files in os.walk(repo_path):
                for file in files:
                    if not file.endswith('.py'):
                        continue
                    file_path = os.path.join(root, file)
                    try:
                        total_classes_with_keyword += count_classes_with_keyword(file_path, keyword)
                    except (SyntaxError, ValueError) as e:
                        # A file that cannot be decoded or parsed (ValueError
                        # covers UnicodeDecodeError) should not discard the
                        # counts gathered from the rest of the repository.
                        print(f"Skipping unparsable file '{file_path}': {e}")

            result.append({'Name': repo_name, 'ClassesWithKeyword': total_classes_with_keyword})

        except Exception as e:
            print(f"Error analyzing repository '{repo_name}': {e}")

    return result

def count_keyword_occurrences(repo, keyword):
    """Count whole-word, case-sensitive occurrences of ``keyword`` in ``.py`` files.

    The listing behind ``repo["contents_url"]`` (with its ``{+path}``
    placeholder removed) is fetched, and every entry of type ``"file"`` whose
    name ends in ``.py`` is downloaded and searched. Entries of any other type
    are ignored, so nothing below that listing is inspected.

    Args:
        repo: Mapping with a ``"contents_url"`` entry.
        keyword: Literal text to look for. It is escaped before being placed
            in the regular expression, so characters such as ``.``, ``+`` or
            ``(`` match themselves instead of acting as regex operators.

    Returns:
        The total number of matches; ``0`` when the listing request does not
        return status 200.
    """
    # Build the pattern once; escaping keeps the keyword a literal.
    pattern = re.compile(fr'\b{re.escape(str(keyword))}\b')

    # Get the repository's contents
    contents_url = repo["contents_url"].replace("{+path}", "")
    response = requests.get(contents_url)

    if response.status_code != 200:
        return 0

    occurrences = 0
    for file in response.json():
        if file["type"] == "file" and file["name"].endswith(".py"):
            # Download the Python file
            file_content = requests.get(file["download_url"]).text
            occurrences += len(pattern.findall(file_content))

    return occurrences

def count_word_all_occurrences(repo, word, token=None):
    """Count whole-word, case-insensitive occurrences of ``word`` in listed files.

    The listing behind ``repo["contents_url"]`` (with its ``{+path}``
    placeholder removed) is fetched, and every entry of type ``"file"`` is
    downloaded and searched. Entries of any other type are ignored, so nothing
    below that listing is inspected.

    Args:
        repo: Mapping with a ``"contents_url"`` entry.
        word: Literal text to look for. It is escaped before being placed in
            the regular expression, so characters such as ``.``, ``+`` or
            ``(`` match themselves instead of acting as regex operators.
        token: Optional token sent as a ``Bearer`` authorization header with
            both the listing request and each file download.

    Returns:
        The total number of matches; ``0`` when the listing request does not
        return status 200.
    """
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    # Build the pattern once; escaping keeps the word a literal.
    pattern = re.compile(fr'\b{re.escape(str(word))}\b', flags=re.IGNORECASE)

    # Get the repository's contents
    contents_url = repo["contents_url"].replace("{+path}", "")
    response = requests.get(contents_url, headers=headers)

    if response.status_code != 200:
        return 0

    occurrences = 0
    for file in response.json():
        if file["type"] == "file":
            # Download the file
            file_content = requests.get(file["download_url"], headers=headers).text
            occurrences += len(pattern.findall(file_content))

    return occurrences

In [45]:
def clone_repo(repo):
    """Clone ``repo`` into the current working directory with the ``git`` CLI.

    The remote is ``repo['html_url']`` with ``.git`` appended and the
    destination is ``<cwd>/<repo['name']>``. The exit status of ``git`` is not
    inspected, so the destination path is returned whether or not the clone
    succeeded.

    Args:
        repo: Mapping with ``'html_url'`` and ``'name'`` entries.

    Returns:
        The local path the repository was (to be) cloned into.
    """
    remote = repo['html_url'] + '.git'
    destination = os.path.join(os.getcwd(), repo['name'])

    command = ['git', 'clone', remote, destination]
    subprocess.run(command)

    return destination

def count_word_occurrences(local_repo_path, word):
    """Count whole-word, case-insensitive occurrences of ``word`` under a directory.

    Every file found by walking ``local_repo_path`` is read as UTF-8 with
    undecodable bytes dropped, and searched for ``word`` delimited by word
    boundaries.

    Args:
        local_repo_path: Root directory to walk. A path that does not exist
            yields no files and therefore a count of ``0``.
        word: Literal text to look for. It is escaped before being placed in
            the regular expression, so characters such as ``.``, ``+`` or
            ``(`` match themselves instead of acting as regex operators.

    Returns:
        The total number of matches across all files.
    """
    # Compile once instead of once per file; escaping keeps the word a literal.
    pattern = re.compile(fr'\b{re.escape(str(word))}\b', flags=re.IGNORECASE)
    occurrences = 0

    # Iterate through all files in the local repository
    for root, _dirs, files in os.walk(local_repo_path):
        for file in files:
            file_path = os.path.join(root, file)

            # errors='ignore' lets binary files be scanned without raising.
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                occurrences += len(pattern.findall(f.read()))

    return occurrences

In [50]:
# Search configuration: what to look for and where to store the result.
api_url = 'https://api.github.com/search/repositories'
techniques = []
metrics = []
csv_filename = 'repositories_py.csv'

query = 'devops' #"devops language:python"
min_lines_of_code = 10000
# Optional GitHub token. It is read from the GITHUB_TOKEN environment variable
# so the secret never has to be pasted into (and saved with) the notebook;
# when the variable is unset or empty the search runs unauthenticated.
token = os.environ.get('GITHUB_TOKEN') or None

# Run the search and persist the matching repositories.
repositories = search_repositories(query, min_lines_of_code, token)
save_to_csv(repositories, csv_filename)



In [47]:
names = map(lambda x : x["name"], repositories)

# Search terms. Each term is looked up verbatim (whole word, case-insensitive).
# NOTE(review): 'referal' and 'developper' look misspelled ('referral',
# 'developer'); as written they are searched for exactly as spelled - confirm
# whether that is intended before changing the labels.
techniques = ['testing', 'interview', 'questionnaire', 'survey', 'observation','theatre', 'prototype', 'incident report', 'developper as customers', 'customer pairing', 'walkthrough', 'ads', 'beta', 'operational', 'a/b', 'social network', 'crowd-funding']
metrics = ['acquisition', 'activation', 'retention', 'referal', 'revenue']

# Column-oriented results: the first column holds the search terms, then one
# column per repository with the counts in the same order as the terms.
metric_res = {'metrics': metrics}
techniques_res = {'techniques': techniques}

for repo in repositories:
    name = repo["name"]
    local_repo = clone_repo(repo)

    # Count inside the clone that was just made. The previous code referenced
    # an undefined name (`local`) here and failed with NameError.
    metric_res[name] = [count_word_occurrences(local_repo, metric) for metric in metrics]
    techniques_res[name] = [count_word_occurrences(local_repo, t) for t in techniques]

technique_df = pd.DataFrame.from_dict(techniques_res)
metric_df = pd.DataFrame.from_dict(metric_res)

In [48]:
# Print the plain-text rendering of the techniques table.
# NOTE(review): the saved output below has a 'metrics' column, which
# technique_df (built from techniques_res) would not contain - it looks like
# it came from a different run or frame; re-run the notebook to refresh it.
print(technique_df)


       metrics  free-for-dev  netdata  act  gitea  kong  sentry  cli  dokku  \
0  acquisition             0        0    0      0     0       0    0      0   
1   activation             0        0    0      0     0       0    0      0   
2    retention             0        0    0      0     0       0    0      0   
3      referal             0        0    0      0     0       0    0      0   
4      revenue             0        0    0      0     0       0    0      0   

   90DaysOfDevOps  ...  watchtower  wtf  argo-cd  sops  kubesphere  apisix  \
0               0  ...           0    0        0     0           0       0   
1               0  ...           0    0        0     0           0       0   
2               0  ...           0    0        0     0           0       0   
3               0  ...           0    0        0     0           0       0   
4               0  ...           0    0        0     0           0       0   

   onedev  walle-web  lynis  kubeshark  
0       0      