In [1]:
import requests
import csv
import os
import shutil
import git
import ast
import re
import subprocess
import pandas as pd


Order of a repository line : 
___________
'Name', 'Description', 'URL', 'Stars', 'Forks', 'Lines of Code'

READ AND WRITE CSV


In [2]:
def save_to_csv(repositories, filename):
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        fieldnames = ["name", "html_url", "lines_of_code"]
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        writer.writeheader()
        for repo in repositories:
            writer.writerow({
                "name": repo["name"],
                "html_url": repo["html_url"],
                "lines_of_code": get_lines_of_code(repo)
            })

def load_from_csv(filename):
    repositories = []
    with open(filename, mode="r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            repositories.append({
                "name": row["name"],
                "html_url": row["html_url"],
                "lines_of_code": int(row["lines_of_code"])
            })
    return repositories

def get_lines_of_code(repo):
    return repo["size"]

Peruse github

In [3]:
def search_repositories(query, min_lines_of_code, token=None):
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    # Set up the GitHub search API URL
    search_url = "https://api.github.com/search/repositories"
    params = {"q": query, "sort": "stars", "order": "desc"}

    # Make the API request
    response = requests.get(search_url, params=params, headers=headers)

    if response.status_code == 200:
        repositories = response.json()["items"]
        filtered_repos = [repo for repo in repositories if get_lines_of_code(repo) > min_lines_of_code]
        return filtered_repos
    else:
        print(f"Error: {response.status_code}")
        return []

Cloning And word search


In [4]:

def count_classes_with_keyword(file_path, keyword):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    tree = ast.parse(content)
    
    class_count = 0
    
    for node in ast.walk(tree):
        if isinstance(node, ast.ClassDef):
            if keyword.lower() in node.name.lower():
                class_count += 1
    
    return class_count

def clone_and_analyze_repositories(repositories, keyword):
    result = []

    for repo in repositories:
        repo_name = repo['Name']
        repo_url = repo['URL']
        repo_path = f"repos/{repo_name}"

        try:
            # Clone the repository
            git.Repo.clone_from(repo_url, repo_path)

            # Analyze the code
            total_classes_with_keyword = 0

            for root, dirs, files in os.walk(repo_path):
                for file in files:
                    if file.endswith('.py'):
                        file_path = os.path.join(root, file)
                        total_classes_with_keyword += count_classes_with_keyword(file_path, keyword)

            result.append({'Name': repo_name, 'ClassesWithKeyword': total_classes_with_keyword})

        except Exception as e:
            print(f"Error analyzing repository '{repo_name}': {e}")
        finally:
            None
            # Clean up: Remove the cloned repository
            # if os.path.exists(repo_path):
                # shutil.rmtree(repo_path)

    return result

def count_keyword_occurrences(repo, keyword):
    occurrences = 0
    # Get the repository's contents
    contents_url = repo["contents_url"].replace("{+path}", "")
    response = requests.get(contents_url)
    
    if response.status_code == 200:
        files = response.json()
        for file in files:
            if file["type"] == "file" and file["name"].endswith(".py"):
                # Download the Python file
                file_url = file["download_url"]
                file_content = requests.get(file_url).text

                # Count occurrences of the keyword using regular expressions
                occurrences += len(re.findall(fr'\b{keyword}\b', file_content))
    
    return occurrences

def count_word_all_occurrences(repo, word, token=None):
    occurrences = 0
    headers = {"Authorization": f"Bearer {token}"} if token else {}

    # Get the repository's contents
    contents_url = repo["contents_url"].replace("{+path}", "")
    response = requests.get(contents_url, headers=headers)

    if response.status_code == 200:
        files = response.json()
        for file in files:
            if file["type"] == "file":
                # Download the file
                file_url = file["download_url"]
                file_content = requests.get(file_url, headers=headers).text

                # Count occurrences of the word using regular expressions
                occurrences += len(re.findall(fr'\b{word}\b', file_content, flags=re.IGNORECASE))
    
    return occurrences

In [58]:
def clone_repo(repo):
        repo_url = repo['html_url'] + '.git'
        local_repo_path = os.path.join(os.getcwd() + '/repo', repo['name'])
        subprocess.run(['git', 'clone', repo_url, local_repo_path])
        return local_repo_path

def count_word_occurrences(local_repo_path, word):
    occurrences = 0

    # Iterate through all files in the local repository
    for root, dirs, files in os.walk(local_repo_path):
        for file in files:
            file_path = os.path.join(root, file)

            # Read the file content and count occurrences of the word
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                file_content = f.read()
                occurrences += len(re.findall(fr'\b{word}\b', file_content, flags=re.IGNORECASE))
    
    return occurrences

In [37]:
api_url = 'https://api.github.com/search/repositories'
techniques = []
metrics = []
csv_filename = 'repositories_py.csv'

queries = ['customer', 'devops', 'operation', 'service', 'satisfaction', 'feedback']
min_lines_of_code = 5000
token = None  # Replace with your GitHub token (optional)

repositories = []
for query in queries:
    repositories.extend(search_repositories(query, min_lines_of_code, token))


[79, 106, 65]
[{'id': 202291859, 'node_id': 'MDEwOlJlcG9zaXRvcnkyMDIyOTE4NTk=', 'name': 'chatwoot', 'full_name': 'chatwoot/chatwoot', 'private': False, 'owner': {'login': 'chatwoot', 'id': 23416667, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjIzNDE2NjY3', 'avatar_url': 'https://avatars.githubusercontent.com/u/23416667?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/chatwoot', 'html_url': 'https://github.com/chatwoot', 'followers_url': 'https://api.github.com/users/chatwoot/followers', 'following_url': 'https://api.github.com/users/chatwoot/following{/other_user}', 'gists_url': 'https://api.github.com/users/chatwoot/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/chatwoot/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/chatwoot/subscriptions', 'organizations_url': 'https://api.github.com/users/chatwoot/orgs', 'repos_url': 'https://api.github.com/users/chatwoot/repos', 'events_url': 'https://api.github.com/users/chatwoot/events{/privacy}', '

In [39]:
to_remove = []
for i in range(len(repositories)):
    for j in range(i+1, len(repositories)):
        if repositories[i]['name'] == repositories[j]['name']:
            to_remove.append(j)

print(to_remove)

for i in range(len(to_remove)):
    repositories.pop(to_remove[i] - i)

[]


In [43]:
save_to_csv(repositories, csv_filename)

print(repositories)

[{'id': 202291859, 'node_id': 'MDEwOlJlcG9zaXRvcnkyMDIyOTE4NTk=', 'name': 'chatwoot', 'full_name': 'chatwoot/chatwoot', 'private': False, 'owner': {'login': 'chatwoot', 'id': 23416667, 'node_id': 'MDEyOk9yZ2FuaXphdGlvbjIzNDE2NjY3', 'avatar_url': 'https://avatars.githubusercontent.com/u/23416667?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/chatwoot', 'html_url': 'https://github.com/chatwoot', 'followers_url': 'https://api.github.com/users/chatwoot/followers', 'following_url': 'https://api.github.com/users/chatwoot/following{/other_user}', 'gists_url': 'https://api.github.com/users/chatwoot/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/chatwoot/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/chatwoot/subscriptions', 'organizations_url': 'https://api.github.com/users/chatwoot/orgs', 'repos_url': 'https://api.github.com/users/chatwoot/repos', 'events_url': 'https://api.github.com/users/chatwoot/events{/privacy}', 'received_event

In [67]:
names = map(lambda x : x["name"], repositories)
techniques = ['bases testing', 'interview', 'questionnaire', 'survey', 'observation','theatre', 'prototype', 'incident report', 'developper as customers', 'customer pairing', 'walk through', 'ads', 'beta testing', 'operational data', 'event data', 'a/b testing', 'social network', 'crowd-funding']
metrics = ['acquisition', 'activation', 'retention', 'referral', 'revenue', 'net promoter score', 'customer effort score', 'customer satisfaction score']

metric_res = {'metrics': metrics}
techniques_res = {'techniques': techniques}

for repo in repositories:
    name = repo["name"]
    # metric_res[name] = []
    techniques_res[name] = []

    local_repo = clone_repo(repo)

    # metric_res[name] = count_word_occurrences(local_repo, metrics)
    # techniques_res[name] = count_word_occurrences(local_repo, techniques)

    # for metric in metrics:
        # metric_res[name].append(count_word_occurrences(local_repo, metric))

    for t in techniques:
        techniques_res[name].append(count_word_occurrences(local_repo, t))

technique_df = pd.DataFrame.from_dict(techniques_res)
metric_df = pd.DataFrame.from_dict(metric_res)

In [63]:
technique_df = pd.DataFrame.from_dict(techniques_res)
metric_df = pd.DataFrame.from_dict(metric_res)

In [64]:
technique_df.reset_index(inplace=True)
metric_df.reset_index(inplace=True)

technique_df.set_index('techniques', inplace=True)
metric_df.set_index('metrics', inplace=True)

print(technique_df)
print(metric_df)

technique_df.to_csv('techniques.csv')
metric_df.to_csv('metrics.csv')

                         index  chatwoot  liquid  hazelcast   twenty  \
techniques                                                             
bases testing                0   7779528  315970   23728292  3745517   
interview                    1   7779528  315970   23728292  3745517   
questionnaire                2   7779528  315970   23728292  3745517   
survey                       3   7779528  315970   23728292  3745517   
observation                  4   7779528  315970   23728292  3745517   
theatre                      5   7779528  315970   23728292  3745517   
prototype                    6   7779528  315970   23728292  3745517   
incident report              7   7779528  315970   23728292  3745517   
developper as customers      8   7779528  315970   23728292  3745517   
customer pairing             9   7779528  315970   23728292  3745517   
walk through                10   7779528  315970   23728292  3745517   
ads                         11   7779528  315970   23728292  374

In [65]:
def apply_normalisation(df, repos):
    normalized_df = df.copy()
    for repo in repos:
        name = repo["name"]
        normalized_df[name] = (df[name] / repo['size']) * 1000
    return normalized_df

def sort_and_remove_zeros(row):
    sorted_row = sorted(row, key=lambda x: x, reverse=True)
    return list(filter(lambda x: x > 0, sorted_row))

def find_non_zero_rows(row):
    return row.astype(bool).sum()

print(apply_normalisation(technique_df, repositories))

non_zero_technique = technique_df.apply(find_non_zero_rows, axis=1)
non_zero_metrics = metric_df.apply(find_non_zero_rows, axis=1)



                         index      chatwoot        liquid     hazelcast  \
techniques                                                                 
bases testing                0  56708.299012  55844.821492  59597.710359   
interview                    1  56708.299012  55844.821492  59597.710359   
questionnaire                2  56708.299012  55844.821492  59597.710359   
survey                       3  56708.299012  55844.821492  59597.710359   
observation                  4  56708.299012  55844.821492  59597.710359   
theatre                      5  56708.299012  55844.821492  59597.710359   
prototype                    6  56708.299012  55844.821492  59597.710359   
incident report              7  56708.299012  55844.821492  59597.710359   
developper as customers      8  56708.299012  55844.821492  59597.710359   
customer pairing             9  56708.299012  55844.821492  59597.710359   
walk through                10  56708.299012  55844.821492  59597.710359   
ads         

In [66]:
print((non_zero_metrics-1) / 111 * 100)


metrics
acquisition                    18.018018
activation                     39.639640
retention                      40.540541
referral                       20.720721
revenue                        16.216216
net promoter score              0.000000
customer effort score           0.000000
customer satisfaction score     0.900901
dtype: float64
