In [2]:
!pip install pydriller
import csv
from pydriller import Repository
from pydriller.metrics.process.lines_count import LinesCount
from collections import defaultdict
import requests
import lxml.html as lx
from datetime import datetime, timedelta
import time
import pandas as pd
import requests
from urllib.parse import urlparse
import random

def _parse_abbreviated_count(text):
    """Turn a counter label such as ``"987"``, ``"1,234"`` or ``"1.2k"`` into an int.

    Surrounding whitespace and thousands separators are ignored. When the label
    cannot be interpreted (including ``None``), the original value is returned
    unchanged so the caller can still record what the page showed.
    """
    if text is None:
        return text
    cleaned = text.strip().replace(',', '')
    try:
        return int(cleaned)
    except ValueError:
        pass
    if cleaned[-1:].lower() == 'k':
        try:
            # round() rather than int() so float noise cannot truncate the value.
            return int(round(float(cleaned[:-1]) * 1000))
        except ValueError:
            pass
    return text


def _leading_count(html, query):
    """Return the leading integer of the first element matching ``query``, or 0 if none match."""
    matches = html.xpath(query)
    if not matches:
        return 0
    return int(matches[0].text_content().split()[0].replace(',', ''))


def git_request_stats(git_repo_url, timeout=30):
    """Scrape pull-request, fork, star and issue counters from a repository web page.

    The repository page is fetched first; the ``/pulls`` and ``/issues`` sub-pages
    are only requested when the corresponding tab link is present on that page.

    Args:
        git_repo_url: Base URL of the repository page.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        Tuple ``(open_pr_count, merged_pr_count, forks, stars, open_issues_count,
        closed_issues_count)``. The PR and issue counts default to 0 when their
        tab or table-state link is not found. ``merged_pr_count`` is read from the
        "Closed" pull-request table-state link. ``forks`` and ``stars`` are ints
        when the label can be parsed, otherwise the raw label text.

    Raises:
        requests.HTTPError: If the repository page returns an error status.
        IndexError: If the fork or star counter element is missing from the page.
    """
    response = requests.get(git_repo_url, timeout=timeout)
    # Fail with the real HTTP error instead of an opaque IndexError further down.
    response.raise_for_status()
    html = lx.fromstring(response.text)

    pr_tab = html.xpath('//a[@id="pull-requests-tab"]')
    fork_elements = html.xpath('//*[@id="repo-network-counter"]')
    star_elements = html.xpath('//*[@id="repo-stars-counter-star"]')
    issues_tab = html.xpath('//a[@id="issues-tab"]')
    if not fork_elements or not star_elements:
        raise IndexError(f"fork/star counters not found on {git_repo_url}")

    open_pr_count = merged_pr_count = open_issues_count = closed_issues_count = 0

    # Pull requests: the sub-page is parsed best-effort; missing links leave the count at 0.
    if pr_tab:
        pulls_url = git_repo_url + "/pulls"
        pulls_html = lx.fromstring(requests.get(pulls_url, timeout=timeout).text)
        open_pr_count = _leading_count(
            pulls_html, '//a[@data-ga-click="Pull Requests, Table state, Open"]')
        merged_pr_count = _leading_count(
            pulls_html, '//a[@data-ga-click="Pull Requests, Table state, Closed"]')

    # Forks and stars are shown as abbreviated labels (e.g. "1.2k").
    forks = _parse_abbreviated_count(fork_elements[0].text)
    stars = _parse_abbreviated_count(star_elements[0].text)

    # Issues: same best-effort handling as pull requests.
    if issues_tab:
        issues_url = git_repo_url + "/issues"
        issues_html = lx.fromstring(requests.get(issues_url, timeout=timeout).text)
        open_issues_count = _leading_count(
            issues_html, '//a[@data-ga-click="Issues, Table state, Open"]')
        closed_issues_count = _leading_count(
            issues_html, '//a[@data-ga-click="Issues, Table state, Closed"]')

    return open_pr_count, merged_pr_count, forks, stars, open_issues_count, closed_issues_count

def release_frequency(github_repo_url):
    """Compute how often the repository publishes releases.

    Release commits are collected with ``Repository(..., only_releases=True)``.
    The rate is the number of release commits divided by the time between the
    earliest and latest release, expressed per 30-day month and per 365-day year.

    Args:
        github_repo_url: Repository location passed straight to ``Repository``.

    Returns:
        Tuple ``(releases_per_month, releases_per_year)``, each rounded to two
        decimals. ``(0, 0)`` is returned when there are fewer than two releases
        or when all releases fall within the same day.
    """
    release_dates = [
        commit.committer_date
        for commit in Repository(github_repo_url, only_releases=True).traverse_commits()
    ]

    number_of_releases = len(release_dates)
    if number_of_releases < 2:
        return 0, 0

    # Use the real earliest/latest dates: committer dates are not guaranteed to
    # be monotonic in traversal order, and a negative span would yield a
    # negative frequency.
    number_of_days = (max(release_dates) - min(release_dates)).days
    if number_of_days <= 0:
        # Every release happened within one day; a rate is undefined.
        return 0, 0

    release_frequency_months = round(number_of_releases / (number_of_days / 30), 2)
    release_frequency_years = round(number_of_releases / (number_of_days / 365), 2)
    return release_frequency_months, release_frequency_years


def _is_counted_js_file(path):
    """Return True for ``.js`` paths that are not tests/specs or vendored/build output."""
    if path is None or len(path) <= 3 or not path.endswith(".js"):
        return False
    # Normalise separators so the directory filter works on both Windows-style
    # and POSIX-style paths.
    normalized = path.replace("\\", "/")
    # "test" also covers "tests", ".test.js" and "test/"; "spec" covers ".spec.js".
    if "test" in normalized or "spec" in normalized:
        return False
    return not any(
        directory in normalized
        for directory in ("node_modules/", "public/", "build/")
    )


def get_loc(github_repo_url, commit_dates):
    """Estimate net JavaScript lines of code plus total churn for a repository.

    ``LinesCount`` is queried over the range spanned by ``commit_dates``. Net
    lines per file are ``added - removed``; only ``.js`` files that are not
    tests/specs and not under ``node_modules``, ``public`` or ``build`` are
    summed into the LOC figure.

    Args:
        github_repo_url: Repository location passed to ``LinesCount``.
        commit_dates: Commit datetimes of the repository; may be empty.

    Returns:
        Tuple ``(filtered_js_loc, total_lines_added, total_lines_deleted)``.
        ``(0, 0, 0)`` when ``commit_dates`` is empty, so callers can always
        unpack three values.
    """
    if not commit_dates:
        return 0, 0, 0

    # min/max instead of first/last so the window always covers every commit.
    lines_count = LinesCount(
        path_to_repo=github_repo_url,
        since=min(commit_dates),
        to=max(commit_dates),
    )
    total_added = lines_count.count_added()
    total_deleted = lines_count.count_removed()

    filtered_loc = sum(
        added - total_deleted.get(path, 0)
        for path, added in total_added.items()
        if _is_counted_js_file(path)
    )
    return filtered_loc, sum(total_added.values()), sum(total_deleted.values())


def get_lines_added_deleted_last_one_and_half_years(github_repo_url):
    """Total lines added and removed across all files in roughly the last 18 months.

    The window starts at midnight of the day 547 days before now (1.5 years at
    365 days per year) and ends at the current time.

    Returns:
        Tuple ``(lines_added, lines_deleted)`` summed over every file reported
        by ``LinesCount``.
    """
    now = datetime.now()
    # Truncate the start of the window to the beginning of that calendar day.
    window_start = datetime.combine((now - timedelta(days=547)).date(), datetime.min.time())

    churn = LinesCount(path_to_repo=github_repo_url, since=window_start, to=now)
    added_by_file = churn.count_added()
    removed_by_file = churn.count_removed()

    return sum(added_by_file.values()), sum(removed_by_file.values())

def is_readme_updated_in_last_one_and_half_years(github_repo_url):
    """Report whether ``readme.md`` changed in roughly the last 18 months.

    ``LinesCount`` is queried from midnight of the day 547 days ago until now.
    A path counts as the README when it equals ``readme.md`` ignoring case, so
    only a README at the repository root is considered.

    Args:
        github_repo_url: Repository location passed to ``LinesCount``.

    Returns:
        ``True`` if any lines were added to or removed from the README in the
        window, ``False`` otherwise (including when no README is reported).
    """
    current_date = datetime.now()
    one_and_a_half_years_ago = current_date - timedelta(days=547)  # 1.5 years at 365 days/year
    # Truncate to the start of that calendar day.
    window_start = datetime(
        one_and_a_half_years_ago.year,
        one_and_a_half_years_ago.month,
        one_and_a_half_years_ago.day,
    )

    lines_count = LinesCount(path_to_repo=github_repo_url, since=window_start, to=current_date)

    # Sum explicitly instead of lowercasing keys into a new dict: that way
    # case-variant duplicates are not overwritten, and a README present in only
    # one of the two mappings is still detected without relying on an exception.
    readme_churn = 0
    for per_file in (lines_count.count_added(), lines_count.count_removed()):
        for path, changed_lines in per_file.items():
            if path is not None and path.lower() == 'readme.md':
                readme_churn += changed_lines

    return readme_churn != 0


def analyze_repository(url):
    """Collect commit, contributor, GitHub and churn statistics for one repository.

    Walks every commit via ``Repository(url)``, then calls the sibling helper
    functions for the web-scraped counters, release frequency, lines of code
    and recent churn.

    Returns:
        An 18-tuple ``(total_commits, avg_commits_per_day,
        last_commit_before_months, unique_contributors, forks, stars,
        open_prs, merged_prs, open_issues, resolved_issues,
        release_frequency_months, release_frequency_years, lines_of_code,
        total_added, total_deleted, lines_added_recent, lines_deleted_recent,
        readme_updated)``. If anything raises, the error is printed and a tuple
        of 18 ``None`` values is returned instead.
    """
    commit_dates = []
    contributors = set()

    try:
        for commit in Repository(url).traverse_commits():
            commit_dates.append(commit.committer_date)
            contributors.add(commit.author.email)

        total_commits = len(commit_dates)
        avg_commits_per_day = 0
        last_commit_before_months = None
        unique_contributors = 0

        if total_commits > 0:
            oldest = min(commit_dates)
            newest = max(commit_dates)
            # "now" takes the newest commit's tzinfo so the subtraction is valid.
            age = datetime.now(newest.tzinfo) - newest
            last_commit_before_months = age.days // 30
            unique_contributors = len(contributors)
            # Inclusive day span, so a single-day history counts as one day.
            active_days = (newest - oldest).days + 1
            if active_days > 0:
                avg_commits_per_day = round((total_commits / active_days), 2)
            else:
                avg_commits_per_day = 0

        github_stats = git_request_stats(url)
        open_prs_count, merged_prs_count, forks, stars, open_issues, resolved_issues = github_stats
        releases_per_month, releases_per_year = release_frequency(url)
        lines_of_codes, total_added, total_deleted = get_loc(url, commit_dates)
        recent_added, recent_deleted = get_lines_added_deleted_last_one_and_half_years(url)
        readme_updated = is_readme_updated_in_last_one_and_half_years(url)

        return (
            total_commits,
            avg_commits_per_day,
            last_commit_before_months,
            unique_contributors,
            forks,
            stars,
            open_prs_count,
            merged_prs_count,
            open_issues,
            resolved_issues,
            releases_per_month,
            releases_per_year,
            lines_of_codes,
            total_added,
            total_deleted,
            recent_added,
            recent_deleted,
            readme_updated,
        )
    except Exception as e:
        print(f"Error analyzing repository {url}: {e}")
        return (None,) * 18

def fetch_and_analyze_repositories(git_repos, output_csv):
    """Analyze each distinct repository URL and write the results to a CSV file.

    Falsy entries and URLs already seen are skipped with a message. A
    repository whose analysis failed (first result field is ``None``) is
    remembered as seen but produces no row.

    Args:
        git_repos: Iterable of repository URLs; may contain empty values and
            duplicates.
        output_csv: Path of the CSV file to create or overwrite.
    """
    fieldnames = ['git_repository', 'total_commits', 'avg_commits_per_day', 'last_commit_before_months', 'git_unique_contributors', 'github_forks', 'github_stars', 'open_PRs', 'merged_PRs', 'open_issues', 'resolved_issues', 'version_release_frequency_months', 'version_release_frequency_years', 'LOC', 'total_lines_added', 'total_lines_deleted', 'lines_added_one_and_half_year', 'lines_deleted_one_and_half_year', 'readme_updated']
    metric_count = len(fieldnames) - 1  # every column except the URL itself

    seen_urls = set()
    rows = []

    for repo_url in git_repos:
        if not repo_url:
            print("Skipping repository: value not present")
            continue
        if repo_url in seen_urls:
            print(f"Skipping duplicate repository: {repo_url}")
            continue

        print(f"Analyzing repository: {repo_url}")
        result = analyze_repository(repo_url)
        if result[0] is not None:
            # Pair the URL and the positional metrics with their column names.
            values = [repo_url] + [result[position] for position in range(metric_count)]
            rows.append(dict(zip(fieldnames, values)))
        seen_urls.add(repo_url)

    with open(output_csv, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


# Driver: read repository URLs from the input CSV, pick one batch, and analyze it.
input_file = '/kaggle/input/npm-samples/name_git_repo_nik.csv'
with open(input_file, 'r', newline='') as file:
    # csv.DictReader consumes the header line itself to obtain the field names,
    # so every row it yields is data. Skipping one more row here would silently
    # drop the first repository.
    reader = csv.DictReader(file)
    git_repos = [row['latest_version_git_repo'] for row in reader]

# Process a fixed batch of the input.
# For a quick smoke test use e.g. ['https://github.com/angular/angular'] instead.
git_repos = git_repos[15000:15020]
print(len(git_repos))

fetch_and_analyze_repositories(git_repos, 'git_stats.csv')

1000
Analyzing repository: https://github.com/MohamedGamal-Dev/mono-rcl
Error analyzing repository https://github.com/MohamedGamal-Dev/mono-rcl: 404 Client Error: Not Found for url: https://github.com/MohamedGamal-Dev/mono-rcl
Analyzing repository: https://github.com/datocms/vue-datocms
Skipping repository: value not present
Skipping repository: value not present
Analyzing repository: https://github.com/jiraiyame/global-fetch
Error analyzing repository https://github.com/jiraiyame/global-fetch: 404 Client Error: Not Found for url: https://github.com/jiraiyame/global-fetch
Analyzing repository: https://github.com/ymzuiku/vanilla-device
Skipping repository: value not present
Analyzing repository: https://github.com/apollographql/apollo-server
Skipping repository: value not present
Analyzing repository: https://github.com/redradix/reduken
Skipping repository: value not present
Skipping repository: value not present
Analyzing repository: https://github.com/DougAnderson444/rollup-plugin-wit

KeyboardInterrupt: 

In [6]:
import pandas as pd
import requests
import lxml.html as lx
from tqdm import tqdm

def pull_request(git_repo_url, timeout=30):
    """Scrape the open and closed pull-request counts for a repository page.

    The repository page is fetched first; the ``/pulls`` sub-page is requested
    only when the pull-requests tab link is present.

    Args:
        git_repo_url: Base URL of the repository page.
        timeout: Seconds to wait for each HTTP request, so one unresponsive
            host cannot stall a long batch indefinitely.

    Returns:
        Tuple ``(open_count, merged_count)``. ``merged_count`` is read from the
        "Closed" table-state link. Either value is 0 when its link is missing,
        and ``(0, 0)`` is returned when the page has no pull-requests tab.
    """
    def leading_count(document, query):
        # First whitespace-separated token of the link text, e.g. "1,234 Open" -> 1234.
        links = document.xpath(query)
        if not links:
            return 0
        return int(links[0].text_content().split()[0].replace(',', ''))

    repo_page = lx.fromstring(requests.get(git_repo_url, timeout=timeout).text)
    if not repo_page.xpath('//a[@id="pull-requests-tab"]'):
        return 0, 0

    pulls_url = git_repo_url + "/pulls"
    pulls_page = lx.fromstring(requests.get(pulls_url, timeout=timeout).text)

    # Local names avoid shadowing the builtin ``open``.
    open_count = leading_count(
        pulls_page, '//a[@data-ga-click="Pull Requests, Table state, Open"]')
    merged_count = leading_count(
        pulls_page, '//a[@data-ga-click="Pull Requests, Table state, Closed"]')
    return open_count, merged_count

# Load the previously collected repository statistics.
df = pd.read_csv('/Users/tahers/Documents/SE_NPM_packages/git_stats_data.csv')

# First batch: the first 2000 repository URLs from the 'git_repo' column.
git_repos = df['git_repo'].tolist()[:2000]
print(len(git_repos))

# One (open, merged) pair per repository, in input order.
pr_counts = [
    pull_request(git_repo)
    for git_repo in tqdm(git_repos, desc="Processing git repos")
]
values1 = [open_count for open_count, _ in pr_counts]
values2 = [merged_count for _, merged_count in pr_counts]

# Assemble the two columns and persist them without the index.
new_df = pd.DataFrame(dict(zip(('open_PRs', 'merged_PRs'), (values1, values2))))
new_df.to_csv('pr_data_git_batch1.csv', index=False)

2000


Processing git repos: 100%|██████████| 2000/2000 [47:08<00:00,  1.41s/it] 
