In [1]:
### IMPORT EXCEPTION MODULES
from requests.exceptions import Timeout
from github import GithubException, UnknownObjectException, IncompletableObject

### IMPORT SYSTEM MODULES
from github import Github
import os, logging, pandas, csv, tempfile, shutil
from datetime import datetime, timezone, timedelta
from tqdm import tqdm
from pathlib import Path

from truckfactor.compute import main as compute_tf
import re
import unicodedata
from typing import Iterable, Tuple, List, Set, Dict, List
from collections import Counter

### IMPORT CUSTOM MODULES
import sys
sys.path.append('../')
import Settings as cfg
import Utilities as util
import subprocess, tempfile, shutil

from git import Repo, exc as git_exc
import time

# TESTER FOR BREAK IDENTIFICATION


In [71]:
def write_pauses_table(
        df: pandas.DataFrame,
        out_path: os.PathLike,
        authors: list[str] | None = None,
        *,
        user_col: str = "author_id",
        date_col: str = "created_at",
        tail_to_today: bool = False
    ) -> pandas.DataFrame:

    df[date_col] = pandas.to_datetime(df[date_col]).dt.normalize()

    if authors is None:
        authors = df[user_col].unique()

    rows = []
    
    count =0
    for dev in authors:
        user_df = df[df[user_col] == dev]
        pause_len_1 = len(user_df[date_col].dt.date.unique())
        pause_len_2 =len(user_df)
        if user_df.empty:
            continue

        active_days = sorted(user_df[date_col].dt.date.unique())
        current_row = [dev]

        for i in range(len(active_days) - 1):
            prev_day = active_days[i]
            next_day = active_days[i + 1]
            gap = (next_day - prev_day).days
            if gap > 1:
                # Inactivity starts the day after prev_day
                current_row.append(f"{(prev_day + pandas.Timedelta(days=1)).strftime('%Y-%m-%d')}/{next_day.strftime('%Y-%m-%d')}")
            else:
                count += 1


        if tail_to_today and active_days:
            today = _date.today()
            gap = (today - active_days[-1]).days
            if gap > 1:
                current_row.append(f"{active_days[-1]}/{today}")

        if len(current_row) > 1:
            rows.append(current_row)
    
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", newline="",encoding="utf-8" ) as f:
        csv.writer(f, delimiter=",", quoting=csv.QUOTE_NONE).writerows(rows)

    return pandas.DataFrame(rows)

def get_commit_based_core_devs(commits, threshold=0.8):
    """
    commits: List[dict] where each dict contains at least the 'author' key.
    Example: [{'author': 'alice'}, {'author': 'bob'}, {'author': 'alice'}, ...]

    Returns: List of core developers (author names) who together authored >= threshold of commits.
    """
    # Count commits per developer
    author_commit_counts = Counter(commit["author_id"] for commit in commits)

    # Sort developers by number of commits (descending)
    sorted_authors = author_commit_counts.most_common()

    total_commits = sum(author_commit_counts.values())
    cumulative = 0
    core_devs = []

    for author, count in sorted_authors:
        cumulative += count
        core_devs.append(author)
        if cumulative / total_commits >= threshold:
            break

    return core_devs

def identifyInactivityPeriods(organizationFolder, organization, project):
    """Identifies the inactivity periods of the developers in the organization"""
    #url = "https://github.com/" + organization + "/" + project + ".git"
    #authors, emails = findCoreDevelopers(url, name=project)
    
    organizationFolder = organizationFolder + "/" + organization + "/" + project

    commits =  pandas.read_csv(organizationFolder + "/commit_list.csv", parse_dates=["created_at"], encoding="utf-8", header=0, sep=cfg.CSV_separator)

    commit_authors = get_commit_based_core_devs(commits.to_dict(orient='records'))

    pauses = write_pauses_table(commits, organizationFolder + "/pauses_commits.csv", commit_authors, user_col = "author_id", date_col="created_at")
    return commit_authors, pauses

In [167]:
def getFarOutThreshold(values, dev): ### If it is satisfying, move the function into UTILITIES
    import numpy
    th = 0
    q_3rd = numpy.percentile(values,75)
    q_1st = numpy.percentile(values,25)
    iqr = q_3rd-q_1st
    if iqr > 1:
        th = q_3rd + 3*iqr
    return th

def addToBreaksList(pauses, currentBreaks, th):
    for _, p in pauses.iterrows():
        if (p['len'] > th) and (p['dates'] not in currentBreaks.dates.tolist()):
            util.add(currentBreaks, [p['len'], p['dates'], th])
    return currentBreaks

def cleanClearBreaks(clearBreaks, breaks):
    for _, b in breaks.iterrows():
        clearBreaks = clearBreaks[clearBreaks.dates != b['dates']] # If it was in the long_breaks list, remove ot from there
    return clearBreaks

def identifyBreaks(pauses_dates_list, developer, window, shift,
                   debug_folder=None):           # NEW ARG
    '''
    Removes SURE BREAKS from windows to calculate Tfov
    and — with debug_folder — writes a per-window diagnostics CSV.
    '''
    breaks_df = pandas.DataFrame(columns=['len', 'dates', 'th'])
    diagnostics = []                             # NEW

    for row in pauses_dates_list:
        dev = developer
        intervals_list = [ x for x in row[0:]
                          if isinstance(x, str) and '/' in x and x.strip()]
        if not intervals_list:
            print(dev, 'has NO valid pauses')
            continue                      # <- don’t bail out; just skip

        clear_breaks = pandas.DataFrame(columns=['len', 'dates'])

        FPS_dt = datetime.strptime(intervals_list[0].split('/')[0], '%Y-%m-%d')
        LPE_dt = datetime.strptime(intervals_list[-1].split('/')[1], '%Y-%m-%d')

        win_start, win_end = FPS_dt, FPS_dt + timedelta(days=window)
        last_th = 0
        while win_end < LPE_dt:
            win_pauses_list = pandas.DataFrame(columns=['len', 'dates'])
            partially_included_pauses_list = pandas.DataFrame(columns=['len', 'dates'])

            for interval in intervals_list:
                int_start_str, int_end_str = interval.split('/')          # keep strings
                int_start_dt  = datetime.strptime(int_start_str, '%Y-%m-%d')
                int_end_dt    = datetime.strptime(int_end_str,   '%Y-%m-%d')
                pause_len = util.daysBetween(int_start_str, int_end_str)
                # fully inside
                if int_start_dt >= win_start and int_end_dt <= win_end:
                    util.add(win_pauses_list, [pause_len, interval])
                # touches boundary
                if ((int_start_dt <= win_end and int_end_dt > win_end) or
                    (int_end_dt >= win_start and int_start_dt < win_start)):
                    util.add(partially_included_pauses_list, [pause_len, interval])

            win_pauses = len(win_pauses_list)
            pauses = pandas.concat([win_pauses_list,
                                    partially_included_pauses_list],
                                    ignore_index=True)

            # --- decision logic (unchanged) ---------------------------------
            win_th = None
            added_flag = False
            if win_pauses >= 4:
                win_th = getFarOutThreshold(win_pauses_list['len'], dev)
                print("win_th:", win_th)
                if win_th > 0:
                    before = len(breaks_df)
                    breaks_df = addToBreaksList(pauses, breaks_df, win_th)
                    added_flag = len(breaks_df) > before
                    last_th = win_th
                elif last_th > 0:
                    before = len(breaks_df)
                    breaks_df = addToBreaksList(pauses, breaks_df, last_th)
                    added_flag = len(breaks_df) > before
            else:
                if last_th > 0:
                    before = len(breaks_df)
                    breaks_df = addToBreaksList(pauses, breaks_df, last_th)
                    added_flag = len(breaks_df) > before

                clear_breaks = cleanClearBreaks(clear_breaks, breaks_df)
                for _, p in pauses.iterrows():
                    if (p['len'] >= window and
                        p['dates'] not in clear_breaks.dates.tolist() and
                        p['dates'] not in breaks_df.dates.tolist()):
                        util.add(clear_breaks, p)
                        print(f"we added {p[0]}  becuase {p['len']} is greater than {window}")    

            # ----------- NEW: record diagnostics for this window -------------
            diagnostics.append({
                'win_start': win_start.date(),
                'win_end':   win_end.date(),
                'win_pauses': win_pauses,
                'pause_lengths': ';'.join(map(str, win_pauses_list['len'].tolist())),
                'partial_lengths': ';'.join(map(str, partially_included_pauses_list['len'].tolist())),
                'win_th': win_th,
                'last_th': last_th,
                'added_as_break': 'yes' if added_flag else 'no'
            })
            # -----------------------------------------------------------------

            win_start += timedelta(days=shift)
            win_end   = win_start + timedelta(days=window)


    return breaks_df


In [69]:
def _load_activity_csv(folder: str,
                       filename: str,
                       rename_map: Dict[str, str],
                       dev_login,
                       usecols: list[str] = None,
                       ) -> pandas.DataFrame:
    """
    Read *filename* in *folder*, rename to the canonical columns
    ('id','date','creator_login'), keep ONLY the specified dev, and
    return three columns.  On any problem → empty df.
    """
    path = os.path.join(folder, filename)
    try:
        df = pandas.read_csv(path, sep=cfg.CSV_separator, usecols=usecols)
    except FileNotFoundError:
        logging.info("File %s not found – skipping", path)
        return pandas.DataFrame(columns=["id", "date", "creator_login"])
    except Exception as e:
        logging.warning("Could not read %s: %s", path, e)
        return pandas.DataFrame(columns=["id", "date", "creator_login"])

    df = df.rename(columns=rename_map)
    # keep only the columns we need, ignore anything extra
    df = df[["id", "date", "creator_login"]]
    df = df[df.creator_login == dev_login]
    # allow str OR list[str]
    if isinstance(dev_login, list):
        df = df[df.creator_login.isin(dev_login)]
    else:
        df = df[df.creator_login == dev_login]
    return df.reset_index(drop=True)

def get_activities(folder: str, dev_login: str) -> pandas.DataFrame:
    """
    Build the developer's DAILY 'other-actions' table.
    Returns a dataframe whose index is the *action*
    ('issues/pull_requests', 'issues_comments', …) and whose
    columns are day-strings.
    """
    files = {
    "prs": (
        "prs_repo.csv",
        {"PR_id": "id", "created_at": "date", "created_by": "creator_login"},
    ),
    "prs_comments": (
        "prs_comments.csv",
        {"comment_id": "id", "created_at": "date", "created_by": "creator_login"},
    ),
    "issues": (
        "issues_repo.csv",
        {"issue_id": "id", "created_at": "date", "created_by": "creator_login"},
    ),
    "issues_comments": (
        "issues_comments_repo.csv",
        {"comment_id": "id", "created_at": "date", "created_by": "creator_login"},
    ),
    "issues_events": (
        "issues_events_repo.csv",
        {"event_id": "id", "created_at": "date", "created_by": "creator_login"},
    ),
    "issues_timeline": (
        "issues_timeline_repo.csv",
        {"event_id": "id", "created_at": "date", "created_by": "creator_login"},
    )
    }

    # ---------- read / filter every file ----------
    dfs = {}
    for key, (fname, rename_map) in files.items():
        dfs[key] = _load_activity_csv(folder, fname, rename_map, dev_login)

    # ---------- split issues vs PRs -------------
    # Old logic: issues endpoint also returns PRs; remove rows whose id
    # matches a PR id so we don’t double-count.
    if not dfs["issues"].empty and not dfs["prs"].empty:
        dfs["issues"] = dfs["issues"][~dfs["issues"].id.isin(dfs["prs"].id)]

    # ---------- build the day range -------------
    # Derive it from the *actual* activity we just read.
    #
    # 1) gather every non-empty dataframe
    non_empty = [df for df in dfs.values() if not df.empty]

    if non_empty:
        # 2) earliest / latest date across *all* action types
        min_date = min(df["date"].min() for df in non_empty)
        max_date = max(df["date"].max() for df in non_empty)
    else:
        # Developer has no activity at all → default to one-day range
        min_date = max_date = pandas.Timestamp.today()

    # 3) full, dense list of day strings
    day_cols = (
        pandas.date_range(
            start=pandas.to_datetime(min_date).normalize(),
            end=pandas.to_datetime(max_date).normalize(),
            freq="D",
        )
        .strftime("%Y-%m-%d")
        .tolist()
    )

    # ---------- helper to create one timeline row ----------
    def _timeline_row(action_name, df_raw):
        row = [action_name]
        if df_raw.empty:
            row += [0] * len(day_cols)
            return row
        counts = (
            pandas.to_datetime(df_raw["date"])
            .dt.date
            .value_counts()
            .to_dict()
        )
        for d in day_cols:
            row.append(counts.get(pandas.to_datetime(d).date(), 0))
        return row

    # ---------- compile all action rows ----------
    rows = []
    if not dfs["issues"].empty:
        rows.append(_timeline_row("issues", dfs["issues"]))
    if not dfs["issues_comments"].empty:
        rows.append(_timeline_row("issues_comments", dfs["issues_comments"]))
    if not dfs["issues_events"].empty:
        rows.append(_timeline_row("issues_events", dfs["issues_events"]))
    if not dfs["prs"].empty:
        rows.append(_timeline_row("pull_requests", dfs["prs"]))
    if not dfs["prs_comments"].empty:
        rows.append(_timeline_row("pull_requests_comments", dfs["prs_comments"]))

    # (commits are already encoded in coding_history_table, so we skip them here)

    actions = pandas.DataFrame(rows, columns=["action"] + day_cols).set_index("action")

    # ---------- cache to disk, same place as before ----------
    actions_folder = Path(folder) / cfg.actions_folder_name
    actions_folder.mkdir(parents=True, exist_ok=True)

    os.makedirs(actions_folder, exist_ok=True)
    actions_file = actions_folder / f"{dev_login}_actions_table.csv"
    actions.to_csv( actions_file , sep=cfg.CSV_separator, na_rep=cfg.CSV_missing, index=False)
    return actions

def splitBreak(break_limits, action_days, th):
    status = 'ACTIVE'  # NCUT: Non coding under threshold.
    previously = status
    period_start = ''

    break_range = break_limits.split('/')
    action_days.insert(0, break_range[0])
    action_days.append(break_range[1])
    

    period_detail = pandas.DataFrame(columns=['len', 'dates', 'th', 'label', 'previously'])
    for i in range(0, len(action_days) - 1):
        if status == 'ACTIVE':
            previously = status
            size = util.daysBetween(action_days[i], action_days[i + 1])
            if size > th:
                if size > cfg.gone_threshold:
                    status = 'GONE'
                else:
                    status = 'INACTIVE'
                dates = action_days[i] + '/' + action_days[i + 1]
                util.add(period_detail, [size, dates, th, status, previously])
            else:
                status = 'NCUT'
                period_start = action_days[i]
        elif (status == 'INACTIVE') | (status == 'GONE'):
            previously = status
            size = util.daysBetween(action_days[i], action_days[i + 1])
            if size < th:
                status = 'NCUT'
                period_start = action_days[i]
            else:
                residual = size - (th + 1)
                if residual > th:
                    # The sub-break is actually made of 2 breaks: Non-coding + Inactive/Gone
                    status = 'NON_CODING'
                    final_date = (datetime.strptime(action_days[i], "%Y-%m-%d") + timedelta(days=(th + 1))).strftime("%Y-%m-%d")
                    dates = action_days[i] + '/' + final_date
                    actual_size = util.daysBetween(action_days[i], final_date)
                    util.add(period_detail, [actual_size, dates, th, status, previously])

                    previously = status
                    if residual > cfg.gone_threshold:
                        status = 'GONE'
                    else:
                        status = 'INACTIVE'
                    dates = final_date + '/' + action_days[i + 1]
                    second_size = util.daysBetween(final_date, action_days[i + 1])
                    util.add(period_detail, [second_size, dates, th, status, previously])
                else:
                    # The sub-break becomes a Non-coding
                    status = 'NON_CODING'
                    dates = action_days[i] + '/' + action_days[i + 1]
                    util.add(period_detail, [size, dates, th, status, previously])
        elif status == 'NON_CODING':
            previously = status
            size = util.daysBetween(action_days[i], action_days[i + 1])
            if size > th:
                if size > cfg.gone_threshold:
                    status = 'GONE'
                else:
                    status = 'INACTIVE'
                #start = (datetime.strptime(action_days[i], "%Y-%m-%d") + dt.timedelta(days=th)).strftime("%Y-%m-%d")
                #dates = start + '/' + action_days[i + 1]
                dates = action_days[i] + '/' + action_days[i + 1]
                util.add(period_detail, [size, dates, th, status, previously])
            else:
                break_start = period_detail.at[0, 'dates'].split('/')[0]
                new_end = action_days[i + 1]
                period_detail.at[0, 'len'] = util.daysBetween(break_start, new_end)  # New size
                period_detail.at[0, 'dates'] = break_start + '/' + new_end  # New dates
                # Same th
                # Same status
                # Same previously
        else:  # (status=='NCUT')
            diff = util.daysBetween(action_days[i], action_days[i + 1])
            size = util.daysBetween(period_start, action_days[i + 1])
            if size > th:
                residual = size - (th + 1)
                if residual > th:
                    # The sub-break is actually made of 2 breaks: Non-coding + Inactive/Gone
                    status = 'NON_CODING'
                    final_date = (datetime.strptime(period_start, "%Y-%m-%d") + timedelta(days=(th + 1))).strftime("%Y-%m-%d")
                    dates = period_start + '/' + final_date
                    actual_size = util.daysBetween(period_start, final_date)
                    util.add(period_detail, [actual_size, dates, th, status, previously])

                    previously = status
                    if residual > cfg.gone_threshold:
                        status = 'GONE'
                    else:
                        status = 'INACTIVE'
                    dates = final_date + '/' + action_days[i + 1]
                    second_size = util.daysBetween(final_date, action_days[i + 1])
                    util.add(period_detail, [second_size, dates, th, status, previously])
                else:
                    # The sub-break becomes a Non-coding
                    status = 'NON_CODING'
                    dates = period_start + '/' + action_days[i + 1]
                    actual_size = util.daysBetween(period_start, action_days[i + 1])
                    util.add(period_detail, [actual_size, dates, th, status, previously])
    # A Final status 'INACTIVE', 'GONE' or 'NCUT' means an UNFREEZING ('NCUT' is not written into the detail list)

    last_end = period_detail.at[0, 'dates'].split('/')[1]
    if status == 'NCUT':
        if last_end == cfg.data_collection_date:
            status = cfg.NC
            start = last_end        
            size = util.daysBetween(start, last_end)
            util.add(period_detail,
                     [1, f"{start}/{start}", th, status, previously])
        else:
            status = previously
            break_start = period_detail.at[0, 'dates'].split('/')[0]
            new_end = action_days[i + 1]
            period_detail.at[0, 'len'] = util.daysBetween(break_start, new_end)  # New size
            period_detail.at[0, 'dates'] = f"{break_start}/{new_end}"
            last_end = new_end
            # Same th
            # Same status
            # Same previously
    if last_end == cfg.data_collection_date:
        period_detail.at[0, 'label'] += '(NOW)'
    else:
        util.add(period_detail, [1, f"{last_end}/{last_end}", 0, 'ACTIVE', status])
    return period_detail


In [None]:
def main():
    #identifyInactivityPeriods
    #"Resources/repositories.txt"
    repos_file= '../' + cfg.repos_file
    #"../Organizations"
    organizationFolder = cfg.main_folder


    #identifyBreaks
    win = cfg.sliding_window_size
    shift = cfg.shift

    with open(repos_file) as f:
        repos_file = f.readlines()
        for repo in repos_file:
            #take the end '\n' out
            repo = repo.rstrip('\n')
            organization, project = repo.split('/')

            print(f"Start Identifying inactivity periods for {organization}/{project}...")

            authors, pauses = identifyInactivityPeriods( organizationFolder, organization, project)
            pauses_list = pauses.values.tolist()
            print(f"Finihsed 1st step in identifying inactivity periods for {len(authors)} developers")
            #make the authors be only the first one 
            #authors = authors[:1]  # For testing, only take the first author
            output_folder = organizationFolder + '/' + repo + "/Results"

            os.makedirs(output_folder, exist_ok=True)

            for dev in authors:
                print(f"Started identifing breaks for {dev} in {organization}/{project}")
                breaks_df = identifyBreaks(pauses_list, developer=dev, window=win, shift=shift, debug_folder=output_folder )
                breaks_df.to_csv(os.path.join(output_folder, f"{dev}_breaks.csv"),
                                 
                sep=cfg.CSV_separator, na_rep=cfg.CSV_missing, index=False, lineterminator="\n")
                

                # 1)  load or build the ACTIONS table -------------------------------
                workingFolder = os.path.join(organizationFolder, repo)
                
                actions_path = Path(workingFolder) / f"{dev}_actions_table.csv"


                if actions_path.is_file():
                    user_actions = pandas.read_csv(actions_path, sep=cfg.CSV_separator, index_col=0)
                else:
                    user_actions = get_activities(workingFolder, dev)


                # 2)  find dev's break -----------------------------------------------

                labeled_breaks = pandas.DataFrame(columns=['len', 'dates', 'th', 'label', 'previously'])
                for i, b in breaks_df.iterrows():
                    # CHECK ACTIVITIES
                    break_duration = b['len']
                    break_dates = b['dates']
                    threshold = b['th']
                    break_range = break_dates.split('/')
                    inner_start = (datetime.strptime(break_range[0], "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
                    inner_end = (datetime.strptime(break_range[1], "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d")

                    break_actions = user_actions.loc[:, inner_start:inner_end]  # Gets only the chosen period
                    break_actions = break_actions.loc[~(break_actions == 0).all(axis=1)]  # Removes the actions not performed
                    # print a debug message for break_actions

                    is_activity_day = (break_actions != 0).any()  # List Of Columns With at least a Non-Zero Value
                    action_days = is_activity_day.index[is_activity_day].tolist()  # List Of Columns NAMES Having Column Names at least a Non-Zero Value
                    if len(break_actions) > 0:  # There are other activities: the Break is Non-coding
                        break_detail = splitBreak(break_dates, action_days, threshold)
                        # Exclude columns where all entries are NA
                        labeled_breaks = labeled_breaks.dropna(axis=1, how='all')
                        break_detail = break_detail.dropna(axis=1, how='all')
                        # Concatenate DataFrames
                        labeled_breaks = pandas.concat([labeled_breaks, break_detail], ignore_index=True)
                    else:  # No other activities: the Break is Inactive or Gone
                        print(f"Break {break_dates} for {dev} in {organization}/{project} is inactive or gone, no further processing needed.")
                out_csv = Path(output_folder) / f"{dev}_labeled_breaks.csv"
                labeled_breaks.to_csv(out_csv,
                                      sep=cfg.CSV_separator, na_rep=cfg.CSV_missing, index=False, quoting=None, lineterminator='\n')

                #for every file in the breaks folder
                print("✓ wrote output →            :", out_csv)

                # make feature set
        return labeled_breaks
                
                
labeled_breaks = main()

In [None]:
secrets = [
]
for new_token in secrets:
    ghub = Github(new_token)
    search_limit = ghub.get_rate_limit().search.remaining
    core_limit = ghub.get_rate_limit().core.remaining
    reset = ghub.get_rate_limit().core.reset
    #change the time to be in Mountain Standard Time (MST) 
    reset = reset.astimezone(tz=None)  # Convert to local timezone
    # Print the limits

    print(f"Search limit for token {new_token}:\n {search_limit}, {core_limit}, {reset} \n")


### Find Core Devs TF

In [3]:
#new
def findCoreDevelopers(
    url: str,
    dest_root: str | Path = ".tf_cache",
    *,
    name: str | None = None,
    branch: str | None = None,
    refresh: bool = False,
) -> tuple[list[str], list[str]]:        # <- correct annotation
    """
    Clone <url> (or reuse/refresh an existing clone) and run Truck‑Factor.
    Returns (authors, emails) – two parallel lists with the same length.
    """

    # ------------------------------------------------------------------ #
    # Paths
    # ------------------------------------------------------------------ #
    org, repo = url.rstrip("/").split("/")[-2:]
    repo = repo.removesuffix(".git")
    repo_path   = Path(cfg.main_folder) / org / repo          # actual repo
    cache_dir   = repo_path / dest_root                       # .tf_cache
    tf_csv      = cache_dir / "TruckFactor.csv"

    cache_dir.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------ #
    # 1. Return cached result if possible
    # ------------------------------------------------------------------ #
    if tf_csv.is_file() and not refresh:
        cache_df = pandas.read_csv(
            tf_csv,
            sep=cfg.CSV_separator,
            encoding="utf-8",
        )
        return (
            cache_df["login"].tolist(),
            cache_df["email"].tolist(),
        )

    # ------------------------------------------------------------------ #
    # 2. Ensure we have a local clone
    # ------------------------------------------------------------------ #
    try:
        if (cache_dir / ".git").is_dir():
            repo = Repo(cache_dir)
            if refresh:
                repo.git.fetch("--all", "--prune")
            if branch:
                repo.git.checkout(branch)
                if refresh:
                    repo.git.pull()
        else:
            # Empty dir or non‑existent – clone afresh
            if cache_dir.exists():
                shutil.rmtree(cache_dir, ignore_errors=True)
            repo = Repo.clone_from(url, cache_dir, branch=branch)
    except git_exc.GitCommandError as e:
        raise RuntimeError(f"Git failed: {e.stderr or e}") from e

    # ------------------------------------------------------------------ #
    # 3. Compute Truck Factor (this calls your patched compute_tf)
    # ------------------------------------------------------------------ #
    tf, critical_sha, authors, emails = compute_tf(str(cache_dir))

    # Always lists from here on
    authors = list(authors)
    emails  = list(emails)

    # ------------------------------------------------------------------ #
    # 4. Cache the result for next time
    # ------------------------------------------------------------------ #
    pandas.DataFrame({"login": authors, "email": emails}).to_csv(
        tf_csv,
        sep=cfg.CSV_separator,
        index=False,
        lineterminator="\n",
        encoding="utf-8",
    )

    return authors, emails
#old
def findCoreDevelopers(
    url: str,
    dest_root: str | Path = ".tf_cache",
    *,
    name: str | None = None,
    branch: str | None = None,
    refresh: bool = False,
) -> tuple[int, str, list[str]]:
    """
    Clone <url> (or reuse/refresh an existing clone) and run Truck-Factor.
    Returns (tf, critical_sha, authors).
    """
    # --------------------------------------------------------------------- #
    dest_root = Path(dest_root).expanduser().resolve()    # .../rails/rails
    name = name or url.rstrip("/").split("/")[-1].removesuffix(".git")
    dest = dest_root / name
    tf_cache  = dest / ".tf_cache"
    tf_cache.mkdir(parents=True, exist_ok=True)                  
    tf_csv = dest / "TruckFactor.csv"
    
    clone_path = dest / name                          # .../.tf_cache/rails

    if tf_csv.is_file():
        logging.info("TF cache hit – using %s", tf_csv)
        return pandas.read_csv(tf_csv, encoding="utf-8")["login"].tolist()

    

    repo = None
    # --------------------------------------------------------------------- #
    try:
        if clone_path.exists():
            try:
                repo = Repo(clone_path)
            except git_exc.InvalidGitRepositoryError:
                # Directory exists but isn't a repo – start fresh
                shutil.rmtree(dest, ignore_errors=True)
                repo = Repo.clone_from(url, to_path=dest, branch=branch)
            else:
                # Repo is valid – refresh if asked
                if refresh:
                    repo.git.fetch("--all", "--prune")
                if branch:
                    repo.git.checkout(branch)
                    if refresh:
                        repo.git.pull()
        else:
            repo = Repo.clone_from(url, clone_path, branch=branch)
    except git_exc.GitCommandError as e:
        raise RuntimeError(f"Git failed: {e.stderr or e}") from e

    # --------------------------------------------------------------------- #
    # Ensure the repo is NOT empty (at least one commit reachable)
    if not list(repo.iter_commits('--all', max_count=1)):
        # Something went wrong – start over with a clean clone
        shutil.rmtree(dest, ignore_errors=True)
        repo = Repo.clone_from(url, clone_path, branch=branch)

    # --------------------------------------------------------------------- #
    # Truck-Factor
    #if the truck factor file does not exist, we compute it
    if not tf_csv.is_file():
        print("Computing Truck Factor for", clone_path)
        tf, critical_sha, authors = compute_tf(str(clone_path))
        
    else:
        print("Using cached Truck Factor from %s", tf_csv)
    

    pandas.DataFrame(authors, columns=["login"])\
      .to_csv(tf_csv ,
              sep=cfg.CSV_separator,
              index=False,
              lineterminator="\n",
              encoding="utf-8")

    return authors

# Test Stand

In [170]:
def main():
    #identifyInactivityPeriods
    #"Resources/repositories.txt"
    repos_file= '../' + cfg.repos_file
    #"../Organizations"
    ORG_ROOT = cfg.main_folder

    #identifyBreaks
    win = cfg.sliding_window_size
    shift = cfg.shift
    
    with open(repos_file) as f:
        repos_file = f.readlines()
        for repo in repos_file:
            #take the end '\n' out
            repo = repo.rstrip('\n')
            organization, project = repo.split('/')

            organizationFolder = ORG_ROOT + "/" + organization + "/" + project
            
            print(f"Start Identifying inactivity periods for {organization}/{project}...")

            #authors, pauses = identifyInactivityPeriods( ORG_ROOT, organization, project)
            #read  pasues csv this file "C:\Users\samut\OneDrive\Documents\GitHub\developersInactivityAnalysisCOPY\Organizations\rails\rails\pauses_commits.csv"
            pauses = pandas.read_csv(r"C:\Users\samut\OneDrive\Documents\GitHub\developersInactivityAnalysisCOPY\Organizations\rails\rails\pauses_commits.csv",header=None, encoding="utf-8", sep=cfg.CSV_separator)
            #first column of pauses is the author_id, the rest are the pauses and rist row 
            authors = pauses.iloc[:, 0].unique().tolist()

            output_folder = organizationFolder + '/' + repo + "/Results"
            devs_with_pauses = pauses[ 0].dropna().unique()
        
            for dev in devs_with_pauses:
                row_series = pauses.loc[pauses[0] == dev].iloc[0]

                # dev_pauses is now e.g.
                # ['byroot', '2024-12-20/2024-12-27', '2024-12-28/2024-12-29', ...]

                dev_pauses = [dev] + [p for p in row_series[1:] if isinstance(p, str)]
                breaks_df  = identifyBreaks(
                    pauses_dates_list=[dev_pauses],
                    developer=dev,
                    window=win,
                    shift=shift,
                )

                breaks_df.to_csv(Path(organizationFolder + f"/{dev}_breaks.csv"),
                                sep=cfg.CSV_separator,
                                index=False)
                print(f"{dev}: {len(breaks_df)} breaks")

  

            #    # 2)  find dev's break -----------------------------------------------

            #    labeled_breaks = pandas.DataFrame(columns=['len', 'dates', 'th', 'label', 'previously'])
            #    for i, b in breaks_df.iterrows():
            #        # CHECK ACTIVITIES
            #        break_duration = b['len']
            #        break_dates = b['dates']
            #        threshold = b['th']
            #        break_range = break_dates.split('/')
            #        inner_start = (datetime.strptime(break_range[0], "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
            #        inner_end = (datetime.strptime(break_range[1], "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d")

            #        break_actions = user_actions.loc[:, inner_start:inner_end]  # Gets only the chosen period
            #        break_actions = break_actions.loc[~(break_actions == 0).all(axis=1)]  # Removes the actions not performed
            #        # print a debug message for break_actions

            #        is_activity_day = (break_actions != 0).any()  # List Of Columns With at least a Non-Zero Value
            #        action_days = is_activity_day.index[is_activity_day].tolist()  # List Of Columns NAMES Having Column Names at least a Non-Zero Value
            #        if len(break_actions) > 0:  # There are other activities: the Break is Non-coding
            #            break_detail = splitBreak(break_dates, action_days, threshold)
            #            # Exclude columns where all entries are NA
            #            labeled_breaks = labeled_breaks.dropna(axis=1, how='all')
            #            break_detail = break_detail.dropna(axis=1, how='all')
            #            # Concatenate DataFrames
            #            labeled_breaks = pandas.concat([labeled_breaks, break_detail], ignore_index=True)
            #        else:  # No other activities: the Break is Inactive or Gone
            #            print(f"Break {break_dates} for {dev} in {organization}/{project} is inactive or gone, no further processing needed.")
            #    out_csv = Path(output_folder) / f"{dev}_labeled_breaks.csv"
            #    labeled_breaks.to_csv(out_csv,
            #                          sep=cfg.CSV_separator, na_rep=cfg.CSV_missing, index=False, quoting=None, lineterminator='\n')

            #    #for every file in the breaks folder
            #    print("✓ wrote output →            :", out_csv)

            #    # make feature set
        #return labeled_breaks
                
                
main()


Start Identifying inactivity periods for rails/rails...
win_th: 22.0
byroot: 0 breaks
kamipo: 0 breaks
we added 909  becuase 909 is greater than 90
fatkodima: 0 breaks


  print(f"we added {p[0]}  becuase {p['len']} is greater than {window}")


# Model Building

In [26]:
def splitBreak(break_limits, action_days, th):
    print("==== START splitBreak ====")
    print("break_limits:", break_limits)
    print("original action_days:", action_days)
    print("threshold:", th)
    
    status = 'ACTIVE'
    previously = status
    period_start = ''
    break_range = break_limits.split('/')
    action_days = action_days.copy()
    action_days.insert(0, break_range[0])
    action_days.append(break_range[1])
    
    print("full action_days:", action_days)

    period_detail = pandas.DataFrame(columns=['len', 'dates', 'th', 'label', 'previously'])
    for i in range(0, len(action_days) - 1):
        print(f"\n--- Loop {i} ---")
        print("Status:", status)
        print("Previously:", previously)
        print("Window:", action_days[i], "->", action_days[i + 1])
        
        if status == 'ACTIVE':
            size = util.daysBetween(action_days[i], action_days[i + 1])
            print("Interval size:", size)
            if size > th:
                status = 'GONE' if size > cfg.gone_threshold else 'INACTIVE'
                dates = f"{action_days[i]}/{action_days[i + 1]}"
                util.add(period_detail, [size, dates, th, status, previously])
                print("New period:", size, dates, status)
            else:
                status = 'NCUT'
                period_start = action_days[i]
                print("NCUT Start:", period_start)
                
        elif status in ['INACTIVE', 'GONE']:
            size = util.daysBetween(action_days[i], action_days[i + 1])
            print("Interval size:", size)
            if size < th:
                status = 'NCUT'
                period_start = action_days[i]
            else:
                residual = size - (th + 1)
                print("Residual:", residual)
                if residual > th:
                    status = 'NON_CODING'
                    final_date = (datetime.strptime(action_days[i], "%Y-%m-%d") + timedelta(days=(th + 1))).strftime("%Y-%m-%d")
                    dates = f"{action_days[i]}/{final_date}"
                    util.add(period_detail, [util.daysBetween(action_days[i], final_date), dates, th, status, previously])
                    print("NON_CODING period:", dates)

                    previously = status
                    status = 'GONE' if residual > cfg.gone_threshold else 'INACTIVE'
                    dates = f"{final_date}/{action_days[i + 1]}"
                    util.add(period_detail, [util.daysBetween(final_date, action_days[i + 1]), dates, th, status, previously])
                    print("Residual period:", dates)
                else:
                    status = 'NON_CODING'
                    dates = f"{action_days[i]}/{action_days[i + 1]}"
                    util.add(period_detail, [size, dates, th, status, previously])
                    print("NON_CODING period:", dates)
                    
        elif status == 'NON_CODING':
            size = util.daysBetween(action_days[i], action_days[i + 1])
            print("NON_CODING interval size:", size)
            if size > th:
                status = 'GONE' if size > cfg.gone_threshold else 'INACTIVE'
                dates = f"{action_days[i]}/{action_days[i + 1]}"
                util.add(period_detail, [size, dates, th, status, previously])
                print("Post NON_CODING transition to", status, dates)
            else:
                break_start = period_detail.at[0, 'dates'].split('/')[0]
                new_end = action_days[i + 1]
                new_size = util.daysBetween(break_start, new_end)
                period_detail.at[0, 'len'] = new_size
                period_detail.at[0, 'dates'] = f"{break_start}/{new_end}"
                print("Expanded NON_CODING period to:", break_start, "->", new_end)
                
        else:  # NCUT
            diff = util.daysBetween(action_days[i], action_days[i + 1])
            size = util.daysBetween(period_start, action_days[i + 1])
            print("NCUT combined size:", size)
            if size > th:
                residual = size - (th + 1)
                print("Residual:", residual)
                if residual > th:
                    status = 'NON_CODING'
                    final_date = (datetime.strptime(period_start, "%Y-%m-%d") + timedelta(days=(th + 1))).strftime("%Y-%m-%d")
                    dates = f"{period_start}/{final_date}"
                    util.add(period_detail, [util.daysBetween(period_start, final_date), dates, th, status, previously])
                    previously = status
                    status = 'GONE' if residual > cfg.gone_threshold else 'INACTIVE'
                    dates = f"{final_date}/{action_days[i + 1]}"
                    util.add(period_detail, [util.daysBetween(final_date, action_days[i + 1]), dates, th, status, previously])
                else:
                    status = 'NON_CODING'
                    dates = f"{period_start}/{action_days[i + 1]}"
                    util.add(period_detail, [size, dates, th, status, previously])
    
    # Final segment
    last_end = period_detail.at[0, 'dates'].split('/')[1]
    if status == 'NCUT':
        if last_end == cfg.data_collection_date:
            status = cfg.NC
            start = last_end
            util.add(period_detail, [1, f"{start}/{start}", th, status, previously])
        else:
            status = previously
            break_start = period_detail.at[0, 'dates'].split('/')[0]
            new_end = action_days[i + 1]
            period_detail.at[0, 'len'] = util.daysBetween(break_start, new_end)
            period_detail.at[0, 'dates'] = f"{break_start}/{new_end}"
            last_end = new_end

    return period_detail


In [28]:
def test_splitBreak():

    # Define test inputs
    break_limits = '2023-01-01/2023-12-31'
    action_days = ['2023-02-01', '2023-04-01', '2023-06-01', '2023-08-01', '2023-11-01']
    dev= 'mattdowle'
    output_folder = cfg.main_file_path + '/Organizations/Rdatatable/data.table'+ "/Results"
    #read a csv at os.path.join(output_folder, f"{dev}_breaks.csv")
    organizationFolder = cfg.main_folder

    workingFolder = os.path.join(organizationFolder, "Rdatatable/data.table")

    breaks_df = pandas.read_csv(os.path.join(output_folder, f"{dev}_breaks.csv"),
                                    sep=cfg.CSV_separator, index_col=False)
    actions_path = Path(output_folder) / f"{dev}_actions_table.csv"
    if actions_path.is_file():
        user_actions = pandas.read_csv(actions_path, sep=cfg.CSV_separator, index_col=0)
    else:
        user_actions = get_activities(workingFolder, dev)


    # 2)  find dev's break -----------------------------------------------

    labeled_breaks = pandas.DataFrame(columns=['len', 'dates', 'th', 'label', 'previously'])
    for i, b in breaks_df.iterrows():
        # CHECK ACTIVITIES
        break_duration = b['len']
        break_dates = b['dates']
        threshold = b['th']
        break_range = break_dates.split('/')
        inner_start = (datetime.strptime(break_range[0], "%Y-%m-%d") + timedelta(days=1)).strftime("%Y-%m-%d")
        inner_end = (datetime.strptime(break_range[1], "%Y-%m-%d") - timedelta(days=1)).strftime("%Y-%m-%d")

        break_actions = user_actions.loc[:, inner_start:inner_end]  # Gets only the chosen period
        break_actions = break_actions.loc[~(break_actions == 0).all(axis=1)]  # Removes the actions not performed
        # print a debug message for break_actions

        is_activity_day = (break_actions != 0).any()  # List Of Columns With at least a Non-Zero Value
        action_days = is_activity_day.index[is_activity_day].tolist()  # List Of Columns NAMES Having Column Names at least a Non-Zero Value
        if len(break_actions) > 0:  # There are other activities: the Break is Non-coding
            break_detail = splitBreak(break_dates, action_days, threshold)
            # Exclude columns where all entries are NA
            if break_detail is not None and not break_detail.empty:
                break_detail = break_detail.dropna(axis=1, how='all')
                labeled_breaks = labeled_breaks.dropna(axis=1, how='all')
                labeled_breaks = pandas.concat([labeled_breaks, break_detail], ignore_index=True)
            else:
                print(f"Warning: No detail returned for break {break_dates}")
        else:  # No other activities: the Break is Inactive or Gone
            print(f"Break {break_dates} for {dev} in our repo is inactive or gone, no further processing needed.")


    out_csv = Path(output_folder) / f"{dev}_labeled_breaks.csv"
    labeled_breaks.to_csv(out_csv,
                          sep=cfg.CSV_separator,  index=False, quoting=None, lineterminator='\n')

    #for every file in the breaks folder
    print("✓ wrote output →            :", out_csv)

    # make feature set
    return labeled_breaks

test_splitBreak()

actions folder: ..\Organizations\Rdatatable\data.table\Actions_Tables
actions file: ..\Organizations\Rdatatable\data.table\Actions_Tables\mattdowle_actions_table.csv
==== START splitBreak ====
break_limits: 2014-08-19/2014-09-24
original action_days: ['2014-09-10', '2014-09-21']
threshold: 34.0
full action_days: ['2014-08-19', '2014-09-10', '2014-09-21', '2014-09-24']

--- Loop 0 ---
Status: ACTIVE
Previously: ACTIVE
Window: 2014-08-19 -> 2014-09-10
Interval size: 22
NCUT Start: 2014-08-19

--- Loop 1 ---
Status: NCUT
Previously: ACTIVE
Window: 2014-09-10 -> 2014-09-21
NCUT combined size: 33

--- Loop 2 ---
Status: NCUT
Previously: ACTIVE
Window: 2014-09-21 -> 2014-09-24
NCUT combined size: 36
Residual: 1.0
==== START splitBreak ====
break_limits: 2014-11-24/2015-06-19
original action_days: ['2014-12-01', '2014-12-08', '2014-12-12', '2014-12-16', '2014-12-22', '2015-01-05', '2015-01-10', '2015-01-13', '2015-01-17', '2015-03-16', '2015-04-23', '2015-04-29', '2015-06-01', '2015-06-08', '

Unnamed: 0,len,dates,th,label,previously
0,36,2014-08-19/2014-09-24,34.0,NON_CODING,ACTIVE
1,207,2014-11-24/2015-06-19,21.0,NON_CODING,ACTIVE
2,58,2015-01-17/2015-03-16,21.0,GONE,ACTIVE
3,38,2015-03-16/2015-04-23,21.0,NON_CODING,ACTIVE
4,33,2015-04-29/2015-06-01,21.0,GONE,ACTIVE
...,...,...,...,...,...
194,24,2023-11-01/2023-11-25,14.0,INACTIVE,NON_CODING
195,212,2020-09-27/2021-04-27,50.0,NON_CODING,ACTIVE
196,259,2021-11-03/2022-07-20,64.0,NON_CODING,ACTIVE
197,505,2022-07-21/2023-12-08,64.0,NON_CODING,ACTIVE
