In [1]:
import pandas as pd

# Load data
csv_path = '/Users/hanfuhou/Downloads/2004to2024all.csv'
df = pd.read_csv(csv_path, low_memory=False)


df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df = df[df['created_at'].dt.year.between(2020, 2024)]

# Step 1: Total issues
total_issues = len(df)

# Step 2: Accessibility mentions in labels or title or body
df['labels'] = df['labels'].astype(str).str.lower()
df['title'] = df['title'].astype(str).str.lower()
df['body'] = df['body'].astype(str).str.lower()
accessibility_issues = df[
    df['labels'].str.contains('accessibility|a11y|screen reader|contrast|aria|colorblind|blind|deaf|hearing|ally', na=False) |
    df['title'].str.contains('accessibility|a11y|screen reader|contrast|aria|colorblind|blind|deaf|hearing|ally', na=False) |
    df['body'].str.contains('accessibility|a11y|screen reader|contrast|aria|colorblind|blind|deaf|hearing|ally', na=False)
]
accessibility_count = len(accessibility_issues)
accessibility_ratio = accessibility_count / total_issues

# Step 3: Comments statistics
df['comments_count'] = pd.to_numeric(df['comments'], errors='coerce')
comments_mean = df['comments_count'].mean()
comments_median = df['comments_count'].median()

# Step 4: Author association distribution
author_dist = df['author_association'].value_counts(dropna=False)

# Step 5: Issue states and reasons
state_counts = df['state'].value_counts(dropna=False)
state_reason_counts = df['state_reason'].value_counts(dropna=False)

# === Output ===
print(f"Total issues (2020–2024): {total_issues}")
print(f"Accessibility-related issues: {accessibility_count} ({accessibility_ratio:.2%})")
print(f"Average comments per issue: {comments_mean:.2f}")
print(f"Median comments per issue: {comments_median}")
print("\nAuthor Association Distribution:")
print(author_dist)
print("\nIssue State Counts:")
print(state_counts)
print("\nState Reason Counts:")
print(state_reason_counts)

Total issues (2020–2024): 232165
Accessibility-related issues: 201461 (86.77%)
Average comments per issue: 4.29
Median comments per issue: 2.0

Author Association Distribution:
author_association
NONE            96392
CONTRIBUTOR     71950
OWNER           22771
MEMBER          22165
COLLABORATOR    18784
MANNEQUIN         103
Name: count, dtype: int64

Issue State Counts:
state
closed    165832
open       66333
Name: count, dtype: int64

State Reason Counts:
state_reason
completed      157402
NaN             64799
not_planned      8291
reopened         1534
duplicate         139
Name: count, dtype: int64


In [2]:
import pandas as pd
from scipy.stats import spearmanr
import ast
import re
from collections import Counter

# Load CSV
csv_path = '/Users/hanfuhou/Downloads/2004to2024all.csv' 
df = pd.read_csv(csv_path, low_memory=False)

# Convert created_at to datetime and filter for 2020–2024
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df = df[df['created_at'].dt.year.between(2020, 2024)]

# Clean 'author_association'
df['author_association'] = df['author_association'].fillna('NONE')

# Clean 'comments'
df['comments'] = pd.to_numeric(df['comments'], errors='coerce').fillna(0)

# Group by author_association and compute mean/median
author_group = df.groupby('author_association')['comments'].agg(['count', 'mean', 'median']).reset_index()
author_group = author_group.sort_values('mean', ascending=False)

print("\n Author Association vs. Comments (Descriptive Stats):")
print(author_group)

# Correlation between author_association (ordinal encoded) and comments
mapping = {'NONE':0, 'CONTRIBUTOR':1, 'OWNER':2, 'MEMBER':3, 'COLLABORATOR':4, 'MANNEQUIN':5}
df['author_num'] = df['author_association'].map(mapping).fillna(0)
corr, pval = spearmanr(df['author_num'], df['comments'])

print(f"\n Spearman Correlation (Author Association vs. Comments): {corr:.4f} (p={pval:.4f})")

# Compute number of labels
def count_labels_safe(label_str):
    if pd.isna(label_str):
        return 0
    try:
        if label_str.strip().startswith('['):
            return len(ast.literal_eval(label_str))
        else:
            return len(label_str.split(',')) if ',' in label_str else 1
    except:
        return len(label_str.split(',')) if ',' in label_str else 1

df['label_count'] = df['labels'].apply(count_labels_safe)

# Correlation between label count and comments
label_corr, label_pval = spearmanr(df['label_count'], df['comments'])
print(f"\n Spearman Correlation (Label Count vs. Comments): {label_corr:.4f} (p={label_pval:.4f})")

label_words = [word.strip().lower() for l in df['labels'].dropna() for word in re.split(r'[,|;]', l)]
label_freq = Counter(label_words)
print("\nTop 10 Labels by Frequency:")
print(label_freq.most_common(10))


 Author Association vs. Comments (Descriptive Stats):
  author_association  count      mean  median
2          MANNEQUIN    103  6.582524     4.0
4               NONE  96392  5.213576     3.0
3             MEMBER  22165  4.944733     2.0
1        CONTRIBUTOR  71950  3.893621     2.0
0       COLLABORATOR  18784  3.564789     1.0
5              OWNER  22771  1.619472     0.0

 Spearman Correlation (Author Association vs. Comments): -0.2375 (p=0.0000)

 Spearman Correlation (Label Count vs. Comments): 0.1787 (p=0.0000)

Top 10 Labels by Frequency:
[("'default': false", 356358), ("'description': ''}", 75401), ('[]', 70121), ("'default': true", 52916), ("'description': ''}]", 49616), ("'description': none}", 29800), ("'name': 'bug'", 26764), ("'color': 'ededed'", 19384), ("'description': none}]", 18060), ("'name': 'enhancement'", 16705)]


In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from collections import Counter

# === Load Data ===
csv_path = '/Users/hanfuhou/Downloads/2004to2024all.csv' 
df = pd.read_csv(csv_path, low_memory=False)

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df = df[df['created_at'].dt.year.between(2020, 2024)]


df['text'] = df['title'].fillna('') + ' ' + df['body'].fillna('')


tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf.fit_transform(df['text'])


lda = LatentDirichletAllocation(n_components=5, random_state=42)
topic_probs = lda.fit_transform(X_tfidf)

df['topic'] = np.argmax(topic_probs, axis=1)

# === Convert Comments & Stars Columns ===
df['comment_count'] = pd.to_numeric(df['comments'], errors='coerce').fillna(0)
df['stars'] = pd.to_numeric(df['score'], errors='coerce').fillna(0)

# === Topic Summary ===
topic_summary = df.groupby('topic').agg({
    'comment_count': ['mean'],
    'stars': ['mean'],
    'author_association': lambda x: dict(Counter(x))
}).reset_index()

print(topic_summary)

# === Top Keywords per Topic ===
feature_names = tfidf.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    top_words = [feature_names[i] for i in topic.argsort()[-10:][::-1]]
    print(f"Topic {idx} top words:", top_words)

  topic comment_count stars                                 author_association
                 mean  mean                                           <lambda>
0     0      4.096282   1.0  {'OWNER': 1817, 'NONE': 22730, 'COLLABORATOR':...
1     1      3.600888   1.0  {'COLLABORATOR': 3978, 'OWNER': 4855, 'NONE': ...
2     2      4.954966   1.0  {'COLLABORATOR': 5450, 'NONE': 27721, 'OWNER':...
3     3      3.068380   1.0  {'OWNER': 7899, 'NONE': 13063, 'MEMBER': 7084,...
4     4      5.932398   1.0  {'CONTRIBUTOR': 6395, 'NONE': 24743, 'OWNER': ...
Topic 0 top words: ['button', 'screen', 'focus', 'keyboard', 'https', 'issue', 'tab', 'behavior', 'bug', 'com']
Topic 1 top words: ['design', 'team', 'va', 'product', 'research', 'gov', 'review', 'labeled', 'content', 'accessibility']
Topic 2 top words: ['https', 'com', 'text', 'accessibility', 'images', 'color', 'image', 'user', 'contrast', 'githubusercontent']
Topic 3 top words: ['accessibility', 'https', 'aria', 'component', 'com', 'github'

In [11]:
import pandas as pd

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df = df[df['created_at'].dt.year.between(2020, 2024)]

# Ensure topic exists
if 'topic' not in df.columns:
    print("Missing 'topic' column")
else:
    df['topic'] = pd.to_numeric(df['topic'], errors='coerce').fillna(-1)
    df['comment_count'] = pd.to_numeric(df['comments'], errors='coerce').fillna(0)
    df['author_association'] = df['author_association'].fillna("UNKNOWN")

    # Group and summarize
    pivot = df.groupby(['author_association', 'topic'])['comment_count'].mean().unstack().fillna(0)

    print("\nAverage Comment Count per Author Association × Topic (2020–2024):")
    print(pivot)


Average Comment Count per Author Association × Topic (2020–2024):
topic                      0         1         2         3         4
author_association                                                  
COLLABORATOR        2.772344  3.610357  4.100550  3.038782  6.028637
CONTRIBUTOR         3.422950  3.710367  5.344027  2.969461  4.370446
MANNEQUIN           6.940476  0.000000  6.375000  2.000000  4.444444
MEMBER              3.479429  5.080208  6.035600  4.373235  5.743517
NONE                4.948438  4.233313  5.308863  3.747991  6.446429
OWNER               1.292790  1.169310  2.294406  1.000633  3.981804


In [12]:
import pandas as pd

# Convert 'created_at' to datetime
df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')

# Filter data for years 2020 to 2024
df_filtered = df[df['created_at'].dt.year.between(2020, 2024)]

# Ensure comment count is numeric
df_filtered['comment_count'] = pd.to_numeric(df_filtered['comments'], errors='coerce').fillna(0)

# Filter for Topic 4 only
topic_4_issues = df_filtered[df_filtered['topic'] == 4]

# Sort by comment count descending
top_topic_4_issues = topic_4_issues.sort_values(by='comment_count', ascending=False)

# Select relevant columns
top_topic_4_info = top_topic_4_issues[['title', 'author_association', 'comment_count']].head(30)

print("Top 30 Most-Commented Issues in Topic 4 (2020–2024):")
print(top_topic_4_info.to_string(index=False))

Top 30 Most-Commented Issues in Topic 4 (2020–2024):
                                                                                                                                title author_association  comment_count
                                                                                                                                 Root              OWNER           1105
                                                                                      macOS 11 Big Sur compatibility on Apple Silicon        CONTRIBUTOR            864
                                               [Auto Generated Report]com.intellij.openapi.diagnostic.RuntimeExceptionWithAttachments       COLLABORATOR            748
                                                                                                                     So, what's next?              OWNER            736
                                                                                                 Microsoft 

In [13]:
import pandas as pd
import numpy as np

csv_path = '/Users/hanfuhou/Downloads/2004to2024all.csv' 
df = pd.read_csv(csv_path, low_memory=False)

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df['created_year'] = df['created_at'].dt.year

# ➡️ Filter to years 2020–2024
df = df[df['created_year'].between(2020, 2024)]

# 2️⃣ Flag accessibility-labeled issues (in title or labels)
accessibility_keywords = ['a11y', 'accessibility', 'screenreader', 'aria', 'keyboard', 'alt text', 'alt-text', 'colorblind', 'blind', 'deaf', 'hearing', 'ally']
pattern = '|'.join(accessibility_keywords)

df['is_accessibility'] = (
    df['title'].fillna('').str.lower().str.contains(pattern) |
    df['labels'].fillna('').str.lower().str.contains(pattern)
)


yearly_issues = df.groupby('created_year').size().rename('total_issues')
yearly_a11y_issues = df[df['is_accessibility']].groupby('created_year').size().rename('accessibility_issues')

yearly_counts = pd.concat([yearly_issues, yearly_a11y_issues], axis=1).fillna(0).astype(int)


issue_status_yearly = df.groupby(['created_year', 'state']).size().unstack(fill_value=0)
issue_status_yearly['total'] = issue_status_yearly.sum(axis=1)
issue_status_yearly['%_open'] = issue_status_yearly['open'] / issue_status_yearly['total'] * 100
issue_status_yearly['%_closed'] = issue_status_yearly['closed'] / issue_status_yearly['total'] * 100


a11y_status_yearly = df[df['is_accessibility']].groupby(['created_year', 'state']).size().unstack(fill_value=0)
a11y_status_yearly['total'] = a11y_status_yearly.sum(axis=1)
a11y_status_yearly['%_open'] = a11y_status_yearly['open'] / a11y_status_yearly['total'] * 100
a11y_status_yearly['%_closed'] = a11y_status_yearly['closed'] / a11y_status_yearly['total'] * 100


print("\n Total & Accessibility Issues per Year (2020–2024):")
print(yearly_counts)

print("\n Issue Status (Open vs Closed) by Year (2020–2024):")
print(issue_status_yearly)

print("\n Accessibility Issues Status (Open vs Closed) by Year (2020–2024):")
print(a11y_status_yearly)


 Total & Accessibility Issues per Year (2020–2024):
              total_issues  accessibility_issues
created_year                                    
2020.0               38856                 14338
2021.0               40909                 15043
2022.0               44295                 16249
2023.0               50401                 17629
2024.0               57704                 18279

 Issue Status (Open vs Closed) by Year (2020–2024):
state         closed   open  total     %_open   %_closed
created_year                                            
2020.0         30823   8033  38856  20.673770  79.326230
2021.0         30921   9988  40909  24.415165  75.584835
2022.0         32858  11437  44295  25.820070  74.179930
2023.0         35295  15106  50401  29.971628  70.028372
2024.0         35935  21769  57704  37.725288  62.274712

 Accessibility Issues Status (Open vs Closed) by Year (2020–2024):
state         closed  open  total     %_open   %_closed
created_year                

In [121]:
import ast
from collections import Counter

csv_path = '/Users/hanfuhou/Downloads/2004to2024all.csv' 
df = pd.read_csv(csv_path, low_memory=False)

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df['created_year'] = df['created_at'].dt.year

# ➡️ Filter to years 2020–2024
df = df[df['created_year'].between(2020, 2024)]
def extract_labels(label_str):
    if pd.isna(label_str):
        return []
    try:
        label_list = ast.literal_eval(label_str)
        if isinstance(label_list, list):
            return [item.get('name', '').lower() for item in label_list if isinstance(item, dict)]
    except:
        return []
    return []

df['parsed_labels'] = df['labels'].apply(extract_labels)
def normalize_label(label):
    label = label.lower()
    label = label.strip()
    label = label.replace('-', ' ')
    label = label.replace('_', ' ')
    return label
all_labels = [normalize_label(label) for sublist in df['parsed_labels'] for label in sublist]
label_counts = Counter(all_labels)
print("Top 50 labels in accessibility issues:")
print(label_counts.most_common(50))

Top 50 labels in accessibility issues:
[('bug', 26764), ('enhancement', 16705), ('accessibility', 14675), ('frontend', 4792), ('a11y', 4302), ('good first issue', 4138), ('help wanted', 3461), ('epic', 3387), ('documentation', 3015), ('design', 2891), ('feature request', 2349), ('stale', 2329), ('question', 2129), ('vsa', 2016), ('feature', 1919), ('0. needs triage', 1873), ('ux', 1823), ('type: bug', 1638), ('collab cycle feedback', 1548), ('content', 1373), ('public websites', 1357), ('front end', 1309), ('backend', 1309), ('fluent ui react components (v9)', 1268), ('vsa debt', 1154), ('sitewide', 1145), ('needs triage', 1141), ('area: accessibility', 1140), ('p2', 1112), ('collaboration cycle', 1108), ('platform design system team', 1090), ('type:bug', 1084), ('hacktoberfest', 1079), ('staging', 1053), ('user story', 1026), ('wontfix', 989), ('needs info', 987), ('cc dashboard', 983), ('fluent ui react (v8)', 974), ('qa', 951), ('area accessibility', 926), ('hce checkin', 906), ('du

In [123]:
from collections import Counter
from rapidfuzz import process, fuzz

label_aliases = {
    'accessibility': [
        'a11y', 'accessibility', 'area: accessibility', 'area accessibility',
        'type a11y ♿', 'a accessibility', 'type:accessibility', 'type accessibility',
        'a11y defect 3', 'a11ywcag', 'a11yttvalidated', 'wcag', 'wcag/2.1/fixing',
        'has reproducible steps', 'has workaround'
    ],
    'bug': [
        'bug', '🐞 bug', 'type: bug', 'type: bug 🐛', 'type: bug :bug:', 
        'type:bug', 'issue bug', '[type] bug', 'defect level 3', 'invalid'
    ],
    'enhancement': [
        'enhancement', 'feature', 'feature request', 'type: enhancement',
        'type: feature', '[type] enhancement', 'improvement', 'planned work',
        'unplanned work', 'mvp', 'support', 'spike', 'task'
    ],
    'design': [
        'ui', 'ux', 'layout', 'design', 'frontend', 'front end', 'va.gov frontend',
        'ui/ux', 'role: design :pencil2:', 'experience profile', 'identity',
        'profile', 'authenticated experience'
    ],
    'documentation': [
        'docs', 'documentation', 'readme', 'examples', 'tutorial',
        'content', 'user story', 'no code attached yet', 'qa standards'
    ],
    'community': [
        'help wanted', 'good first issue', 'epic', 'discussion', 'question',
        'hacktoberfest accepted', 'gssoc', 'gssoc ext', 'awaiting user reply'
    ],
    'triage': [
        'needs info', 'needs triage', '0. needs triage', 'needs priority',
        'needs grooming', 'needs: author feedback', 'triage: done',
        'triaged', 'blocked', 'severity: 2', 'severity: 3', 'p0', 'p1', 'p2',
        'p3', 'high prio', 'high', 'level1', 'level2', 'launch blocking',
        'backlog refinement', 'backlog','ready for development',
        'in progress'
    ],
    'resolution': [
        'wontfix', 'resolution soft close', 'duplicate', 'status: fixed', 
        'status: no recent activity', 'status resolved', 'released'
    ],
    'internal': [
        'vsa', 'vsa debt', 'cc dashboard', 'vsp contact center', 'sitewide', 
        'public websites', 'platform design system team', 'staging', 
        'hce checkin', 'facilities v1', 'va mobile app old', 'cms team',
        'ia centralized team', 'content ia centralized team', 'mod*', 'mod/b*',
        'bmt team 1', '1010 team', 'sf acc', 'acc', 'df', 'eng', 'dev', 'global',
        'collab cycle feedback', 'collaboration cycle', 'collab cycle touchpoint',
        'midpoint review', 'milestone tgt', 'vsa claims appeals', 'vsa healthcare exp',
        'ia governance team', 'benefits crew', 'mobile', 'mobile api',
        'html', 'fluent ui react (v8)', 'fluent ui react components (v9)',
        'bot services', 'adaptivecards web', 'powertoys win32',
        'greenkeeper', 'dependencies', 'mend: dependency security vulnerability',
        'analytics insights', 'web vitals', 'lighthouse audit', 'sap sf',
        'product ac', 'product/pro'
    ]
}

# Reverse map each alias to its group
label_map = {}
for category, labels in label_aliases.items():
    for label in labels:
        norm_label = normalize_label(label)
        label_map[norm_label] = category



# alias_strings = list(label_map.keys())
# alias_cat = list(label_map.values())
# Build lookup arrays
alias_strings = list(label_map.keys())
alias_cat = list(label_map.values())

def fuzzy_classify(label, threshold=90):
    result = process.extractOne(
        normalize_label(label),
        alias_strings,
        scorer=fuzz.ratio
    )
    if result:
        best, score, idx = result
        return alias_cat[idx] if score >= threshold else 'uncategorized'
    else:
        return 'uncategorized'

def classify_label(label):
    norm = normalize_label(label)
    if norm in label_map:
        return label_map[norm]
    return fuzzy_classify(norm)

label_categories = [classify_label(label) for label in all_labels]
category_counts = Counter(label_categories)

# Print results
total = sum(category_counts.values())
print("\n Label category distribution (normalized):")
for category, count in category_counts.items():
    print(f"{category}: {count} ({count / total:.1%})")


 Label category distribution (normalized):
enhancement: 27041 (6.6%)
uncategorized: 236437 (57.8%)
documentation: 6546 (1.6%)
bug: 33389 (8.2%)
community: 15558 (3.8%)
triage: 12640 (3.1%)
resolution: 5071 (1.2%)
design: 14687 (3.6%)
internal: 30022 (7.3%)
accessibility: 27883 (6.8%)


In [112]:
uncategorized_labels = [label for label in all_labels if classify_label(label) == 'uncategorized']


In [113]:
from collections import Counter

uncategorized_counts = Counter(uncategorized_labels)
print(" Top 100 uncategorized labels:")
print(uncategorized_counts.most_common(100))

 Top 100 uncategorized labels:
[('collab cycle feedback', 1548), ('fluent ui react components (v9)', 1268), ('p2', 1112), ('collaboration cycle', 1108), ('fluent ui react (v8)', 974), ('qa', 951), ('analytics insights', 729), ('disability experience', 726), ('bmt team 1', 725), ('lighthouse audit', 662), ('web vitals', 662), ('content ia centralized team', 652), ('dev', 634), ('vsa claims appeals', 606), ('task', 585), ('hacktoberfest accepted', 575), ('identity', 556), ('p3', 549), ('p1', 548), ('mod*', 542), ('gssoc ext', 528), ('authenticated experience', 523), ('planned work', 482), ('invalid', 441), ('1. to develop', 437), ('experience profile', 405), ('🔧 code', 403), ('sev 3', 400), ('sap sf', 380), ('collab cycle touchpoint', 379), ('must have', 377), ('va mobile app old', 376), ('has reproducible steps', 375), ('cms team', 368), ('profile', 356), ('ia governance team', 356), ('mobile api', 338), ('acc', 337), ('df', 333), ('1010 team', 326), ('wcag/2.1/fixing', 326), ('bot serv

In [115]:
df.head(3)


Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,active_lock_reason,sub_issues_summary,body,reactions,timeline_url,performed_via_github_app,state_reason,score,created_year,parsed_labels
0,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://github.com/0-Gixty-0/IKEA-Capstone-Gro...,2611610313,I_kwDOMvcFN86bqgLJ,67,[TASK] - Redesigning PostDetailModal,...,,"{'total': 0, 'completed': 0, 'percent_complete...",## Description\nRedesign the PostDetailModal t...,{'url': 'https://api.github.com/repos/0-Gixty-...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,,completed,1.0,2024.0,[task]
1,https://api.github.com/repos/0-ana/love-runnin...,https://api.github.com/repos/0-ana/love-running,https://api.github.com/repos/0-ana/love-runnin...,https://api.github.com/repos/0-ana/love-runnin...,https://api.github.com/repos/0-ana/love-runnin...,https://github.com/0-ana/love-running/issues/4,2604918582,I_kwDOL48F7s6bQ-c2,4,USER STORY: Defect: “Sign Up” Button Not Displ...,...,,"{'total': 0, 'completed': 0, 'percent_complete...",**Description:**\r\nThe “Sign up” button is no...,{'url': 'https://api.github.com/repos/0-ana/lo...,https://api.github.com/repos/0-ana/love-runnin...,,,1.0,2024.0,[]
2,https://api.github.com/repos/00-Evan/shattered...,https://api.github.com/repos/00-Evan/shattered...,https://api.github.com/repos/00-Evan/shattered...,https://api.github.com/repos/00-Evan/shattered...,https://api.github.com/repos/00-Evan/shattered...,https://github.com/00-Evan/shattered-pixel-dun...,1292113681,I_kwDOAVceBc5NBBcR,1005,QoL Suggestion: Tiny gaps between the UI shoul...,...,,"{'total': 0, 'completed': 0, 'percent_complete...",My iPhone 12 Pro does not really have the tini...,{'url': 'https://api.github.com/repos/00-Evan/...,https://api.github.com/repos/00-Evan/shattered...,,not_planned,1.0,2022.0,[]


In [116]:
df['label_category'] = df['parsed_labels'].apply(
    lambda labels: classify_label(labels[0]) if labels else 'uncategorized'
)
df.to_csv("github_issues_with_label_category.csv", index=False)
print(" Labeled CSV saved.")

 Labeled CSV saved.


In [118]:
csv_path = '/Users/hanfuhou/Downloads/2004to2024all.csv' 
df = pd.read_csv(csv_path, low_memory=False)

df['created_at'] = pd.to_datetime(df['created_at'], errors='coerce')
df['created_year'] = df['created_at'].dt.year

# ➡️ Filter to years 2020–2024
df = df[df['created_year'].between(2020, 2024)]

In [119]:
import pandas as pd
import ast
from collections import Counter

Unnamed: 0,url,repository_url,labels_url,comments_url,events_url,html_url,id,node_id,number,title,...,type,active_lock_reason,sub_issues_summary,body,reactions,timeline_url,performed_via_github_app,state_reason,score,created_year
0,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,https://github.com/0-Gixty-0/IKEA-Capstone-Gro...,2611610313,I_kwDOMvcFN86bqgLJ,67,[TASK] - Redesigning PostDetailModal,...,,,"{'total': 0, 'completed': 0, 'percent_complete...",## Description\nRedesign the PostDetailModal t...,{'url': 'https://api.github.com/repos/0-Gixty-...,https://api.github.com/repos/0-Gixty-0/IKEA-Ca...,,completed,1.0,2024.0
