In [None]:
# Filter out repositories that have fewer than 10 developers, are forks, do not belong to an organization, are documentation-type repositories, or have no primary language.
# Read the CSV file, drop the repositories that do not meet the criteria, and write the remaining ones to a new CSV file.
import pandas as pd
import numpy as np
from pprint import pprint

# Input listing of repositories; every later cell works from the frame loaded here.
file_path = "output/repos_100+_sshproto_with_size.csv"
df = pd.read_csv(file_path)

# Sanity-check the load: column dtypes and non-null counts first, then the
# first five rows rendered as plain text.
df.info()
preview_rows = df.head()
print(preview_rows)

In [None]:
import matplotlib.pyplot as plt

# Tally repositories per primary language and keep the 20 most frequent.
# value_counts() sorts in descending order and skips missing values by default.
language_counts = df["primaryLanguage"].value_counts().head(20)

# Draw the tally as a bar chart on an explicitly created figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
language_counts.plot(kind="bar", color="skyblue", ax=ax)
ax.set_title("Number of Repositories by Primary Language")
ax.set_xlabel("Primary Language")
ax.set_ylabel("Count")
# Tilt the language names so neighbouring labels do not overlap.
ax.tick_params(axis="x", labelrotation=45)
plt.show()

In [None]:
# Check for duplicate repositories. keep=False flags every row whose URL
# occurs more than once, so the count below includes all copies of a
# duplicated repository, not just the extra ones.
duplicates = df[df.duplicated(subset="url", keep=False)]
# Number of rows that share a URL with at least one other row
print(duplicates.shape[0])
# Print name, url, isFork columns
# print(duplicates[["name", "url", "isFork", "stargazerCount"]])

# Remove duplicate repositories, keeping the first row seen for each URL.
# `df` is rebound to the de-duplicated copy instead of using inplace=True, so
# the frame object that was loaded from disk is not mutated behind the back
# of anything else that still references it.
# NOTE(review): the primary-language bar chart is computed from `df` before
# this step, so its counts still include duplicated rows.
df = df.drop_duplicates(subset="url")

In [None]:
# Spot-check: show every row whose repository name is exactly
# "android_frameworks_base".
name_matches = df["name"].eq("android_frameworks_base")
df.loc[name_matches]

In [None]:
# Primary languages to keep; repositories in any other language are dropped.
language_set = {
    "Python", "JavaScript", "TypeScript", "Go", "Java", "C++", "C",
    "Ruby", "PHP", "C#", "Swift", "Rust", "Objective-C",
}

# Two filters applied in sequence:
#   1. keep only rows whose primary language is in language_set;
#   2. drop rows whose name is missing (NaN).
# NOTE(review): missing names are presumably repositories literally called
# "null"/"nan"/"NA", which read_csv parses as missing by default — confirm.
df = (
    df.loc[df["primaryLanguage"].isin(language_set)]
    .dropna(subset=["name"])
)
df.info()


In [None]:
# Number of rows still flagged as forks at this point in the pipeline.
fork_rows = df.loc[df["isFork"] == True]
len(fork_rows)

In [None]:
# Stage 1: exclude forks (keep rows where isFork is False).
non_fork_df = df.loc[df["isFork"] == False]
print(len(non_fork_df))

# Stage 2: keep repositories with at least 10 assignable users, which this
# notebook uses as its measure of "developers".
has_enough_devs = non_fork_df["assignableUsers"] >= 10
multi_devs_df = non_fork_df.loc[has_enough_devs]
print(len(multi_devs_df))

# Stage 3: keep repositories that belong to an organization.
org_df = multi_devs_df.loc[multi_devs_df["isInOrganization"] == True]

# Stage 4: drop repositories larger than 4 GB. diskUsage is compared in
# kilobytes, so the threshold is 4 * 1024 * 1024 KB. Rows with a missing
# diskUsage fail the comparison and are dropped as well.
max_disk_usage = 4 * 1024 * 1024
size_filtered_df = org_df.loc[org_df["diskUsage"] <= max_disk_usage]

# Row counts after the organization filter and after the size filter.
print(len(org_df))
print(len(size_filtered_df))

In [None]:
# Repositories per primary language among the organization-owned set.
# This is computed on org_df, i.e. before the disk-usage cap is applied.
language_column = org_df["primaryLanguage"]
language_column.value_counts()

In [None]:
# Reduce the final selection to the two columns that are exported: name and url.
output_df = size_filtered_df.loc[:, ["name", "url"]]
# Write the result as CSV; index=False leaves the DataFrame index out of the file.
output_df.to_csv(path_or_buf="output/100+stars_4GB-_multidev_org_lang.csv", index=False)