In [1]:
import pandas as pd

In [2]:
# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Sort by the 'followers' column in descending order and select the top 5
top_users = users_df.sort_values(by='followers', ascending=False).head(5)

# Get the login names as a comma-separated string
top_users_logins = ", ".join(top_users['login'].tolist())
print("Top 5 users by followers:", top_users_logins)

Top 5 users by followers: getify, benawad, steveklabnik, cloudflare, jbogard


In [3]:
users_df = pd.read_csv("users.csv")

# Convert 'created_at' to datetime format
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Sort by 'created_at' in ascending order and select the top 5
earliest_users = users_df.sort_values(by='created_at', ascending=True).head(5)

# Get the login names as a comma-separated string
earliest_users_logins = ", ".join(earliest_users['login'].tolist())
print("Earliest 5 registered users:", earliest_users_logins)

Earliest 5 registered users: jnewland, joshknowles, hassox, dan, damon


In [4]:
# Load repositories.csv into a DataFrame
repos_df = pd.read_csv("repositories.csv")

# Filter out rows where 'license_name' is missing or empty
repos_df = repos_df[repos_df['license_name'].notna() & (repos_df['license_name'] != '')]

# Count the occurrences of each license_name
license_counts = repos_df['license_name'].value_counts()

# Get the top 3 most common licenses
top_licenses = ", ".join(license_counts.head(3).index)
print("Top 3 most popular licenses:", top_licenses)

Top 3 most popular licenses: mit, apache-2.0, other


In [None]:
# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Clean up the 'company' column
users_df['company'] = users_df['company'].str.strip().str.lstrip('@').str.upper()

# Count occurrences of each company and get the most common one
most_common_company = users_df['company'].value_counts().idxmax()

print("Company with the most developers:", most_common_company)

Company with the most developers: GOOGLE


In [6]:
# Load repositories.csv into a DataFrame
repos_df = pd.read_csv("repositories.csv")

# Filter out rows where 'language' is missing or empty
repos_df = repos_df[repos_df['language'].notna() & (repos_df['language'] != '')]

# Count occurrences of each language and get the most common one
most_common_language = repos_df['language'].value_counts().idxmax()

print("Most popular programming language:", most_common_language)

Most popular programming language: JavaScript


In [11]:
# Load users.csv and repositories.csv into DataFrames
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")

# Convert 'created_at' to datetime format and filter for users who joined after 2020
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
recent_users = users_df[users_df['created_at'] > '2020-12-31']

# Filter repositories to include only those from recent users
recent_user_logins = recent_users['login'].tolist()
recent_repos_df = repos_df[repos_df['login'].isin(recent_user_logins)]

# Filter out rows with missing or empty 'language'
recent_repos_df = recent_repos_df[recent_repos_df['language'].notna() & (recent_repos_df['language'] != '')]

# Count occurrences of each language and get the second most common one
language_counts = recent_repos_df['language'].value_counts()
second_most_common_language = language_counts.index[1]  # Get the second most common language

print("Second most popular programming language among users who joined after 2020:", second_most_common_language)

Second most popular programming language among users who joined after 2020: JavaScript


In [12]:
# Load repositories.csv into a DataFrame
repos_df = pd.read_csv("repositories.csv")

# Filter out rows where 'language' or 'stargazers_count' is missing
repos_df = repos_df[repos_df['language'].notna() & (repos_df['language'] != '') & repos_df['stargazers_count'].notna()]

# Group by 'language' and calculate the average number of stars
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Find the language with the highest average stars
most_popular_language_by_stars = avg_stars_per_language.idxmax()
highest_avg_stars = avg_stars_per_language.max()

print("Language with the highest average number of stars per repository:", most_popular_language_by_stars)
print("Average stars:", highest_avg_stars)

Language with the highest average number of stars per repository: Fennel
Average stars: 2444.0


In [13]:
# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Calculate leader_strength as followers / (1 + following)
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and select the top 5
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Get the login names as a comma-separated string
top_leader_logins = ", ".join(top_leaders['login'].tolist())
print("Top 5 users by leader_strength:", top_leader_logins)

Top 5 users by leader_strength: getify, cloudflare, benawad, oracle, ContinuumIO


In [14]:

# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Calculate the correlation between 'followers' and 'public_repos'
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the correlation rounded to 3 decimal places
print("Correlation between followers and public repositories:", round(correlation, 3))

Correlation between followers and public repositories: 0.15


In [17]:
import pandas as pd
from scipy.stats import linregress

# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Perform linear regression on followers vs. public_repos
slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])

# Print the slope rounded to 3 decimal places
print("Estimated increase in followers per additional repository:", round(slope, 3))


Estimated increase in followers per additional repository: 4.09


In [26]:
import pandas as pd

# Load repositories.csv into a DataFrame
repos_df = pd.read_csv("repositories.csv")

# Filter out rows with missing values in has_projects and has_wiki
repos_df = repos_df[repos_df['has_projects'].notna() & repos_df['has_wiki'].notna()]

# Convert boolean values to numeric: true=1, false=0
repos_df['has_projects'] = repos_df['has_projects'].replace({'true': 1, 'false': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].replace({'true': 1, 'false': 0})

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to three decimal places
print("Correlation between projects enabled and wiki enabled:", round(correlation, 3))


Correlation between projects enabled and wiki enabled: 0.276


In [30]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Filter out users with 'none' in hireable status
filtered_users_df = users_df[users_df['hireable'] != 'none']

# Calculate average following for hireable users
avg_following_hireable = filtered_users_df[filtered_users_df['hireable'] == 'true']['following'].mean()

# Calculate average following for non-hireable users (those marked as 'none')
avg_following_non_hireable = users_df[users_df['hireable'] == 'none']['following'].mean()

# Calculate the difference
difference = avg_following_hireable - avg_following_non_hireable

# Print the result rounded to three decimal places
print("Average following (hireable):", avg_following_hireable)
print("Average following (non-hireable):", avg_following_non_hireable)
print("Difference in average following:", round(difference, 3))


Average following (hireable): 185.21
Average following (non-hireable): 75.18598382749326
Difference in average following: 110.024


In [22]:
import pandas as pd
import statsmodels.api as sm

# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Filter out users without bios
users_with_bio = users_df[users_df['bio'].notna() & (users_df['bio'] != '')]

# Calculate the length of the bio in words
users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().str.len()

# Prepare the independent (X) and dependent (y) variables for regression
X = users_with_bio['bio_word_count']
y = users_with_bio['followers']

# Add a constant to the independent variable (required for statsmodels)
X = sm.add_constant(X)

# Perform the regression
model = sm.OLS(y, X).fit()

# Get the slope (coefficient) for bio_word_count
slope = model.params['bio_word_count']

# Print the result rounded to 3 decimal places
print("Regression slope of followers on bio word count:", round(slope, 3))


Regression slope of followers on bio word count: 9.195


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().str.len()


In [23]:
import pandas as pd

# Load repositories.csv into a DataFrame
repos_df = pd.read_csv("repositories.csv")

# Convert the 'created_at' column to datetime format
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for weekend days (Saturday = 5, Sunday = 6)
repos_df['weekday'] = repos_df['created_at'].dt.weekday
weekend_repos = repos_df[repos_df['weekday'].isin([5, 6])]

# Group by user login and count the number of repositories created
user_repo_counts = weekend_repos.groupby('login').size().reset_index(name='repo_count')

# Sort by repo_count in descending order and get the top 5
top_users = user_repo_counts.sort_values(by='repo_count', ascending=False).head(5)

# Extract the login of the top 5 users
top_user_logins = top_users['login'].tolist()

# Print the logins in comma-separated format
print("Top 5 users who created the most repositories on weekends:", ', '.join(top_user_logins))


Top 5 users who created the most repositories on weekends: FellowTraveler, realityexpander, OR13, PaulBratslavsky, skeptycal


In [24]:
import pandas as pd

# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Calculate the number of hireable users with email addresses
hireable_with_email = users_df[(users_df['hireable'] == 'true') & (users_df['email'].notna() & (users_df['email'] != ''))]
total_hireable = users_df[users_df['hireable'] == 'true']

# Calculate the fraction of hireable users with email addresses
fraction_hireable_with_email = len(hireable_with_email) / len(total_hireable) if len(total_hireable) > 0 else 0

# Calculate the number of non-hireable users with email addresses
non_hireable_with_email = users_df[(users_df['hireable'] == 'false') & (users_df['email'].notna() & (users_df['email'] != ''))]
total_non_hireable = users_df[users_df['hireable'] == 'false']

# Calculate the fraction of non-hireable users with email addresses
fraction_non_hireable_with_email = len(non_hireable_with_email) / len(total_non_hireable) if len(total_non_hireable) > 0 else 0

# Calculate the difference between the two fractions
email_fraction_difference = round(fraction_hireable_with_email - fraction_non_hireable_with_email, 3)

# Print the result
print("Difference in email sharing between hireable and non-hireable users:", email_fraction_difference)


Difference in email sharing between hireable and non-hireable users: 0.52


In [25]:
import pandas as pd
from collections import Counter

# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Drop rows with missing names
users_df = users_df[users_df['name'].notna()]

# Extract surnames
users_df['surname'] = users_df['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = Counter(users_df['surname'])

# Find the maximum occurrence
max_count = max(surname_counts.values())

# Identify the most common surnames (handle ties)
most_common_surnames = [surname for surname, count in surname_counts.items() if count == max_count]

# Sort surnames alphabetically
most_common_surnames.sort()

# Join them in a comma-separated string
most_common_surnames_str = ', '.join(most_common_surnames)

# Print the result
print("Most common surname(s):", most_common_surnames_str)


Most common surname(s): Moore, Smith
