### Import necessary libraries

In [53]:
import pandas as pd
from scipy.stats import linregress
import statsmodels.api as sm
from collections import Counter
import statsmodels.api as sm

### Top 5 users in Austin

In [54]:
# Load users.csv into a DataFrame
users_df = pd.read_csv("users.csv")

# Clean the location column
users_df['location'] = users_df['location'].str.strip().str.lower()

# Check how Austin is represented
austin_users = users_df[users_df['location'].str.contains('austin')]

# Sort by followers in descending order and get the top 5 users
top_users = austin_users.sort_values(by='followers', ascending=False).head(5)

# Extract the login of these users
top_user_logins = top_users['login'].tolist()

# Print the result as a comma-separated string
print("Top 5 users in Austin by followers:", ", ".join(top_user_logins))

Top 5 users in Austin by followers: getify, benawad, steveklabnik, cloudflare, jbogard


### Company with the most developers

In [55]:
# Clean up the 'company' column
users_df['company'] = users_df['company'].str.strip().str.lstrip('@').str.upper()

# Count occurrences of each company and get the most common one
most_common_company = users_df['company'].value_counts().idxmax()

print("Company with the most developers:", most_common_company)

Company with the most developers: GOOGLE


### Most popular programming language

In [56]:
# Load repositories.csv into a DataFrame
repos_df = pd.read_csv("repositories.csv")

# Filter out rows where 'language' is missing or empty
repos_df = repos_df[repos_df['language'].notna() & (repos_df['language'] != '')]

# Count occurrences of each language and get the most common one
most_common_language = repos_df['language'].value_counts().idxmax()

print("Most popular programming language:", most_common_language)

Most popular programming language: JavaScript


### Second most popular programming language

In [57]:
# Convert 'created_at' to datetime format and filter for users who joined after 2020
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
recent_users = users_df[users_df['created_at'] > '2020-12-31']

# Filter repositories to include only those from recent users
recent_user_logins = recent_users['login'].tolist()
recent_repos_df = repos_df[repos_df['login'].isin(recent_user_logins)]

# Filter out rows with missing or empty 'language'
recent_repos_df = recent_repos_df[recent_repos_df['language'].notna() & (recent_repos_df['language'] != '')]

# Count occurrences of each language and get the second most common one
language_counts = recent_repos_df['language'].value_counts()
second_most_common_language = language_counts.index[1]  # Get the second most common language

print("Second most popular programming language among users who joined after 2020:", second_most_common_language)

Second most popular programming language among users who joined after 2020: JavaScript


### Average number of stars per repo

In [58]:
# Filter out rows where 'language' or 'stargazers_count' is missing
repos_df = repos_df[repos_df['language'].notna() & (repos_df['language'] != '') & repos_df['stargazers_count'].notna()]

# Group by 'language' and calculate the average number of stars
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Find the language with the highest average stars
most_popular_language_by_stars = avg_stars_per_language.idxmax()
highest_avg_stars = avg_stars_per_language.max()

print("Language with the highest average number of stars per repository:", most_popular_language_by_stars)
print("Average stars:", highest_avg_stars)

Language with the highest average number of stars per repository: Fennel
Average stars: 2444.0


### Top 5 users by leader_strength

In [59]:
# Calculate leader_strength as followers / (1 + following)
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort by leader_strength in descending order and select the top 5
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)

# Get the login names as a comma-separated string
top_leader_logins = ", ".join(top_leaders['login'].tolist())
print("Top 5 users by leader_strength:", top_leader_logins)

Top 5 users by leader_strength: getify, cloudflare, benawad, oracle, ContinuumIO


### Correlation between followers and public repo

In [60]:
# Calculate the correlation between 'followers' and 'public_repos'
correlation = users_df['followers'].corr(users_df['public_repos'])

# Print the correlation rounded to 3 decimal places
print("Correlation between followers and public repositories:", round(correlation, 3))

Correlation between followers and public repositories: 0.15


### Estimated increase in followers per additional repo

In [61]:
# Perform linear regression on followers vs. public_repos
slope, intercept, r_value, p_value, std_err = linregress(users_df['public_repos'], users_df['followers'])

# Print the slope rounded to 3 decimal places
print("Estimated increase in followers per additional repository:", round(slope, 3))

Estimated increase in followers per additional repository: 4.09


### Correlation between projects enabled and wiki enabled

In [62]:
# Filter out rows with missing values in has_projects and has_wiki
repos_df = repos_df[repos_df['has_projects'].notna() & repos_df['has_wiki'].notna()]

# Convert boolean values to numeric: true=1, false=0
repos_df['has_projects'] = repos_df['has_projects'].replace({'true': 1, 'false': 0})
repos_df['has_wiki'] = repos_df['has_wiki'].replace({'true': 1, 'false': 0})

# Calculate the correlation between has_projects and has_wiki
correlation = repos_df['has_projects'].corr(repos_df['has_wiki'])

# Print the correlation rounded to three decimal places
print("Correlation between projects enabled and wiki enabled:", round(correlation, 3))


Correlation between projects enabled and wiki enabled: 0.316


### Average following (hireable)

In [63]:
# Filter out users with 'none' in hireable status
filtered_users_df = users_df[users_df['hireable'] != 'none']

# Calculate average following for hireable users
avg_following_hireable = filtered_users_df[filtered_users_df['hireable'] == 'true']['following'].mean()

# Calculate average following for non-hireable users (those marked as 'none')
avg_following_non_hireable = users_df[users_df['hireable'] == 'none']['following'].mean()

# Calculate the difference
difference = avg_following_hireable - avg_following_non_hireable

# Print the result rounded to three decimal places
print("Average following (hireable):", avg_following_hireable)
print("Average following (non-hireable):", avg_following_non_hireable)
print("Difference in average following:", round(difference, 3))


Average following (hireable): 185.21
Average following (non-hireable): 75.18598382749326
Difference in average following: 110.024


### Regression slope of followers on bio word count

In [64]:
# Filter out users without bios
users_with_bio = users_df[users_df['bio'].notna() & (users_df['bio'] != '')]

# Calculate the length of the bio in words
users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().str.len()

# Prepare the independent (X) and dependent (y) variables for regression
X = users_with_bio['bio_word_count']
y = users_with_bio['followers']

# Add a constant to the independent variable (required for statsmodels)
X = sm.add_constant(X)

# Perform the regression
model = sm.OLS(y, X).fit()

# Get the slope (coefficient) for bio_word_count
slope = model.params['bio_word_count']

# Print the result rounded to 3 decimal places
print("Regression slope of followers on bio word count:", round(slope, 3))

Regression slope of followers on bio word count: 9.195


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().str.len()


### Top 5 users who created the most repos on weekends

In [65]:
# Load your dataset
df = pd.read_csv('users.csv')  # Adjust this line to your actual data source

# Convert 'created_at' to datetime
df['created_at'] = pd.to_datetime(df['created_at'])

# Extract the day of the week (0=Monday, 6=Sunday)
df['day_of_week'] = df['created_at'].dt.dayofweek

# Filter for weekends (5=Saturday, 6=Sunday)
weekend_repos = df[df['day_of_week'].isin([5, 6])]

# Count the number of repositories created by each user
repo_count = weekend_repos['login'].value_counts()

# Get the top 5 users who created the most repositories
top_5_users = repo_count.nlargest(5)

# Format the output
top_5_users_string = ', '.join(top_5_users.index)
print(f"Top 5 users who created the most repositories on weekends: {top_5_users_string}")


Top 5 users who created the most repositories on weekends: getify, GeostatsGuy, pfrazee, dhg, thirtythreeforty


### Difference in fractions of users with email

In [66]:
# Check for missing values in email and hireable columns
missing_email_count = df['email'].isnull().sum()
missing_hireable_count = df['hireable'].isnull().sum()

# Total user counts
total_users = len(df)

# Calculate fractions of users with emails based on hireable status
fraction_hireable_with_email = df[df['hireable'] == 'true']['email'].notnull().mean()
fraction_non_hireable_with_email = df[df['hireable'] != 'true']['email'].notnull().mean()

# Calculate the difference
difference = fraction_hireable_with_email - fraction_non_hireable_with_email

# Format the output to 3 decimal places
difference_formatted = round(difference, 3)
print(f"Difference in fractions of users with email: {difference_formatted}")


Difference in fractions of users with email: 0.029


### Most common surnames

In [67]:
# Drop rows with missing names
df = df.dropna(subset=['name'])

# Extract surnames by splitting names and taking the last word
df['surname'] = df['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = Counter(df['surname'])

# Find the maximum count
max_count = max(surname_counts.values())

# Get all surnames that have the maximum count
most_common_surnames = sorted([surname for surname, count in surname_counts.items() if count == max_count])

# Format the output
most_common_surnames_str = ', '.join(most_common_surnames)
print(f"Most common surname(s): {most_common_surnames_str}")


Most common surname(s): Moore, Smith


### Correlation between followers and repos

In [68]:
# Filter users based in Austin
austin_users_df = users_df[users_df['location'].str.contains('Austin', na=False)]

# Calculate the correlation between followers and public repositories
correlation = austin_users_df['followers'].corr(austin_users_df['public_repos'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between followers and repos: {correlation:.3f}")


Correlation between followers and repos: nan


### Regression slope of followers on repos

In [69]:
# Select relevant columns and drop missing values
df_repos_followers = df[['public_repos', 'followers']].dropna()

# Define independent (X) and dependent (y) variables
X = df_repos_followers['public_repos']
y = df_repos_followers['followers']

# Add a constant to the independent variable for intercept
X = sm.add_constant(X)

# Fit the regression model
model = sm.OLS(y, X).fit()

# Get the slope coefficient for public_repos
slope = model.params['public_repos']
print(f"Regression slope of followers on repos: {slope:.3f}")


Regression slope of followers on repos: 4.096
