In [33]:
import pandas as pd
from datetime import datetime
import numpy as np
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression


# Load the user and repository tables from their CSV exports.
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")

# Both tables have a 'created_at' column; convert each one to pandas datetimes
# so the date comparison and the .dt accessor used further down work on them.
for _frame in (users_df, repos_df):
    _frame['created_at'] = pd.to_datetime(_frame['created_at'])

# 1. Top 5 users in Dublin with the highest followers
# Take the five rows with the largest follower counts, then keep their logins.
most_followed = users_df.nlargest(n=5, columns='followers')
top_5_followers = most_followed['login'].tolist()

# 2. 5 earliest registered GitHub users in Dublin
# The smallest 'created_at' values are the oldest accounts.
first_registered = users_df.nsmallest(n=5, columns='created_at')
earliest_5_users = first_registered['login'].tolist()

# 3. Top 3 most popular licenses
# value_counts() orders labels by descending frequency and leaves out missing
# values, so the first three index labels are the three most common licenses.
license_counts = repos_df['license_name'].value_counts()
top_3_licenses = license_counts.index[:3].tolist()

# 4. Company with majority of developers
company_counts = users_df['company'].value_counts()
top_company = company_counts.idxmax()

# 5. Most popular programming language
language_counts = repos_df['language'].value_counts()
top_language = language_counts.idxmax()

# 6. Second most popular language among users who joined after 2020
# NOTE(review): the cutoff is 2020-01-01, so accounts created during 2020 are
# included -- confirm this matches the intended reading of "after 2020".
joined_after_cutoff = users_df['created_at'] > '2020-01-01'
recent_logins = users_df[joined_after_cutoff]['login']
# Keep only repositories owned by those users, then rank their languages.
recent_repos = repos_df[repos_df['login'].isin(recent_logins)]
recent_language_counts = recent_repos['language'].value_counts()
# Position 1 is the runner-up (position 0 is the most common language).
second_language_post_2020 = recent_language_counts.index[1]

# 7. Language with highest average stars per repo
avg_stars_by_language = repos_df.groupby('language')['stargazers_count'].mean()
highest_avg_star_language = avg_stars_by_language.idxmax()

# 8. Top 5 users by leader_strength
# leader_strength = followers / (1 + following); the added 1 keeps the ratio
# defined for users who follow nobody. The result is stored as a new column.
following_plus_one = users_df['following'].add(1)
users_df['leader_strength'] = users_df['followers'].div(following_plus_one)
strongest_leaders = users_df.nlargest(n=5, columns='leader_strength')
top_5_leader_strength = strongest_leaders['login'].tolist()

# 9. Correlation between followers and public repos
# Series.corr() hands back a plain Python float (NaN) when the two columns
# share no rows, and a plain float has no .round() method. The built-in
# round() works for both that case and NumPy scalars, and it matches how the
# regression slope below is rounded.
followers_repos_corr = round(
    users_df['followers'].corr(users_df['public_repos']), 3
)

# 10. Regression slope of followers on repos
# Least-squares fit with public_repos as the single predictor and followers as
# the target; coef_[0] is the slope for that one feature.
X = users_df[['public_repos']]
y = users_df['followers']
reg_model = LinearRegression().fit(X, y)
slope_followers_repos = round(reg_model.coef_[0], 3)

# 11. Correlation between projects and wikis enabled
# Built-in round() is used instead of the .round() method: Series.corr()
# returns a plain Python float (NaN) when there is no data to correlate, and
# a plain float has no .round() method.
projects_wikis_corr = round(
    repos_df['has_projects'].corr(repos_df['has_wiki']), 3
)

# 12. Difference in average following for hireable users vs. non-hireable
# The explicit comparisons with True / False are deliberate: rows whose
# 'hireable' value is missing match neither mask and are left out of both
# averages.
is_hireable = users_df['hireable'] == True
is_not_hireable = users_df['hireable'] == False
hireable_following_diff = round(
    users_df[is_hireable]['following'].mean()
    - users_df[is_not_hireable]['following'].mean(),
    3,
)

# 13. Regression slope of followers on bio word count
# Only users that have a bio take part in the fit; work on a copy so the new
# column is not written back into users_df.
users_with_bios = users_df.dropna(subset=['bio']).copy()
bio_words = users_with_bios['bio'].str.split()  # split on whitespace
users_with_bios['bio_word_count'] = bio_words.apply(len)
bio_model = LinearRegression().fit(
    users_with_bios[['bio_word_count']],
    users_with_bios['followers'],
)
# coef_[0] is the slope for the single bio_word_count feature.
bio_followers_slope = bio_model.coef_[0].round(3)

# 14. Top 5 users by repos created on weekends
# dayofweek numbers Monday as 0, so values 5 and 6 are Saturday and Sunday.
# The flag is stored on repos_df as a new 'is_weekend' column.
repos_df['is_weekend'] = repos_df['created_at'].dt.dayofweek.ge(5)
weekend_repos = repos_df[repos_df['is_weekend']]
weekend_repo_counts = weekend_repos.groupby('login').size()
top_5_weekend_repos = weekend_repo_counts.nlargest(5).index.tolist()

# 15. Fraction of users with email when hireable vs. non-hireable
# The mean of a boolean "has an email" flag is the fraction of users in the
# group that list one. Rows with a missing 'hireable' value match neither mask.
has_email = users_df['email'].notna()
hireable_mask = users_df['hireable'] == True
non_hireable_mask = users_df['hireable'] == False
email_fraction_hireable = has_email[hireable_mask].mean()
email_fraction_non_hireable = has_email[non_hireable_mask].mean()
email_hireable_diff = round(
    email_fraction_hireable - email_fraction_non_hireable, 3
)

# 16. Most common surname
# Users without a name are ignored. The surname is taken to be the last
# whitespace-separated word of the name.
new_users = users_df.dropna(subset=['name']).copy()
name_parts = new_users['name'].str.split()
new_users['surname'] = name_parts.str.get(-1).str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
# Several surnames can tie for the top count; keep them all, alphabetically.
is_most_common = surname_counts == max_count
common_surnames = sorted(surname_counts[is_most_common].index.tolist())

In [35]:
# Print every answer on its own line as "<label> <value>".
# List-valued answers are shown comma-separated with no spaces.
_join = ",".join

print("1. Top 5 users by followers:", _join(top_5_followers))
print("2. Earliest 5 users:", _join(earliest_5_users))
print("3. Top 3 licenses:", _join(top_3_licenses))
print("4. Company with most developers:", top_company)
print("5. Most popular language:", top_language)
print("6. Second most popular language post-2020:", second_language_post_2020)
print("7. Language with highest avg stars per repo:", highest_avg_star_language)
print("8. Top 5 by leader strength:", _join(top_5_leader_strength))
print("9. Correlation followers/repos:", followers_repos_corr)
print("10. Slope followers on repos:", slope_followers_repos)
print("11. Correlation projects and wiki:", projects_wikis_corr)
print("12. Avg following diff (hireable - non-hireable):", hireable_following_diff)
print("13. Slope followers on bio word count:", bio_followers_slope)
print("14. Top 5 by weekend repos:", _join(top_5_weekend_repos))
print("15. Email sharing diff (hireable - non-hireable):", email_hireable_diff)
print("16. Most common surname(s):", _join(common_surnames))


1. Top 5 users by followers: orta,jeromeetienne,jonataslaw,steventroughtonsmith,axic
2. Earliest 5 users: paulca,adrian,GavinJoyce,amir,ciaranlee
3. Top 3 licenses: mit,apache-2.0,other
4. Company with most developers: MICROSOFT
5. Most popular language: JavaScript
6. Second most popular language post-2020: JavaScript
7. Language with highest avg stars per repo: MDX
8. Top 5 by leader strength: flaviohenriquealmeida,zalando,AnikSarker,wix,CardinalHealth
9. Correlation followers/repos: 0.55
10. Slope followers on repos: 2.781
11. Correlation projects and wiki: 0.315
12. Avg following diff (hireable - non-hireable): 48.668
13. Slope followers on bio word count: 7.531
14. Top 5 by weekend repos: orta,joshuacassidy,No9,wafuwafu13,lmammino
15. Email sharing diff (hireable - non-hireable): 0.111
16. Most common surname(s): Chen,Kenny,O'Sullivan,Quinn
