In [34]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from collections import Counter

In [2]:
# Load the users and repositories tables that the analysis cells read from.
users_df, repo_df = (
    pd.read_csv(csv_path) for csv_path in ('users.csv', 'repositories.csv')
)

In [42]:
# Q1: logins of the five users with the highest follower counts, comma-separated.

followers_desc = users_df.sort_values(by='followers', ascending=False)
top_users = followers_desc.iloc[:5]
top_user_logins = ','.join(top_users['login'])
top_user_logins

'tarsius,aalmiray,marcoroth,klmr,MrNeRF'

In [43]:
# Q2: logins of the five users with the smallest created_at, comma-separated.

by_created_at = users_df.sort_values(by='created_at')  # ascending is the default
earliest_users = by_created_at.iloc[:5]
earliest_user_logins = ','.join(earliest_users['login'])
earliest_user_logins

'bennyzen,aalmiray,pvillega,tarsius,amaunz'

In [44]:
# Q3: the three most frequent license names, comma-separated.

# value_counts() skips missing values by default and sorts by frequency.
license_counts = repo_df['license_name'].value_counts()
top_lic = ','.join(license_counts.index[:3])
top_lic

'MIT License,Apache License 2.0,Other'

In [45]:
# Q4: the company value that occurs most often in the users table.

company_counts = users_df['company'].value_counts()
most_common_company = company_counts.idxmax()
most_common_company

'UNIVERSITY OF BASEL'

In [46]:
# Q5: the language value that occurs most often in the repositories table.

language_frequency = repo_df['language'].value_counts()
most_popular_lang = language_frequency.idxmax()
most_popular_lang

'JavaScript'

In [47]:
# Q6: second most frequent repository language among users whose created_at
# is later than 2020-01-01.

# Parse the timestamps so the cutoff comparison below is done on datetimes.
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

joined_after_cutoff = users_df['created_at'] > '2020-01-01'
recent_users = users_df[joined_after_cutoff]

# Keep only the repositories owned by those users (matched on login).
owned_by_recent = repo_df['login'].isin(recent_users['login'])
recent_repos = repo_df[owned_by_recent]

language_counts = recent_repos['language'].value_counts()
second_most_popular = language_counts.nlargest(2).index[1]
second_most_popular

'PHP'

In [48]:
# Q7: language whose repositories have the highest mean stargazers_count.

mean_stars_by_language = repo_df.groupby('language')['stargazers_count'].mean()
language_avg_stars = mean_stars_by_language.idxmax()
language_avg_stars

'PureScript'

In [49]:
# Q8: top five users by leader_strength = followers / (1 + following),
# reported as comma-separated logins.

# The +1 in the denominator keeps the ratio finite when following is 0.
users_df['leader_strength'] = users_df['followers'] / (users_df['following'] + 1)

strongest_first = users_df.sort_values(by='leader_strength', ascending=False)
top_5_leaders = ','.join(strongest_first['login'].iloc[:5])
top_5_leaders

'dpryan79,wasserth,ravage84,elanmart,quadbiolab'

In [53]:
# Q9: correlation (pandas default: Pearson) between followers and public_repos.

follower_counts = users_df['followers']
public_repo_counts = users_df['public_repos']
followers_repos_corr = follower_counts.corr(public_repo_counts)
print(f"Correlation between followers and repos: {followers_repos_corr:.3f}")

Correlation between followers and repos: 0.345


In [54]:
# Q10: slope of a linear regression of followers on public_repos.

repo_feature = users_df[['public_repos']]  # double brackets keep a 2-D frame for fit()
follower_target = users_df['followers']

model = LinearRegression()
model.fit(repo_feature, follower_target)

# Only one feature was fitted, so the slope is the single coefficient.
f_per_repo = model.coef_[0]
print(f"Regression slope of followers on repos: {f_per_repo:.3f}")

Regression slope of followers on repos: 0.674


In [22]:
# Q11: correlation between the has_projects and has_wiki flags.

# Normalise each flag through its lower-cased text form so the result is a
# plain boolean column.
for flag_column in ('has_projects', 'has_wiki'):
    flag_text = repo_df[flag_column].astype(str).str.lower()
    repo_df[flag_column] = flag_text == 'true'

projects_enabled = repo_df['has_projects'].astype(int)
wiki_enabled = repo_df['has_wiki'].astype(int)
correlation = projects_enabled.corr(wiki_enabled)

print(f"Correlation between projects enabled and wiki enabled: {correlation:.3f}")

Correlation between projects enabled and wiki enabled: 0.262


In [41]:
# Q12: difference in mean `following` between hireable users and the rest.

# Normalise the flag through its lower-cased text form, the same way the
# has_projects / has_wiki flags are handled: booleans, 'true'/'false' strings
# and missing values all end up as a clean bool. A bare astype(bool) would
# treat any non-empty string -- including 'false' -- as True, and
# fillna(False) on an object column relies on deprecated downcasting.
users_df['hireable'] = users_df['hireable'].astype(str).str.lower() == 'true'

# Mean number of accounts followed, per hireable group (index: False / True).
mean_following = users_df.groupby('hireable')['following'].mean()
difference = mean_following[True] - mean_following[False]

print(mean_following)
print(f"The difference in mean following between hireable True and False is: {difference}")

hireable
False    30.056537
True     75.985075
Name: following, dtype: float64
The difference in mean following between hireable True and False is: 45.92853752439217


In [39]:
# Q13: OLS slope of followers on bio length, measured in whitespace-separated
# words, using only users that have a bio.

users_with_bios = users_df.dropna(subset=['bio']).copy()
# str.split() with no separator splits on runs of whitespace.
users_with_bios['bio_length'] = users_with_bios['bio'].str.split().str.len()

y = users_with_bios['followers']
# add_constant adds the intercept column to the design matrix.
X = sm.add_constant(users_with_bios['bio_length'])

model = sm.OLS(y, X).fit()
slope = model.params['bio_length']

print(f"Regression slope of followers on bio word count: {slope:.3f}")


Regression slope of followers on bio word count: 2.342


In [38]:
#14

def is_weekend(date_str):
    """Return True when ``date_str`` parses to a Saturday or Sunday.

    The value is parsed with ``pd.to_datetime``. ``weekday()`` numbers Monday
    as 0, so anything above 4 (Friday) is a weekend day.
    """
    parsed = pd.to_datetime(date_str)
    return parsed.weekday() > 4

# Flag each repository whose created_at falls on a weekend, then report the
# five logins with the most such repositories.
repo_df['weekend'] = repo_df['created_at'].apply(is_weekend)
weekend_repos = repo_df[repo_df['weekend']]
weekend_logins = weekend_repos['login'].value_counts().index[:5]

print(','.join(weekend_logins))

dpryan79,ioolkos,syzer,maysam,pvillega


In [55]:
# Q15: fraction of hireable users with an email minus the same fraction for
# all other users.


def _fraction_with_email(group):
    """Return the fraction of rows in ``group`` with a non-null email (0 if empty)."""
    if group.shape[0] == 0:
        return 0
    return group['email'].notna().sum() / group.shape[0]


# Work on a copy so the shared users_df is left untouched by this cell.
users = users_df.copy()
# Compare the lower-cased text form so booleans, 'true'/'false' strings and
# missing values all map to a clean bool; a bare astype(bool) would treat any
# non-empty string (including 'false') as True.
users['hireable'] = users['hireable'].astype(str).str.lower() == 'true'

non_hireable_users = users[~users['hireable']]
hireable_users = users[users['hireable']]

fraction_hireable_with_email = _fraction_with_email(hireable_users)
fraction_non_hireable_with_email = _fraction_with_email(non_hireable_users)
difference = fraction_hireable_with_email - fraction_non_hireable_with_email

print(f"Difference between fraction of users with email when hireable=true and for the rest: {difference}")

Difference between fraction of users with email when hireable=true and for the rest: 0.06729602869046991


In [36]:
# Q16: most common surname(s), where the surname is the last
# whitespace-separated word of the stripped name. Ties are all reported,
# sorted alphabetically.

# A name that is empty after stripping splits to an empty list, so .str[-1]
# yields NaN for it; drop those so they are neither counted as a surname nor
# able to break the sorted()/join() below.
surnames = users_df['name'].dropna().str.strip().str.split().str[-1].dropna()
surname_counts = Counter(surnames)
# default=0 keeps this from raising when no user has a usable name.
max_count = max(surname_counts.values(), default=0)
most_common_surnames = sorted(
    surname for surname, count in surname_counts.items() if count == max_count
)

print("Most common surname(s):", ','.join(most_common_surnames))

Most common surname(s): Arnold,Brand,Christensen,Fink,GmbH,Group,Guggisberg,Landolt,Roth,Tan
