In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Q1. Who are the top 5 users in Singapore with the highest number of followers? List their logins in order, comma-separated.

In [3]:
# Load the user table from disk; the trailing expression renders a preview
# of the first five rows.
users = pd.read_csv(filepath_or_buffer='users.csv')
users.head(n=5)

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,yyx990803,Evan You,,Singapore,,False,"Husband, father of two, independent OSS dev. @...",195,101817,96,2010-11-28T01:05:40Z
1,halfrost,halfrost,KUBEFLOW @CNCF,"[California, Singapore, China]",i@halfrost.com,False,CS master @Stanford 💪 天道酬勤，勤能补拙。博观而约取，厚积而薄发。Go...,32,17039,361,2015-02-03T07:01:48Z
2,DIYgod,DIYgod,NATURAL SELECTION LABS,Singapore,i@diygod.me,False,写代码是热爱，写到世界充满爱！,86,14180,443,2014-07-25T09:27:56Z
3,yangshun,Yangshun Tay,GREATFRONTEND,Singapore,tay.yang.shun@gmail.com,False,"Building @greatfrontend. Formerly @meta, @face...",107,11208,287,2012-01-09T12:30:00Z
4,bytedance,Bytedance Inc.,,Singapore,,False,,295,7157,0,2013-04-15T08:03:53Z


In [4]:
# A missing `hireable` flag is treated as "not hireable", then the column is
# stored as real booleans so later cells can filter on it directly.
hireable_filled = users['hireable'].fillna(False)
users['hireable'] = hireable_filled.astype(bool)

In [5]:
# Order users from most to fewest followers and keep the first five rows.
by_followers_desc = users.sort_values(by='followers', ascending=False)
top5 = by_followers_desc.head(5)
print(','.join(top5['login'].tolist()))

yyx990803,halfrost,DIYgod,yangshun,bytedance


#### Q2. Who are the 5 earliest registered GitHub users in Singapore? List their logins in ascending order of created_at, comma-separated.

In [6]:
# Parse the timestamp strings once so later cells can sort and filter by date.
created_at_parsed = pd.to_datetime(users['created_at'])
users['created_at'] = created_at_parsed

In [7]:
# Oldest accounts first; the first five rows are the earliest registrations.
oldest_first = users.sort_values(by='created_at', ascending=True)
top_earliest = oldest_first.head(5)
print(','.join(top_earliest['login'].tolist()))

chuyeow,choonkeat,winston,cheeaun,nowa


#### Q3. What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name values in order, comma-separated.

In [8]:
# Load the repository table from disk; the trailing expression renders a
# preview of the first five rows.
repos = pd.read_csv(filepath_or_buffer='repositories.csv')
repos.head(n=5)

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,yyx990803,yyx990803/opensourcepledge.com,2024-10-14T08:05:24Z,8,8,,True,False,
1,yyx990803,yyx990803/launch-editor,2018-01-06T18:08:42Z,578,578,JavaScript,True,True,mit
2,yyx990803,yyx990803/vue-macros,2023-11-25T09:08:31Z,12,12,TypeScript,True,False,mit
3,yyx990803,yyx990803/core,2024-08-16T12:34:16Z,9,9,,True,False,mit
4,yyx990803,yyx990803/oxc-node-loader,2024-07-03T06:54:40Z,22,22,JavaScript,True,True,


In [9]:
# value_counts() leaves NaN out by default, so repositories without a license
# do not take part in the ranking.
license_counts = repos['license_name'].value_counts()
license_counts.head(3)

license_name
mit           13270
apache-2.0     5581
other          4089
Name: count, dtype: int64

#### Q4. Which company do the majority of these developers work at?

In [10]:
# Count users per company (missing companies are skipped) and show the leader.
company_counts = users['company'].value_counts()
company_counts.head(1)

company
NATIONAL UNIVERSITY OF SINGAPORE    42
Name: count, dtype: int64

#### Q5. Which programming language is most popular among these users?

In [11]:
# Count repositories per primary language (repositories with no language are
# skipped) and show the most frequent one.
language_counts = repos['language'].value_counts()
language_counts.head(1)

language
JavaScript    7532
Name: count, dtype: int64

#### Q6. Which programming language is the second most popular among users who joined after 2020?

In [12]:
# Keep accounts whose creation timestamp is strictly later than 2020-01-01.
# NOTE(review): this cutoff includes accounts created during 2020 itself;
# confirm that is what "joined after 2020" is meant to cover.
joined_after_cutoff = users['created_at'] > '2020-01-01'
users_after_2020 = users[joined_after_cutoff]
users_after_2020.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
8,rustdesk,RustDesk,PURSLANE LTD.,Singapore,,False,Making affordable remote desktop service for e...,8,5013,1,2020-09-21 09:13:05+00:00
15,Shib-Chain,ShibChain,,Singapore,dev@shibchain.app,False,,6,3178,0,2022-09-06 07:39:53+00:00
21,CodexploreRepo,CodeXplore,CODEXPLORE,Singapore,codexplore.channel@gmail.com,False,STAY HUNGRY. STAY FOOLISH,54,1945,7,2020-04-29 00:48:32+00:00
43,mind-network,Mind Network,,Singapore,biz@mindnetwork.xyz,False,An FHE (Fully Homomorphic Encryption) Restakin...,19,1164,0,2022-01-09 13:49:46+00:00
44,okx,OKX.com,,Singapore,support@okx.com,False,"OKX is a world-leading digital asset exchange,...",133,1160,0,2022-12-09 01:40:23+00:00


In [13]:
# Restrict repositories to those owned by the recently joined users, then
# rank their languages by repository count.
recent_logins = users_after_2020['login'].tolist()
owned_by_recent_user = repos['login'].isin(recent_logins)
repos_2020 = repos[owned_by_recent_user]
repos_2020['language'].value_counts().head()

language
TypeScript    495
Python        326
JavaScript    288
Rust          171
Go            152
Name: count, dtype: int64

#### Q7. Which language has the highest average number of stars per repository? 

In [14]:
# Mean star count per language; report the language with the largest mean
# together with that mean.
stars_by_language = repos.groupby('language')['stargazers_count']
avg_stars = stars_by_language.mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

Inno Setup 1509.0


#### Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [15]:
# leader_strength = followers / (1 + following); the "+ 1" keeps the ratio
# defined for users who follow nobody.
following_plus_one = 1 + users['following']
users['leader_strength'] = users['followers'] / following_plus_one
ranked_by_strength = users.sort_values(by='leader_strength', ascending=False)
top5_lead = ranked_by_strength.head(5)
print(','.join(top5_lead['login'].tolist()))

bytedance,Jinjiang,cloudflare,JamesNK,Shib-Chain


#### Q9. What is the correlation between the number of followers and the number of public repositories among users in Singapore?

In [16]:
# Correlation between follower count and public-repository count
# (Series.corr uses the Pearson method unless told otherwise).
follower_counts = users['followers']
repo_counts = users['public_repos']
correlation = follower_counts.corr(repo_counts)
correlation

0.0456753649078842

#### Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [17]:
# Estimate followers ~ slope * public_repos + intercept by least squares.
#
# The `users` frame loaded earlier is reused instead of re-parsing users.csv by
# hand: this keeps the cell on the same rows as every other answer, avoids a
# second pass over the file, and does not crash on a blank numeric cell (the
# previous int() conversion would have raised ValueError).
regression_rows = users[['public_repos', 'followers']].dropna()
public_repos = regression_rows['public_repos'].to_numpy()
followers = regression_rows['followers'].to_numpy()

# A line needs at least two observations to be fitted.
if len(regression_rows) > 1:
    # polyfit returns coefficients from highest degree down: (slope, intercept).
    slope, intercept = np.polyfit(public_repos, followers, 1)
    print(f"{slope:.3f}")
else:
    print("Error")

1.423


#### Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [18]:
def _to_flag(value):
    """Convert one has_projects / has_wiki cell to True, False, or NaN.

    Accepts real booleans as well as the strings "true"/"false" in any letter
    case. Anything missing or unrecognised becomes NaN so that it is left out
    of the correlation instead of being silently miscounted.
    """
    if isinstance(value, str):
        text = value.strip().lower()
        if text == 'true':
            return True
        if text == 'false':
            return False
        return np.nan
    if pd.isna(value):
        return np.nan
    return bool(value)


# Normalise on copies rather than overwriting `repos`. The previous in-place
# map({'true': ..., 'false': ...}) turned an object-dtype column that held real
# True/False objects (or "True"/"False" strings) into all-NaN.
projects_enabled = repos['has_projects'].map(_to_flag).astype(float)
wiki_enabled = repos['has_wiki'].map(_to_flag).astype(float)

# Correlation of two 0/1 indicators; rows where either flag is NaN are dropped
# by Series.corr.
correlation = projects_enabled.corr(wiki_enabled)

print(round(correlation, 3))

0.301


#### Q12. Do hireable users follow more people than those who are not hireable?

In [19]:
# Average number of accounts followed, split by the hireable flag; a positive
# difference means hireable users follow more people on average.
is_hireable = users['hireable'] == True
is_not_hireable = users['hireable'] == False
hireable_avg_following = users.loc[is_hireable, 'following'].mean()
non_hireable_avg_following = users.loc[is_not_hireable, 'following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

219.5858546013964

#### Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [20]:
from sklearn.linear_model import LinearRegression

# Only users with a non-empty bio take part; bio length is measured in
# characters via Series.str.len().
has_bio = users['bio'].notna() & (users['bio'] != '')
users_with_bio = users[has_bio].copy()
users_with_bio['bio_len'] = users_with_bio['bio'].str.len()

# scikit-learn expects a 2-D feature matrix, hence the single-column frame.
X = users_with_bio[['bio_len']].to_numpy()
y = users_with_bio['followers']

# NOTE(review): the question above asks for a "correlation", but this cell
# reports the regression slope (followers gained per extra bio character) —
# confirm which metric is intended.
lr2 = LinearRegression().fit(X, y)
lr2.coef_[0]

5.45621887663618

#### Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [21]:
import csv
from collections import Counter
from datetime import datetime, timezone


def _created_on_utc_weekend(created_at):
    """Return True when an ISO-8601 timestamp falls on Saturday/Sunday in UTC.

    A trailing "Z" is rewritten as "+00:00" because datetime.fromisoformat()
    only accepts the "Z" spelling from Python 3.11 onwards. Timestamps that
    carry another offset are converted to UTC before the weekday is read;
    timestamps without any offset are taken as already being in UTC.
    """
    stamp = created_at.strip()
    if stamp[-1:] in ('Z', 'z'):
        stamp = stamp[:-1] + '+00:00'
    created = datetime.fromisoformat(stamp)
    if created.tzinfo is not None:
        created = created.astimezone(timezone.utc)
    # weekday(): Monday == 0 ... Saturday == 5, Sunday == 6.
    return created.weekday() >= 5


# Number of weekend-created repositories per owner login.
weekend_repo_counts = Counter()

# newline='' is what the csv module asks for so quoted fields containing line
# breaks are parsed correctly.
with open('repositories.csv', 'r', encoding='utf-8', newline='') as file:
    for row in csv.DictReader(file):
        # Short rows yield None for missing fields; treat them like blanks.
        created_at = (row.get('created_at') or '').strip()
        if created_at and _created_on_utc_weekend(created_at):
            weekend_repo_counts[row['login']] += 1

top_users = weekend_repo_counts.most_common(5)

top_logins = [login for login, _count in top_users]

print(','.join(top_logins))

alextanhongpin,SOF3,shantanu561993,KennyDizi,vdt


#### Q15. Do people who are hireable share their email addresses more often?

In [22]:
# Share of users with a non-missing email, computed separately for hireable
# and non-hireable users; a positive difference means hireable users share
# their email more often.
hireable_users = users[users['hireable'] == True]
non_hireable_users = users[users['hireable'] == False]
fraction_hierable = hireable_users['email'].notna().mean()
fraction_non_hierable = non_hireable_users['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.08108161413750936

#### Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [23]:
# Work on a copy of the users that have a name so `users` itself is untouched.
new_users = users.dropna(subset=['name']).copy()

# Surname = last whitespace-separated token of the name.
name_tokens = new_users['name'].str.split()
new_users['surname'] = name_tokens.str.get(-1).str.strip()

# Keep every surname that reaches the highest count, listed alphabetically.
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
is_most_common = surname_counts == max_count
common_surnames = sorted(surname_counts[is_most_common].index.tolist())
print(','.join(common_surnames))

Wang


In [24]:
# NOTE(review): every statement in this cell is commented out, so running it
# does nothing. If re-enabled, the lines below would turn both frames into
# all-string tables, lowercase the has_wiki / has_projects flags, and write
# repos1.csv (the users1.csv export is commented out a second time).
# Consider deleting the cell if the export is no longer needed.
# users = users.fillna("").astype(str)
# repos = repos.fillna("").astype(str)

# repos['has_wiki'] = repos['has_wiki'].astype(str).replace({'True': 'true', 'False': 'false'})
# repos['has_projects'] = repos['has_projects'].astype(str).replace({'True': 'true', 'False': 'false'})

# # users.to_csv('users1.csv', index=False)
# repos.to_csv('repos1.csv', index=False)