In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Q1.  Who are the top 5 users in Berlin with the highest number of followers? List their login in order, comma-separated

In [4]:
users = pd.read_csv('users.csv')
users.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,tiangolo,Sebastián Ramírez,,"Berlin, Germany",tiangolo@gmail.com,True,"Creator of FastAPI, Typer, SQLModel, Asyncer, ...",73,26459,3,2012-01-12T22:37:04Z
1,schacon,Scott Chacon,GITBUTLERAPP,"Berlin, Germany",schacon@gmail.com,False,,215,13758,26,2008-01-27T17:19:28Z
2,rwieruch,Robin Wieruch,,Berlin/Remote,,True,React & Next.js • JavaScript & TypeScript • Fr...,151,8619,30,2012-10-03T15:11:48Z
3,shuding,Shu Ding,VERCEL,Berlin,g@shud.in,False,Be curious. Read widely. Try new things. — aar...,149,6760,345,2013-02-23T07:46:30Z
4,android10,Fernando Cejas,PEPPR-IO,"Berlin, Germany",android10@fernandocejas.com,True,Quantum Engineering at @Qruise-ai. Former Dire...,79,6716,85,2012-01-20T21:35:31Z


In [5]:
users['hireable'] = users['hireable'].fillna(False).astype(bool)

In [6]:
top5 = users.sort_values(by='followers', ascending=False).head()
print(','.join(top5['login'].tolist()))

tiangolo,schacon,rwieruch,shuding,android10


#### Q2. Who are the 5 earliest registered GitHub users in Berlin? List their login in ascending order of created_at, comma-separated.

In [7]:
users['created_at'] = pd.to_datetime(users['created_at'])

In [8]:
top_earliest = users.sort_values(by='created_at').head()
print(','.join(top_earliest['login'].tolist()))

schacon,adamwiggins,myobie,lstoll,znarf


#### Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [9]:
repos = pd.read_csv('repositories.csv')
repos.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,tiangolo,tiangolo/markdown-include-variants,2024-10-07T17:53:10Z,9,9,Python,False,False,mit
1,tiangolo,tiangolo/pydantic-sqlalchemy,2020-05-02T16:44:22Z,1174,1174,Python,True,True,mit
2,tiangolo,tiangolo/PyGithub,2024-10-27T08:52:25Z,7,7,,True,False,lgpl-3.0
3,tiangolo,tiangolo/dockerswarm.rocks,2019-01-17T13:17:52Z,1094,1094,Python,True,True,
4,tiangolo,tiangolo/uwsgi-nginx-docker,2016-02-14T14:33:08Z,648,648,Python,True,True,apache-2.0


In [10]:
repos['license_name'].value_counts().head(3)

license_name
mit           16180
apache-2.0     6562
other          4632
Name: count, dtype: int64

#### Q4. Which company do the majority of these developers work at?

In [11]:
users['company'].value_counts().head(1)

company
MICROSOFT    8
Name: count, dtype: int64

#### Q5. Which programming language is most popular among these users?

In [12]:
repos['language'].value_counts().head(1)

language
JavaScript    10457
Name: count, dtype: int64

#### Q6. Which programming language is the second most popular among users who joined after 2020?

In [13]:
users_after_2020 = users[users['created_at'] > '2020-01-01']
users_after_2020.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
37,typst,Typst,,Berlin,hello@typst.app,False,Compose papers faster: Focus on your text and ...,25,2114,0,2020-06-29 15:08:38+00:00
81,cs-MohamedAyman,Ayman M.,,"Berlin, Berlin, Germany",,True,Machine Learning Mentor and Advisor | Research...,13,1075,0,2020-01-22 13:06:45+00:00
107,zaraco,Zahra Teymouri,VECTRONIC AEROSPACE,"Berlin, Germany",zahrateymouri90@gmail.com,False,Software Developer,30,877,909,2020-03-08 12:07:10+00:00
238,chrisgrieser,Chris Grieser,TECHNICAL UNIVERSITY OF BERLIN,"Berlin, Germany",,False,Researcher in sociology & SWE,80,465,26,2020-10-22 10:50:35+00:00
280,slint-ui,Slint,,Berlin,info@slint.dev,False,"Slint - Declarative GUI for Rust, C++, and Jav...",36,393,0,2020-05-03 15:35:02+00:00


In [14]:
repos_2020 = repos[repos['login'].isin(users_after_2020['login'].tolist())]
repos_2020['language'].value_counts().head()

language
Python        95
JavaScript    89
HTML          30
TypeScript    29
Rust          28
Name: count, dtype: int64

#### Q7. Which language has the highest average number of stars per repository? 

In [15]:
avg_stars = repos.groupby('language')['stargazers_count'].mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

Fluent 12954.0


#### Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [16]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top5_lead = users.sort_values(by='leader_strength', ascending=False).head()
print(','.join(top5_lead['login'].tolist()))

tiangolo,marijnh,vakila,alexeygrigorev,lewagon


#### Q9. What is the correlation between the number of followers and the number of public repositories among users in Berlin?

In [17]:
correlation = users['followers'].corr(users['public_repos'])
correlation

0.019092940624105874

#### Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [18]:
import csv
followers = []
public_repos = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        followers_count = int(row['followers'])
        public_repos_count = int(row['public_repos'])
        followers.append(followers_count)
        public_repos.append(public_repos_count)
if len(followers) > 1 and len(public_repos) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)
    
    print(f"{slope:.3f}")
else:
    print("Error")

0.322


#### Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [19]:
if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': True, 'false': False})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': True, 'false': False})
    
correlation = repos['has_projects'].corr(repos['has_wiki'])
    
print(round(correlation, 3))

0.402


#### Q12. Do hireable users follow more people than those who are not hireable?

In [20]:
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable'] == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

46.97489866455385

#### Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [21]:
from sklearn.linear_model import LinearRegression
users_with_bio = users[(users['bio'].notna()) & (users['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].str.len()

X = users_with_bio['bio_len'].values.reshape(-1,1)
y = users_with_bio['followers']

lr2 = LinearRegression()
lr2.fit(X, y)
lr2.coef_[0]

4.250293908158714

#### Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [22]:
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    
    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])  
            
            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1  

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))

janpio,derhuerst,saschanaz,jamesmunns,sunsided


#### Q15. Do people who are hireable share their email addresses more often?

In [23]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

-0.004317521558900861

#### Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [24]:
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Schneider


In [25]:
# users = users.fillna("").astype(str)
# repos = repos.fillna("").astype(str)

# repos['has_wiki'] = repos['has_wiki'].astype(str).replace({'True': 'true', 'False': 'false'})
# repos['has_projects'] = repos['has_projects'].astype(str).replace({'True': 'true', 'False': 'false'})

# # users.to_csv('users1.csv', index=False)
# repos.to_csv('repos1.csv', index=False)