# Upload the users.csv and repositories.csv

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Q1. Who are the top 5 users in Boston with the highest number of followers? List their login in order, comma-separated.


In [2]:
users = pd.read_csv('users.csv')
users.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,brianyu28,Brian Yu,,"Boston, MA",brian@brianyu.me,False,Software developer and educator,35,13203,13,2015-11-29T07:25:29Z
1,PatrickAlphaC,Patrick Collins,CYFRIN,Boston,,False,"Smart Contract Engineer, Auditor, and Educator",272,9672,43,2019-08-19T14:13:41Z
2,KeithGalli,Keith Galli,,"Boston, MA",,True,YouTube Content Creator :).,53,5679,1,2013-12-25T19:49:26Z
3,CharlesCreativeContent,Shawn Charles,AMAZON,"Boston, MA",,True,Software Engineer building Tech Communities,83,5054,1092,2020-03-11T20:10:11Z
4,timbl,Tim Berners-Lee,INRUPT,Boston MA USA,timbl@w3.org,False,,18,4850,69,2011-12-11T01:28:03Z


In [3]:
users['hireable'] = users['hireable'].fillna(False).astype(bool)

In [4]:
top5 = users.sort_values(by='followers', ascending=False).head()
print(','.join(top5['login'].tolist()))

brianyu28,PatrickAlphaC,KeithGalli,CharlesCreativeContent,timbl


#### Q2. Who are the 5 earliest registered GitHub users in Boston? List their login in ascending order of created_at, comma-separated.


In [5]:
users['created_at'] = pd.to_datetime(users['created_at'])

In [6]:
top_earliest = users.sort_values(by='created_at').head()
print(','.join(top_earliest['login'].tolist()))

evan,dpickett,tel,radical,joshuaclayton


#### Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [7]:
repos = pd.read_csv('repositories.csv')
repos.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,brianyu28,brianyu28/cs50,2019-09-17T15:00:36Z,19,19,HTML,True,True,
1,brianyu28,brianyu28/scratch-to-blocks,2020-05-02T11:08:11Z,11,11,Python,True,True,
2,brianyu28,brianyu28/holyoke,2022-06-09T00:21:08Z,6,6,Swift,True,True,
3,brianyu28,brianyu28/dispatch,2017-08-26T21:43:51Z,39,39,Rust,True,True,gpl-3.0
4,brianyu28,brianyu28/multicolor,2024-04-08T22:45:21Z,5,5,TypeScript,True,True,


In [8]:
repos['license_name'].value_counts().head(3)

Unnamed: 0_level_0,count
license_name,Unnamed: 1_level_1
mit,9946
other,4671
apache-2.0,3567


#### Q4. Which company do the majority of these developers work at?

In [9]:
users['company'].value_counts().head(1)

Unnamed: 0_level_0,count
company,Unnamed: 1_level_1
NORTHEASTERN UNIVERSITY,16


#### Q5. Which programming language is most popular among these users?

In [10]:
repos['language'].value_counts().head(1)

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,7705


#### Q6. Which programming language is the second most popular among users who joined after 2020?

In [11]:
users_after_2020 = users[users['created_at'] >= '2020-01-01']
users_after_2020.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
3,CharlesCreativeContent,Shawn Charles,AMAZON,"Boston, MA",,True,Software Engineer building Tech Communities,83,5054,1092,2020-03-11 20:10:11+00:00
46,neuralmagic,Neural Magic,,Boston,,False,Neural Magic helps developers in accelerating ...,57,765,0,2020-07-23 00:16:47+00:00
102,Lanny-MacMillan,Lanny MacMillan,THRYV,"Boston, Ma",,False,Software Developer / Game Developer\nReact.js ...,22,393,719,2022-04-01 18:10:46+00:00
108,ibrahimgurhandev,Ibrahim Gurhan,RESILIENT CODERS,Boston MA,,False,Software Engineer at Resilient Coders | Web D...,36,363,798,2020-07-20 04:11:30+00:00
123,jessicajaniuk,Jessica Janiuk,GOOGLE,"Boston, MA",,False,This account is retired. Find Jessica at githu...,6,329,0,2020-10-12 21:49:39+00:00


In [12]:
repos_2020 = repos[repos['login'].isin(users_after_2020['login'].tolist())]
repos_2020['language'].value_counts().head()

Unnamed: 0_level_0,count
language,Unnamed: 1_level_1
JavaScript,98
Python,87
TypeScript,38
HTML,34
CSS,14


#### Q7. Which language has the highest average number of stars per repository?

In [13]:
avg_stars = repos.groupby('language')['stargazers_count'].mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

SQL 716.0


#### Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [14]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top5_lead = users.sort_values(by='leader_strength', ascending=False).head()
print(','.join(top5_lead['login'].tolist()))

nikomatsakis,ccoenraets,KeithGalli,rstudio,pluskid


#### Q9. What is the correlation between the number of followers and the number of public repositories among users in Boston?


In [15]:
correlation = users['followers'].corr(users['public_repos'])
correlation

0.1679457994031845

#### Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [16]:
import csv
followers = []
public_repos = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        followers_count = int(row['followers'])
        public_repos_count = int(row['public_repos'])
        followers.append(followers_count)
        public_repos.append(public_repos_count)
if len(followers) > 1 and len(public_repos) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)

    print(f"{slope:.3f}")
else:
    print("Error")

1.189


#### Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [17]:
if repos['has_projects'].dtype == 'object':
    repos['has_projects'] = repos['has_projects'].map({'true': True, 'false': False})
if repos['has_wiki'].dtype == 'object':
    repos['has_wiki'] = repos['has_wiki'].map({'true': True, 'false': False})

correlation = repos['has_projects'].astype(int).corr(repos['has_wiki'].astype(int))

print(round(correlation, 3))

0.326


In [18]:
# Convert to integers for calculation and handle NaN by filling with False (0)
repos['has_projects'] = repos['has_projects'].fillna(False).astype(int)
repos['has_wiki'] = repos['has_wiki'].fillna(False).astype(int)

# Calculate correlation
correlation = repos['has_projects'].corr(repos['has_wiki'])
print("Correlation between has_projects and has_wiki:", round(correlation, 3))


Correlation between has_projects and has_wiki: 0.326


#### Q12. Do hireable users follow more people than those who are not hireable?

In [19]:
hireable_avg_following = users[users['hireable'] == True]['following'].mean()
non_hireable_avg_following = users[users['hireable'] == False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
difference

111.50768774410442

In [20]:
# Filter users who are explicitly hireable or non-hireable
hireable_users = users[users['hireable'] == True]
non_hireable_users = users[users['hireable'] == False]

# Calculate average following for each group
avg_hireable_following = hireable_users['following'].mean()
avg_non_hireable_following = non_hireable_users['following'].mean()

# Difference
difference = avg_hireable_following - avg_non_hireable_following
print("Difference in average following:", round(difference, 3))


Difference in average following: 111.508


#### Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [21]:
from sklearn.linear_model import LinearRegression
users_with_bio = users[(users['bio'].notna()) & (users['bio'] != '')].copy()
users_with_bio.loc[:, 'bio_len'] = users_with_bio['bio'].str.len()

X = users_with_bio['bio_len'].values.reshape(-1,1)
y = users_with_bio['followers']

lr2 = LinearRegression()
lr2.fit(X, y)
lr2.coef_[0]

-0.7992115615152502

In [22]:
from scipy.stats import linregress

# Filter out users without a bio
users_with_bio = users[users['bio'].notna()]

# Calculate bio word count
users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))

# Perform linear regression
slope, intercept, r_value, p_value, std_err = linregress(users_with_bio['bio_word_count'], users_with_bio['followers'])
print("Regression slope between bio word count and followers:", round(slope, 3))


Regression slope between bio word count and followers: -4.916


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


In [23]:
from scipy.stats import linregress

# Filter users with non-empty bio and valid followers count
users_with_bio = users[users['bio'].notna() & (users['bio'] != '')]

# Calculate bio word count
users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))

# Perform linear regression on bio_word_count and followers
slope, intercept, r_value, p_value, std_err = linregress(users_with_bio['bio_word_count'], users_with_bio['followers'])
print("Regression slope between bio word count and followers:", round(slope, 3))


Regression slope between bio word count and followers: -4.916


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(x.split()))


#### Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [24]:
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])

            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))

cameronraysmith,berquist,burtbeckwith,jimkang,johnny-rice


#### Q15. Do people who are hireable share their email addresses more often?

In [25]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.1180600187867702

In [26]:
# Filter hireable and non-hireable users
hireable_users = users[users['hireable'] == True]
non_hireable_users = users[users['hireable'] == False]

# Calculate the fraction of hireable users with email
hireable_with_email_fraction = hireable_users['email'].notna().mean()

# Calculate the fraction of non-hireable users with email
non_hireable_with_email_fraction = non_hireable_users['email'].notna().mean()

# Difference between the two fractions
email_fraction_difference = hireable_with_email_fraction - non_hireable_with_email_fraction
print("Difference in email sharing between hireable and non-hireable users:", round(email_fraction_difference, 3))


Difference in email sharing between hireable and non-hireable users: 0.118


#### Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [27]:
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Williams


In [28]:
# users = users.fillna("").astype(str)
# repos = repos.fillna("").astype(str)

# repos['has_wiki'] = repos['has_wiki'].astype(str).replace({'True': 'true', 'False': 'false'})
# repos['has_projects'] = repos['has_projects'].astype(str).replace({'True': 'true', 'False': 'false'})

# # users.to_csv('users1.csv', index=False)
# repos.to_csv('repos1.csv', index=False)