In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [22]:
# Reading the csv files
users = pd.read_csv('users.csv')
repos = pd.read_csv('repositories.csv')

#### Q1.  Who are the top 5 users in Toronto with the highest number of followers? List their login in order, comma-separated

In [23]:
top5 = users.sort_values(by='followers', ascending=False).head()
print(','.join(top5['login'].tolist()))

aneagoie,ZhangMYihua,susanli2016,thedaviddias,ange-yaghi


#### Q2. Who are the 5 earliest registered GitHub users in Toronto? List their login in ascending order of created_at, comma-separated.

In [24]:
users['created_at'] = pd.to_datetime(users['created_at'])

In [25]:
top_earliest = users.sort_values(by='created_at').head()
print(','.join(top_earliest['login'].tolist()))

jamesmacaulay,michaelklishin,myles,nwjsmith,cablehead


#### Q3. What are the 3 most popular license among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [45]:
repos['license_name'].value_counts().head(3)

mit           13617
other          4259
apache-2.0     4126
Name: license_name, dtype: int64

#### Q4. Which company do the majority of these developers work at?

In [27]:
users['company'].value_counts().head(1)

UNIVERSITY OF TORONTO    21
Name: company, dtype: int64

#### Q5. Which programming language is most popular among these users?

In [46]:
repos['language'].value_counts().head(1)

JavaScript    10772
Name: language, dtype: int64

#### Q6. Which programming language is the second most popular among users who joined after 2020?

In [29]:
# filter the users after 2020
users_after_2020 = users[users['created_at'] > '2020-01-01']

In [30]:
repos_2020 = repos[repos['login'].isin(users_after_2020['login'].tolist())]
repos_2020['language'].value_counts().head()

JavaScript    338
TypeScript    207
Python        161
HTML          122
CSS            61
Name: language, dtype: int64

#### Q7. Which language has the highest average number of stars per repository?

In [31]:
avg_stars = repos.groupby('language')['stargazers_count'].mean()
top_lang = avg_stars.idxmax()
top_stars = avg_stars.max()
print(top_lang, top_stars)

Cython 1781.0


#### Q8. Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.

In [32]:
users['leader_strength'] = users['followers'] / (1 + users['following'])
top5_lead = users.sort_values(by='leader_strength', ascending=False).head()
print(','.join(top5_lead['login'].tolist()))

aneagoie,nayuki,GrapheneOS,hlissner,rspivak


#### Q9. What is the correlation between the number of followers and the number of public repositories among users in Toronto?

In [33]:
correlation = users['followers'].corr(users['public_repos'])
correlation

0.0562449875876806

#### Q10. Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

In [34]:
import csv
followers = []
public_repos = []
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)
    for row in reader:
        followers_count = int(row['followers'])
        public_repos_count = int(row['public_repos'])
        followers.append(followers_count)
        public_repos.append(public_repos_count)
if len(followers) > 1 and len(public_repos) > 1:
    slope, intercept = np.polyfit(public_repos, followers, 1)

    print(f"{slope:.3f}")
else:
    print("Error")

0.256


#### Q11. Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?

In [35]:
# Convert 'has_projects' and 'has_wiki' to integers if they are booleans
repos['has_projects'] = repos['has_projects'].astype(int)
repos['has_wiki'] = repos['has_wiki'].astype(int)

# Calculate Pearson correlation between 'has_projects' and 'has_wiki'
correlation = repos['has_projects'].corr(repos['has_wiki'])

# Print the correlation rounded to 3 decimal places
print(f"Correlation between has_projects and has_wiki: {correlation:.3f}")

Correlation between has_projects and has_wiki: 0.372


#### Q12. Do hireable users follow more people than those who are not hireable?

In [48]:
hireable_avg_following = users[users['hireable']==True]['following'].mean()
non_hireable_avg_following = users[users['hireable']==False]['following'].mean()
difference = hireable_avg_following - non_hireable_avg_following
print(f"difference: {difference:.3f}")

difference: -14.888


#### Q13. Some developers write long bios. Does that help them get more followers? What's the correlation of the length of their bio (in Unicode characters) with followers? (Ignore people without bios)

In [51]:
import pandas as pd
import statsmodels.api as sm

users_df = pd.read_csv('users.csv')

users_with_bios = users_df[users_df['bio'].notna()].copy()

users_with_bios['bio_word_count'] = users_with_bios['bio'].apply(lambda x: len(x.split()))

X = users_with_bios['bio_word_count']  # Independent variable
y = users_with_bios['followers']        # Dependent variable

X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

slope = model.params['bio_word_count']

print(f'Regression slope of followers on bio word count: {slope:.3f}')

Regression slope of followers on bio word count: 8.082


#### Q14. Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [38]:
import csv
from collections import Counter
from datetime import datetime

weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        created_at = row.get('created_at', '')
        if created_at:
            created_date = datetime.fromisoformat(created_at[:-1])

            if created_date.weekday() in [5, 6]:
                user_login = row['login']
                weekend_repo_counts[user_login] += 1

top_users = weekend_repo_counts.most_common(5)

top_logins = [user[0] for user in top_users]

print(','.join(top_logins))

andyw8,QuinntyneBrown,invokethreatguy,rgrinberg,Devang-25


#### Q15. Do people who are hireable share their email addresses more often?

In [39]:
fraction_hierable = users[users['hireable'] == True]['email'].notna().mean()
fraction_non_hierable = users[users['hireable'] == False]['email'].notna().mean()
diff = fraction_hierable - fraction_non_hierable
diff

0.13402091093002594

#### Q16. Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

In [40]:
new_users = users[users['name'].notna()].copy()
new_users['surname'] = new_users['name'].str.split().str[-1].str.strip()
surname_counts = new_users['surname'].value_counts()
max_count = surname_counts.max()
common_surnames = surname_counts[surname_counts == max_count].index.tolist()
common_surnames.sort()
print(','.join(common_surnames))

Ahmed
