# Data Analysis of GitHub API scrapped dataset


In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
users_df = pd.read_csv("/content/users.csv")
repos_df = pd.read_csv("/content/repositories.csv")

## users dataset

In [9]:
users_df.head()

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,brianyu28,Brian Yu,,"Boston, MA",brian@brianyu.me,,Software developer and educator,35,13202,13,2015-11-29T07:25:29Z
1,PatrickAlphaC,Patrick Collins,CYFRIN,Boston,,,"Smart Contract Engineer, Auditor, and Educator",272,9666,43,2019-08-19T14:13:41Z
2,KeithGalli,Keith Galli,,"Boston, MA",,True,YouTube Content Creator :).,53,5680,1,2013-12-25T19:49:26Z
3,CharlesCreativeContent,Shawn Charles,AMAZON,"Boston, MA",,True,Software Engineer building Tech Communities,83,5054,1092,2020-03-11T20:10:11Z
4,timbl,Tim Berners-Lee,INRUPT,Boston MA USA,timbl@w3.org,,,18,4850,69,2011-12-11T01:28:03Z


## repositories dataset

In [10]:
repos_df.head()

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,brianyu28,brianyu28/cs50,2019-09-17T15:00:36Z,19,19,HTML,True,True,
1,brianyu28,brianyu28/scratch-to-blocks,2020-05-02T11:08:11Z,11,11,Python,True,True,
2,brianyu28,brianyu28/holyoke,2022-06-09T00:21:08Z,6,6,Swift,True,True,
3,brianyu28,brianyu28/dispatch,2017-08-26T21:43:51Z,39,39,Rust,True,True,gpl-3.0
4,brianyu28,brianyu28/multicolor,2024-04-08T22:45:21Z,5,5,TypeScript,True,True,


In [11]:
top_followers = users_df.nlargest(5, 'followers')['login'].tolist()
print("1. Top 5 users by followers:", ', '.join(top_followers))

1. Top 5 users by followers: brianyu28, PatrickAlphaC, KeithGalli, CharlesCreativeContent, timbl


In [12]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'], errors='coerce')

earliest_users = users_df.nsmallest(5, 'created_at')['login'].tolist()
print("2. Earliest registered users:", ', '.join(earliest_users))

2. Earliest registered users: evan, dpickett, tel, radical, joshuaclayton


In [13]:
popular_licenses = repos_df['license_name'].value_counts().nlargest(3).index.tolist()
print("3. Top 3 licenses:", ', '.join(popular_licenses))

3. Top 3 licenses: mit, other, apache-2.0


In [14]:
majority_company = users_df['company'].mode()[0]
print("4. Majority company:", majority_company)

4. Majority company: NORTHEASTERN UNIVERSITY


In [15]:
popular_language = repos_df['language'].mode()[0]
print("5. Most popular language:", popular_language)

5. Most popular language: JavaScript


In [16]:
users_2020 = users_df[users_df['created_at'] > '2020-01-01']
popular_language_2020 = repos_df[repos_df['login'].isin(users_2020['login'])]
second_popular_language = popular_language_2020['language'].value_counts().index[1]
print("6. Second most popular language after 2020:", second_popular_language)

6. Second most popular language after 2020: Python


In [17]:
avg_stars_language = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("7. Language with highest avg stars:", avg_stars_language)

7. Language with highest avg stars: SQL


In [18]:
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_leaders = users_df.nlargest(5, 'leader_strength')['login'].tolist()
print("8. Top 5 by leader_strength:", ', '.join(top_leaders))

8. Top 5 by leader_strength: nikomatsakis, ccoenraets, KeithGalli, rstudio, pluskid


In [19]:
correlation_followers_repos = users_df[['followers', 'public_repos']].corr().iloc[0, 1]
print(f"9. Correlation between followers and repos: {correlation_followers_repos:.3f}")

9. Correlation between followers and repos: 0.168


In [20]:
model = LinearRegression()
model.fit(users_df[['public_repos']], users_df['followers'])
slope_followers_repos = model.coef_[0]
print(f"10. Regression slope for followers on repos: {slope_followers_repos:.3f}")

10. Regression slope for followers on repos: 1.191


In [21]:
correlation_projects_wiki = repos_df[['has_projects', 'has_wiki']].corr().iloc[0, 1]
print(f"11. Correlation between projects and wiki enabled: {correlation_projects_wiki:.3f}")

11. Correlation between projects and wiki enabled: 0.324


In [22]:
avg_following_hireable = users_df[users_df['hireable'] == True]['following'].mean()
avg_following_non_hireable = users_df[users_df['hireable'] == False]['following'].mean()
following_diff = avg_following_hireable - avg_following_non_hireable
print(f"12. Difference in following for hireable vs. non-hireable: {following_diff:.3f}")

12. Difference in following for hireable vs. non-hireable: nan


In [23]:
users_with_bio = users_df[users_df['bio'].notnull()]
users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().apply(len)
model.fit(users_with_bio[['bio_word_count']], users_with_bio['followers'])
slope_bio_followers = model.coef_[0]
print(f"13. Regression slope for followers on bio word count: {slope_bio_followers:.3f}")

13. Regression slope for followers on bio word count: -5.303


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].str.split().apply(len)


In [24]:
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek
weekend_repos = repos_df[repos_df['day_of_week'].isin([5, 6])]
top_weekend_users = weekend_repos['login'].value_counts().nlargest(5).index.tolist()
print("14. Top 5 users with most repos on weekends:", ', '.join(top_weekend_users))

14. Top 5 users with most repos on weekends: cameronraysmith, berquist, burtbeckwith, jimkang, johnny-rice


In [25]:
hireable_with_email = users_df[users_df['hireable'] == True]['email'].notnull().mean()
non_hireable_with_email = users_df[users_df['hireable'] == False]['email'].notnull().mean()
email_diff = hireable_with_email - non_hireable_with_email
print(f"15. Difference in email presence for hireable vs. non-hireable: {email_diff:.3f}")

15. Difference in email presence for hireable vs. non-hireable: nan


In [26]:
users_df['surname'] = users_df['name'].str.split().str[-1]
most_common_surname = users_df['surname'].mode().tolist()
print("16. Most common surname(s):", ', '.join(sorted(most_common_surname)))

16. Most common surname(s): Williams
