In [None]:
import csv
import os
import time

import pandas as pd
import requests

In [None]:
# GitHub personal access token, read from the environment so the secret never
# lives in the notebook. Export GITHUB_TOKEN before starting the kernel.
# NOTE: any token that was previously committed in this cell must be treated as
# leaked and revoked in the GitHub settings.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')

# Without a token the Authorization header is omitted entirely (an empty
# "token " value would be rejected); requests are then sent unauthenticated.
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}

# Output CSV files
USER_CSV_FILE = 'users.csv'
REPO_CSV_FILE = 'repositories.csv'

# Column order of users.csv
USER_FIELDS = [
    "login", "name", "company", "location", "email", "hireable",
    "bio", "public_repos", "followers", "following", "created_at"
]

# Column order of repositories.csv
REPO_FIELDS = [
    "login", "full_name", "created_at", "stargazers_count",
    "watchers_count", "language", "has_projects", "has_wiki", "license_name"
]

# GitHub REST API endpoints
SEARCH_URL = 'https://api.github.com/search/users'
USER_URL_TEMPLATE = 'https://api.github.com/users/{username}'
REPO_URL_TEMPLATE = 'https://api.github.com/users/{username}/repos'

def clean_company_name(company_name):
    """Normalise a company string for the users CSV.

    Surrounding whitespace is trimmed, leading '@' characters are removed,
    whitespace left behind by the '@' removal is trimmed as well, and the
    result is upper-cased.

    Args:
        company_name: Raw company value; may be None or empty.

    Returns:
        The cleaned, upper-cased name, or "" when the input is falsy or
        nothing remains after cleaning.
    """
    if not company_name:
        return ""
    # Second strip() handles values such as "@ Acme", which would otherwise
    # keep a leading space once the '@' is gone.
    return company_name.strip().lstrip('@').strip().upper()

def get_user_details(username):
    """Fetch one user's profile from the GitHub API.

    Args:
        username: GitHub login to look up.

    Returns:
        A dict with the keys listed in USER_FIELDS (company passed through
        clean_company_name), or None when the request fails or the API does
        not answer with HTTP 200.
    """
    url = USER_URL_TEMPLATE.format(username=username)
    try:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        print(f"Failed to fetch data for user: {username} - Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch data for user: {username} - Status Code: {response.status_code}")
        return None  # Return None if request failed

    user_data = response.json()
    return {
        "login": user_data.get("login", ""),
        "name": user_data.get("name", ""),
        "company": clean_company_name(user_data.get("company", "")),
        "location": user_data.get("location", ""),
        "email": user_data.get("email", ""),
        "hireable": user_data.get("hireable", ""),
        "bio": user_data.get("bio", ""),
        "public_repos": user_data.get("public_repos", 0),
        "followers": user_data.get("followers", 0),
        "following": user_data.get("following", 0),
        "created_at": user_data.get("created_at", "")
    }

def get_user_repositories(username):
    """Fetch up to 500 of a user's repositories, most recently pushed first.

    Pages through the user's repository listing 100 entries at a time and
    stops when the cap is reached, a page comes back empty or short, or a
    request fails.

    Args:
        username: GitHub login whose repositories are listed.

    Returns:
        A list of dicts with the keys listed in REPO_FIELDS; possibly empty
        or partial when a request fails part-way through.
    """
    url = REPO_URL_TEMPLATE.format(username=username)
    repositories = []
    page = 1
    per_page = 100  # Maximum number of repositories per page
    max_repos = 500  # Limit to the most recent 500 repositories

    while len(repositories) < max_repos:
        params = {
            'per_page': per_page,
            'page': page,
            'sort': 'pushed'  # Fetch repositories sorted by the last pushed date
        }
        try:
            # A timeout keeps one stalled connection from hanging the scrape.
            response = requests.get(url, headers=HEADERS, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Failed to fetch repositories for user: {username} - Error: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to fetch repositories for user: {username} - Status Code: {response.status_code}")
            break

        repo_data = response.json()
        if not repo_data:
            break  # Exit loop if no more repositories

        for repo in repo_data:
            # "license" may be missing or null; treat both as "no license".
            license_info = repo.get("license") or {}
            repositories.append({
                "login": username,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", False),
                "has_wiki": repo.get("has_wiki", False),
                "license_name": license_info.get("name", "")
            })

        # A short page is the last one, and a full list needs no more pages:
        # stop here instead of paying for another request and sleep.
        if len(repositories) >= max_repos or len(repo_data) < per_page:
            break

        page += 1
        time.sleep(1)  # Avoid hitting the rate limit

    return repositories[:max_repos]  # Return only up to the max_repos


def _write_csv(path, fieldnames, rows):
    """Write an iterable of dict rows to ``path`` as UTF-8 CSV with a header."""
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def fetch_users_and_repositories(query='location:Melbourne followers:>100'):
    """Search GitHub users, collect their profiles and repositories, save both as CSV.

    Pages through the user search for ``query``; for every hit whose profile
    can be fetched, the profile and its repositories are collected. Whatever
    was gathered is then written to USER_CSV_FILE and REPO_CSV_FILE, even if
    the search stopped early because of an error.

    Args:
        query: GitHub user-search query string. Defaults to Melbourne users
            with more than 100 followers.
    """
    users = []  # List to hold user details
    all_repositories = []  # List to hold all repositories
    page = 1
    per_page = 30  # Number of users to fetch per page

    while True:
        params = {
            'q': query,
            'per_page': per_page,
            'page': page
        }
        try:
            # A timeout keeps one stalled connection from hanging the scrape.
            response = requests.get(SEARCH_URL, headers=HEADERS, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f"Error fetching users - Error: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching users - Status Code: {response.status_code}")
            break

        items = response.json().get('items', [])
        if not items:
            break  # Exit loop if no more users

        for item in items:
            login = item.get('login')
            user_detail = get_user_details(login)
            if user_detail:
                users.append(user_detail)
                all_repositories.extend(get_user_repositories(login))

        page += 1
        time.sleep(1)  # Avoid hitting the rate limit

    _write_csv(USER_CSV_FILE, USER_FIELDS, users)
    print(f"User data saved to {USER_CSV_FILE}")

    _write_csv(REPO_CSV_FILE, REPO_FIELDS, all_repositories)
    print(f"Repository data saved to {REPO_CSV_FILE}")

# Run the scrape: queries the GitHub API and writes users.csv and repositories.csv
fetch_users_and_repositories()

User data saved to users.csv
Repository data saved to repositories.csv


In [None]:
# users.csv        - 332 lines (including the header row)
# repositories.csv - 28987 lines (including the header row)

In [None]:
# Mount Google Drive (Colab only) so the CSVs under /content/drive/MyDrive can be read
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import statsmodels.api as sm

# Location of the scraped user table on the mounted Drive
csv_file = '/content/drive/MyDrive/users.csv'  # Ensure this path is correct

df = pd.read_csv(csv_file)

# Quick look at the data and its dtypes
print("DataFrame Overview:")
print(df.head())
print("\nDataFrame Info:")
df.info()  # info() prints by itself and returns None, so it is not wrapped in print()

# Keep only users that have a bio; .copy() makes the filtered frame independent
# so adding a column below does not trigger SettingWithCopyWarning.
df = df[df['bio'].notnull()].copy()

# Bio length in whitespace-separated words
df['bio_word_count'] = df['bio'].str.split().str.len()

# Independent variable (with intercept term) and dependent variable
X = sm.add_constant(df['bio_word_count'])
y = df['followers']  # Adjust the column name as per your dataset

# Ordinary least squares fit of followers on bio word count
model = sm.OLS(y, X).fit()

# Coefficient of bio_word_count is the regression slope
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")

DataFrame Overview:
           login               name company              location  \
0  mosh-hamedani      Mosh Hamedani     NaN  Melbourne, Australia   
1      TheCherno      Yan Chernikov     NaN  Melbourne, Australia   
2        haileys  Hailey Somerville     NaN     Naarm / Melbourne   
3       rstacruz     Rico Sta. Cruz     NaN         Melbourne, AU   
4  jesseduffield     Jesse Duffield     NaN             Melbourne   

                      email hireable  \
0                       NaN      NaN   
1                       NaN      NaN   
2         hailey@hailey.lol      NaN   
3                       NaN     True   
4  jessedduffield@gmail.com      NaN   

                                                 bio  public_repos  followers  \
0  I train software engineers that companies love...            27      12774   
1                                                NaN            35      11226   
2                                                NaN           373       9391   


In [None]:
# Load the scraped users and parse the account-creation timestamps.
users_df = pd.read_csv('/content/drive/MyDrive/users.csv').assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)
users_df

Unnamed: 0,login,name,company,location,email,hireable,bio,public_repos,followers,following,created_at
0,mosh-hamedani,Mosh Hamedani,,"Melbourne, Australia",,,I train software engineers that companies love...,27,12774,1,2013-09-12 04:38:33+00:00
1,TheCherno,Yan Chernikov,,"Melbourne, Australia",,,,35,11226,4,2012-02-26 08:39:00+00:00
2,haileys,Hailey Somerville,,Naarm / Melbourne,hailey@hailey.lol,,,373,9391,104,2010-01-09 14:49:23+00:00
3,rstacruz,Rico Sta. Cruz,,"Melbourne, AU",,True,@rstacruz@hachyderm.io,561,6433,25,2009-04-16 09:46:24+00:00
4,jesseduffield,Jesse Duffield,,Melbourne,jessedduffield@gmail.com,,"Creator of lazygit, lazydocker, horcrux, and t...",60,4575,4,2014-08-15 09:18:18+00:00
...,...,...,...,...,...,...,...,...,...,...,...
327,mdub,Mike Williams,,"Melbourne, AU",mdub@dogbiscuit.org,,,102,101,15,2008-02-22 11:02:25+00:00
328,3xocyte,Matt Bush,ATLASSIAN,"Melbourne, Australia",,,Red teamer,5,101,4,2015-09-10 12:08:41+00:00
329,Gizmotronn,Liam Arbuckle,"SIGNAL-K, @DESCI-LABS, NEW MILLENNIUM COMMITTE...","Melbourne, Australia",liam@skinetics.tech,True,I'm building interactive worlds to help revolu...,108,101,265,2017-09-10 03:09:55+00:00
330,TheDen,Denis Khoshaba,,Melbourne,github@theden.sh,True,,115,101,27,2011-06-17 08:30:57+00:00


In [None]:
# Load the scraped repositories and parse the repository-creation timestamps.
repositories_df = pd.read_csv('/content/drive/MyDrive/repositories.csv').assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)
repositories_df

Unnamed: 0,login,full_name,created_at,stargazers_count,watchers_count,language,has_projects,has_wiki,license_name
0,mosh-hamedani,mosh-hamedani/python-projects-for-beginners,2024-08-29 17:14:21+00:00,71,71,Python,True,False,
1,mosh-hamedani,mosh-hamedani/vidly-mvc-5,2016-03-31 06:37:00+00:00,956,956,JavaScript,True,True,
2,mosh-hamedani,mosh-hamedani/react-course-part2-starter,2023-04-17 12:37:33+00:00,63,63,TypeScript,True,True,
3,mosh-hamedani,mosh-hamedani/python-supplementary-materials,2024-07-16 22:44:04+00:00,22,22,,True,True,
4,mosh-hamedani,mosh-hamedani/react-testing-starter,2024-01-29 20:30:40+00:00,60,60,TypeScript,True,False,
...,...,...,...,...,...,...,...,...,...
28982,yinchuandong,yinchuandong/android_device_xiaomi_mione_plus,2014-01-08 03:35:21+00:00,0,0,Shell,True,False,
28983,yinchuandong,yinchuandong/javascript,2013-08-02 03:35:15+00:00,0,0,JavaScript,True,True,
28984,yinchuandong,yinchuandong/Markdown-Chinese-Demo,2014-10-21 00:59:52+00:00,0,0,,True,True,
28985,yinchuandong,yinchuandong/chnroutes,2014-08-08 02:15:35+00:00,0,0,,True,False,


In [None]:
# Export both tables with booleans written as lowercase "true"/"false".
# The loaded frames display real booleans (True/False/NaN), not the strings
# "TRUE"/"FALSE", so a string replace leaves them untouched and to_csv would
# still emit True/False. The mapping is applied to copies so users_df and
# repositories_df keep their boolean values for the analysis cells.
BOOLEAN_TEXT = {True: "true", False: "false"}

users_export = users_df.assign(
    hireable=users_df['hireable'].map(BOOLEAN_TEXT)  # missing values stay missing
)
repositories_export = repositories_df.assign(
    has_projects=repositories_df['has_projects'].map(BOOLEAN_TEXT),
    has_wiki=repositories_df['has_wiki'].map(BOOLEAN_TEXT),
)

# Define the file names to save the updated CSVs
users_file_name = "updated_users.csv"
repositories_file_name = "updated_repositories.csv"

# Save the converted copies to CSV files
users_export.to_csv(users_file_name, index=False)
repositories_export.to_csv(repositories_file_name, index=False)

# Trigger browser downloads of the files (Colab only)
from google.colab import files
files.download(users_file_name)
files.download(repositories_file_name)

print("Files have been saved and are ready for download.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Files have been saved and are ready for download.


In [None]:
# Q1
# Five users with the most followers, highest first
top_users = users_df.sort_values(by="followers", ascending=False).iloc[:5]

# Comma-separated logins of those users
top_user_logins = ",".join(top_users['login'].tolist())

print(top_user_logins)

mosh-hamedani,TheCherno,haileys,rstacruz,jesseduffield


In [None]:
# Q2
# Five users with the oldest account-creation timestamps, oldest first
earliest_users = users_df.sort_values(by="created_at").iloc[:5]

# Comma-separated logins of those users
earliest_user_logins = ",".join(earliest_users['login'].tolist())

print(earliest_user_logins)

toolmantim,crafterm,dgoodlad,Sutto,mdub


In [None]:
# Q3
# License names of all repositories that have one
licenses = repositories_df['license_name'].dropna()

# Frequency table, most common first; keep the top three
license_frequencies = licenses.value_counts()
top_licenses = license_frequencies.iloc[:3]

# Comma-separated names of the three most common licenses
top_license_names = ",".join(top_licenses.index.tolist())

print(top_license_names)

MIT License,Other,Apache License 2.0


In [None]:
# Q4
# Company names with blanks treated as missing, then all missing values removed
companies = users_df['company'].replace("", float("NaN")).dropna()

# Company that occurs most often
company_frequencies = companies.value_counts()
top_company = company_frequencies.idxmax()

print(top_company)

MONASH UNIVERSITY


In [None]:
# Q5
# Repository languages with blanks treated as missing, then all missing values removed
languages = repositories_df['language'].replace("", float("NaN")).dropna()

# Language that occurs most often
language_frequencies = languages.value_counts()
top_language = language_frequencies.idxmax()

print("Most popular programming language:", top_language)

Most popular programming language: JavaScript


In [None]:
# Q6
# Users whose account was created after 2020-01-01
recent_users = users_df[users_df['created_at'] > '2020-01-01']

# Repositories owned by those users
recent_user_repos = repositories_df[repositories_df['login'].isin(recent_users['login'])]

# Languages with blanks treated as missing, then all missing values removed
languages = recent_user_repos['language'].replace("", float("NaN")).dropna()

# value_counts() is sorted most-common-first, so position 1 is the runner-up.
# (Taking idxmin() of the top two returns the *first* language when the two
# counts are tied or when only one language exists.)
language_counts = languages.value_counts()
second_most_common_language = language_counts.index[1] if len(language_counts) > 1 else None

print("Second most popular programming language among users who joined after 2020:", second_most_common_language)

Second most popular programming language among users who joined after 2020: JavaScript


In [None]:
# Q7
# Keep only repositories that have both a language and a star count
filtered_repos = repositories_df.dropna(subset=['language', 'stargazers_count'])

# Mean star count per language
stars_by_language = filtered_repos.groupby('language')['stargazers_count']
average_stars = stars_by_language.mean()

# Language whose repositories have the highest mean star count
highest_avg_language = average_stars.idxmax()

print("Language with the highest average number of stars per repository:", highest_avg_language)

Language with the highest average number of stars per repository: D


In [None]:
# Q8
# leader_strength = followers / (1 + following), stored as a new column on users_df
users_df['leader_strength'] = users_df['followers'].div(users_df['following'].add(1))

# Five users with the highest leader_strength, highest first
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).iloc[:5]

# Comma-separated logins of those users
top_leader_logins = ",".join(top_leaders['login'].tolist())

print(top_leader_logins)

mosh-hamedani,binarythistle,TheCherno,TuPayChain,rogerclarkmelbourne


In [None]:
# Q9
# Correlation between follower count and public repository count (printed unrounded)
follower_counts = users_df['followers']
public_repo_counts = users_df['public_repos']
correlation = follower_counts.corr(public_repo_counts)

print("Correlation between the number of followers and the number of public repositories:", correlation)

Correlation between the number of followers and the number of public repositories: 0.18805192159828962


In [None]:
# Q10
# Simple linear regression of followers (y) on public repository count (x)

from scipy import stats

repos_regression = stats.linregress(users_df['public_repos'], users_df['followers'])
slope, intercept, r_value, p_value, std_err = repos_regression

# Report the slope to 3 decimal places
print("Regression slope of followers on repos:", round(slope, 3))

Regression slope of followers on repos: 2.244


In [None]:
# Q11
# Drop rows with a missing flag first: astype(int) cannot convert NaN, so
# dropping afterwards would never have any effect. The result goes into a
# new frame so repositories_df itself is left unchanged for other cells.
repos_with_flags = repositories_df.dropna(subset=['has_projects', 'has_wiki'])

# Convert the boolean flags to integers (1 for True, 0 for False)
has_projects_flag = repos_with_flags['has_projects'].astype(int)
has_wiki_flag = repos_with_flags['has_wiki'].astype(int)

# Correlation between 'has_projects' and 'has_wiki'
correlation = has_wiki_flag.corr(has_projects_flag)

# Print the correlation rounded to 3 decimal places
print("Correlation between projects and wiki enabled:", round(correlation, 3))

Correlation between projects and wiki enabled: 0.37979129526231137


In [None]:
# Q12
# Row masks: users marked hireable, and users whose hireable value is missing
is_hireable = users_df['hireable'] == True
hireable_missing = users_df['hireable'].isna()

# Mean 'following' for hireable users (hireable=True)
avg_following_hireable = users_df.loc[is_hireable, 'following'].mean()

# Mean 'following' for users with no hireable value (treated as non-hireable)
avg_following_non_hireable = users_df.loc[hireable_missing, 'following'].mean()

# Both averages are printed unrounded
print("Average following for hireable users:", avg_following_hireable)
print("Average following for non-hireable users:", avg_following_non_hireable)

# Hireable minus non-hireable, printed unrounded
difference = avg_following_hireable - avg_following_non_hireable

print("Difference in average following:", difference)

Average following for hireable users: 97.23333333333333
Average following for non-hireable users: 143.25619834710744
Difference in average following: -46.02286501377411


In [None]:
# Q13
from scipy import stats
import re

# Keep only users with a bio; .copy() makes the result an independent frame so
# the column assignment below does not raise SettingWithCopyWarning.
users_with_bio = users_df.dropna(subset=['bio']).copy()

# Word count per bio: number of \w+ runs (Unicode-aware for str patterns)
users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(re.findall(r'\w+', x)))

# Linear regression of followers (y) on bio_word_count (x)
slope, intercept, r_value, p_value, std_err = stats.linregress(users_with_bio['bio_word_count'], users_with_bio['followers'])

# Print the slope, rounded to 3 decimal places
print("Regression slope of followers on bio word count:", round(slope, 3))

Regression slope of followers on bio word count: 6.47897661825488


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bio['bio_word_count'] = users_with_bio['bio'].apply(lambda x: len(re.findall(r'\w+', x)))


In [None]:
# Q14
# Re-parse 'created_at' as UTC timestamps (stored back on repositories_df)
repositories_df['created_at'] = pd.to_datetime(repositories_df['created_at'], utc=True)

# Day of week of creation (Monday=0 ... Saturday=5, Sunday=6), stored as a new column
repositories_df['created_day'] = repositories_df['created_at'].dt.dayofweek

# Repositories created on a Saturday or Sunday
created_on_weekend = repositories_df['created_day'].isin([5, 6])
weekend_repos = repositories_df[created_on_weekend]

# Weekend-created repository count per user, top five
weekend_counts = weekend_repos['login'].value_counts().iloc[:5]

# Comma-separated logins of those five users
top_5_users = ','.join(weekend_counts.index.tolist())
print("Top 5 users with most repositories created on weekends:", top_5_users)

Top 5 users with most repositories created on weekends: wolfeidau,karkranikhil,roachhd,plutext,rstacruz


In [None]:
# Q15
# Row masks reused by both fractions
hireable_mask = users_df['hireable'] == True
hireable_missing_mask = users_df['hireable'].isna()
has_email_mask = users_df['email'].notna()

# Share of hireable users (hireable=True) that list an email address
hireable_with_email_fraction = int((hireable_mask & has_email_mask).sum()) / int(hireable_mask.sum())

# Share of users with no hireable value (treated as non-hireable) that list an email address
non_hireable_with_email_fraction = int((hireable_missing_mask & has_email_mask).sum()) / int(hireable_missing_mask.sum())

# Hireable minus non-hireable, printed unrounded
difference = hireable_with_email_fraction - non_hireable_with_email_fraction

print("Difference in fraction of users with email (hireable vs non-hireable):",difference)

Difference in fraction of users with email (hireable vs non-hireable): 0.05500459136822772


In [None]:
# Q16
# Keep only users with a name; .copy() makes the result an independent frame so
# the column assignment below does not raise SettingWithCopyWarning.
users_with_names = users_df.dropna(subset=['name']).copy()

# Surname = last whitespace-separated word of the trimmed name
users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = users_with_names['surname'].value_counts()

# Highest occurrence count
max_count = surname_counts.max()

# All surnames sharing that count, listed alphabetically in case of a tie
most_common_surnames = sorted(surname_counts[surname_counts == max_count].index)

# Print the answer in the required format
print("Most common surname(s):", ", ".join(most_common_surnames))
print("Number of users with the most common surname:", max_count)

Most common surname(s): Jackson, Wang
Number of users with the most common surname: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]


In [None]:
# Inspect how often each extracted surname occurs
users_with_names.loc[:, 'surname'].value_counts()

Unnamed: 0_level_0,count
surname,Unnamed: 1_level_1
Wang,3
Jackson,3
Miller,2
Wu,2
Xu,2
...,...
Malseed,1
Sbarski,1
Camera,1
Donald,1


In [None]:
# Inspect how each name splits into whitespace-separated words
users_df.loc[:, 'name'].str.split()

Unnamed: 0,name
0,"[Mosh, Hamedani]"
1,"[Yan, Chernikov]"
2,"[Hailey, Somerville]"
3,"[Rico, Sta., Cruz]"
4,"[Jesse, Duffield]"
...,...
327,"[Mike, Williams]"
328,"[Matt, Bush]"
329,"[Liam, Arbuckle]"
330,"[Denis, Khoshaba]"


In [None]:
# Inspect the raw name column
users_df.loc[:, 'name']

Unnamed: 0,name
0,Mosh Hamedani
1,Yan Chernikov
2,Hailey Somerville
3,Rico Sta. Cruz
4,Jesse Duffield
...,...
327,Mike Williams
328,Matt Bush
329,Liam Arbuckle
330,Denis Khoshaba
