In [29]:
import requests
import pandas as pd

# Setup
token = 'Your Token'
headers = {'Authorization': f'token {token}'}
users_url = 'https://api.github.com/search/users?q=location:Delhi+followers:>100'

# Fetch Users
response = requests.get(users_url, headers=headers)
users = response.json().get('items', [])

# Extract User Data
user_data = []
for user in users:
    login = user['login']
    user_info = requests.get(f'https://api.github.com/users/{login}', headers=headers).json()
    user_data.append({
        'login': user_info.get('login', ''),
        'name': user_info.get('name', ''),
        'company': (user_info.get('company') or '').lstrip('@').strip().upper(),
        'location': user_info.get('location', ''),
        'email': user_info.get('email', ''),
        'hireable': str(user_info.get('hireable', '')).lower(),
        'bio': user_info.get('bio', ''),
        'public_repos': user_info.get('public_repos', 0),
        'followers': user_info.get('followers', 0),
        'following': user_info.get('following', 0),
        'created_at': user_info.get('created_at', '')
    })

# Save Users to CSV
users_df = pd.DataFrame(user_data)
users_df.to_csv(tpath + 'users.csv', index=False)


In [32]:
repo_data = []

for user in users:
    login = user['login']
    repos_url = f'https://api.github.com/users/{login}/repos?per_page=500'
    repos_response = requests.get(repos_url, headers=headers).json()

    for repo in repos_response:
        repo_data.append({
            'login': login,
            'full_name': repo.get('full_name', ''),
            'created_at': repo.get('created_at', ''),
            'stargazers_count': repo.get('stargazers_count', 0),
            'watchers_count': repo.get('watchers_count', 0),
            'language': repo.get('language', ''),
            'has_projects': str(repo.get('has_projects', '')).lower(),
            'has_wiki': str(repo.get('has_wiki', '')).lower(),
            'license_name': (repo.get('license') or {}).get('key', '')
        })

# Save Repositories to CSV
repos_df = pd.DataFrame(repo_data)
repos_df.to_csv(tpath+'repositories.csv', index=False)


In [33]:
# Load Data
users_df = pd.read_csv(tpath+'users.csv')
repos_df = pd.read_csv(tpath+'repositories.csv')

# Q1: Top 5 users by followers
top_users = users_df.sort_values(by='followers', ascending=False).head(5)['login'].tolist()
print(','.join(top_users))

# Q2: Earliest registered users
earliest_users = users_df.sort_values(by='created_at').head(5)['login'].tolist()
print(','.join(earliest_users))

# Q3: Top 3 licenses
top_licenses = repos_df['license_name'].value_counts().head(3).index.tolist()
print(','.join(top_licenses))

# Q4: Most common company
most_common_company = users_df['company'].mode().values[0]
print(most_common_company)

# Q5: Most popular language
most_popular_language = repos_df['language'].mode().values[0]
print(most_popular_language)

# Q6: Second most popular language for post-2020 users
users_2020 = users_df[pd.to_datetime(users_df['created_at']) > '2020-01-01']
second_popular_language = repos_df[repos_df['login'].isin(users_2020['login'])]['language'].value_counts().index[1]
print(second_popular_language)

# Q9: Correlation between followers and public repos
correlation = users_df['followers'].corr(users_df['public_repos'])
print(f'{correlation:.3f}')


amitshekhariitbhu,shradha-khapra,loveBabbar,Nakshatra05,Anuj-Kumar-Sharma
nathvarun,aviaryan,rishikksh20,manrajgrover,the-dagger
mit,apache-2.0,other
CODEDAMN
JavaScript
Jupyter Notebook
-0.441


In [34]:
# Group repositories by language and calculate the average stars
avg_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean().sort_values(ascending=False)

# Get the language with the highest average stars
highest_avg_stars_language = avg_stars_per_language.index[0]
print(highest_avg_stars_language)


Go


In [35]:
# Calculate leader strength as followers / (1 + following)
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Get top 5 users by leader strength
top_leaders = users_df.sort_values(by='leader_strength', ascending=False).head(5)['login'].tolist()
print(','.join(top_leaders))


Anuj-Kumar-Sharma,Ignitetechnologies,shradha-khapra,loveBabbar,amitshekhariitbhu


In [36]:
from sklearn.linear_model import LinearRegression
import numpy as np

# Prepare data for regression
X = users_df[['public_repos']]
y = users_df['followers']

# Perform linear regression
reg = LinearRegression().fit(X, y)
slope = reg.coef_[0]

print(f'{slope:.3f}')




-13.278


## Question 11: Correlation Between Projects and Wiki Enabled


In [54]:
# Replace missing or invalid values with 0 (assuming no project/wiki in such cases)
repos_df['has_projects'] = repos_df['has_projects'].map({'true': 1, 'false': 0}).fillna(0).astype(int)
repos_df['has_wiki'] = repos_df['has_wiki'].map({'true': 1, 'false': 0}).fillna(0).astype(int)


# Check if there are enough non-zero entries to compute correlation
if repos_df[['has_projects', 'has_wiki']].dropna().shape[0] > 1:
    correlation_projects_wiki = repos_df['has_projects'].corr(repos_df['has_wiki'])
    print(f'{correlation_projects_wiki:.3f}')
else:
    print("Not enough data to compute correlation. Check if your dataset is correct.")

    
print("Has Projects Enabled:", repos_df['has_projects'].sum())
print("Has Wiki Enabled:", repos_df['has_wiki'].sum())


nan
Has Projects Enabled: 0
Has Wiki Enabled: 0


  c /= stddev[:, None]


# Question 12: Difference in Average Following for Hireable vs. Non-Hireable Users

In [45]:
# Calculate average following for hireable and non-hireable users
hireable_avg_following = users_df[users_df['hireable'] == 'true']['following'].mean()
non_hireable_avg_following = users_df[users_df['hireable'] != 'true']['following'].mean()

# Calculate the difference
difference = hireable_avg_following - non_hireable_avg_following

print(f'{difference:.3f}')


-1826.187


# Question 13: Regression Slope of Followers on Bio Word Count

In [46]:
# Calculate word count of bio (ignore missing bios)
users_df['bio_word_count'] = users_df['bio'].fillna('').apply(lambda x: len(x.split()))

# Prepare data for regression
X = users_df[['bio_word_count']]
y = users_df['followers']

# Perform linear regression
reg = LinearRegression().fit(X, y)
slope = reg.coef_[0]

print(f'{slope:.3f}')



137.816


## Question 14: Top 5 Users with Most Repos Created on Weekends

In [47]:
# Convert 'created_at' to datetime
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Filter for weekend repositories (Saturday=5, Sunday=6)
weekend_repos = repos_df[repos_df['created_at'].dt.weekday >= 5]

# Count repositories per user and get top 5
top_weekend_users = weekend_repos['login'].value_counts().head(5).index.tolist()
print(','.join(top_weekend_users))


AkshayAnand2002,manrajgrover,mehulmpt,ShreyaPrasad1209,saumya1singh


# 15 

In [48]:
# Calculate fraction of users with email for hireable and non-hireable users
hireable_with_email = users_df[users_df['hireable'] == 'true']['email'].notna().mean()
non_hireable_with_email = users_df[users_df['hireable'] != 'true']['email'].notna().mean()

# Calculate the difference
email_fraction_difference = hireable_with_email - non_hireable_with_email

print(f'{email_fraction_difference:.3f}')


0.201


## 16

In [49]:
# Extract surname (last word in name) and clean up missing names
users_df['surname'] = users_df['name'].fillna('').apply(lambda x: x.strip().split()[-1] if x.strip() else '')

# Find the most common surname(s)
most_common_surname = users_df['surname'].value_counts()
top_surnames = most_common_surname[most_common_surname == most_common_surname.max()].index.tolist()

# Print surnames in alphabetical order
print(','.join(sorted(top_surnames)))


Singh
