In [None]:
#Scraping of users.csv and repositories.csv datasets for analysis
#Basel:10 (Question)
import requests
import csv

#GitHub Token
token = '####'

def get_detailed_user_info(username, token):
    url = f"https://api.github.com/users/{username}"
    headers = {
        "Authorization": f"token {token}"
    }
    response = requests.get(url, headers=headers)
    return response.json()

def clean_company_name(company):
    if company:
        company = company.strip()
        if company.startswith('@'):
            company = company[1:]
        company = company.upper()
    return company

def get_users_in_basel_with_over_10_followers(token):
    search_url = "https://api.github.com/search/users"
    headers = {
        "Authorization": f"token {token}"
    }
    params = {
        "q": "location:Basel followers:>10",
        "per_page": 100,
        "page": 1
    }

    users = []
    while True:
        response = requests.get(search_url, headers=headers, params=params)
        response_data = response.json()
        users.extend(response_data.get('items', []))

        #Existence of another page
        if 'next' not in response.links:
            break

        params['page'] += 1

    detailed_users = []
    for user in users:
        user_info = get_detailed_user_info(user['login'], token)
        user_info['company'] = clean_company_name(user_info.get('company'))
        detailed_users.append(user_info)

    return detailed_users



detailed_users = get_users_in_basel_with_over_10_followers(token)

#Users Headers
headers = ["login", "name", "company", "location", "email", "hireable",
           "bio", "public_repos", "followers", "following", "created_at"]

with open('users.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()
    for user in detailed_users:
        writer.writerow({
            "login": user.get("login", ""),
            "name": user.get("name", ""),
            "company": user.get("company", ""),
            "location": user.get("location", ""),
            "email": user.get("email", ""),
            "hireable": str(user.get("hireable", "")).lower(),
            "bio": user.get("bio", ""),
            "public_repos": user.get("public_repos", ""),
            "followers": user.get("followers", ""),
            "following": user.get("following", ""),
            "created_at": user.get("created_at", "")
        })

def get_user_repositories(username, token):
    url = f"https://api.github.com/users/{username}/repos"
    headers = {
        "Authorization": f"token {token}"
    }
    params = {
        "sort": "pushed",
        "per_page": 100,
        "page": 1
    }

    repositories = []
    while True:
        response = requests.get(url, headers=headers, params=params)
        response_data = response.json()
        repositories.extend(response_data)

        #Existence of another page
        if len(response_data) < 100 or len(repositories) >= 500:
            break

        params['page'] += 1

    return repositories[:500]

#Read users from users.csv
users = []
with open('users.csv', newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        users.append(row['login'])

#Repo Headers
headers = ["login", "full_name", "created_at", "stargazers_count",
           "watchers_count", "language", "has_projects", "has_wiki", "license_name"]

with open('repositories.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=headers)
    writer.writeheader()

    for username in users:
        repos = get_user_repositories(username, token)
        for repo in repos:
            #Handling Null License
            license_name = None
            if repo.get("license") is not None:
                license_name = repo["license"].get("name")
            writer.writerow({
                "login": username,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", ""),
                "watchers_count": repo.get("watchers_count", ""),
                "language": repo.get("language", ""),
                "has_projects": str(repo.get("has_projects", "")).lower(),
                "has_wiki": str(repo.get("has_wiki", "")).lower(),
                "license_name": license_name or ""
            })


In [None]:
#1
import pandas as pd
#Read
df = pd.read_csv('users.csv')
#Sort
sorted_df = df.sort_values(by='followers', ascending=False)
#Top5
top_5_logins = sorted_df['login'].head(5)
print(','.join(top_5_logins))


tarsius,aalmiray,marcoroth,klmr,MrNeRF


In [None]:
#2
import pandas as pd
#Read
df = pd.read_csv('users.csv')
df['created_at'] = pd.to_datetime(df['created_at'])
#Sort
sorted_df = df.sort_values(by='created_at', ascending=True)
#Top5
top_5_logins = sorted_df['login'].head(5)
#Print
print(','.join(top_5_logins))


bennyzen,aalmiray,pvillega,tarsius,amaunz


In [None]:
#3
import pandas as pd
#Read
df = pd.read_csv('repositories.csv')
#Filter Missing
df = df.dropna(subset=['license_name'])
df = df[df['license_name'].str.strip() != '']
#Top3
top_3_licenses = df['license_name'].value_counts().head(3)
for license_name, count in top_3_licenses.items():
    print(f"{license_name}: {count}")


MIT License: 3089
Apache License 2.0: 1593
Other: 1073


In [None]:
#4
import pandas as pd
#Read
users_df = pd.read_csv('users.csv')
company_counts = users_df['company'].value_counts()
#Max
most_common_company = company_counts.idxmax()
print(most_common_company)


UNIVERSITY OF BASEL


In [None]:
#5
import pandas as pd
#read
repositories_df = pd.read_csv('repositories.csv')
language_counts = repositories_df['language'].value_counts()
#max
most_popular_language = language_counts.idxmax()
print(most_popular_language)


JavaScript


In [118]:
#6
import pandas as pd
from datetime import datetime
#Read
users_df = pd.read_csv('users.csv', parse_dates=['created_at'])
repositories_df = pd.read_csv('repositories.csv')
#Handling Timezone
users_df['created_at'] = pd.to_datetime(users_df['created_at'], format='%Y-%m-%dT%H:%M:%SZ').dt.tz_localize(None)
#Filter
filtered_users = users_df[users_df['created_at'] > datetime(2020, 1, 1)]
filtered_logins = filtered_users['login'].unique()
filtered_repositories = repositories_df[repositories_df['login'].isin(filtered_logins)]
#Frequency
language_counts = filtered_repositories['language'].value_counts()
#2nd Max
second_most_popular_language = language_counts.index[1] if len(language_counts) > 1 else None
print(second_most_popular_language)


HTML


In [None]:
#7
import pandas as pd
#Read
repositories_df = pd.read_csv('repositories.csv')
# GroupBy language and Calculate Avg
average_stars_by_language = repositories_df.groupby('language')['stargazers_count'].mean()
#Max Avg Stars
most_popular_language = average_stars_by_language.idxmax()
highest_average_stars = average_stars_by_language.max()

print(most_popular_language)


PureScript


In [None]:
#8
import pandas as pd
#Read
users_df = pd.read_csv('users.csv')
#Calculate Leader Strength
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
#Sort
sorted_users = users_df.sort_values(by='leader_strength', ascending=False)
#Top5
top_5_logins = sorted_users['login'].head(5).tolist()
top_5_logins_str = ','.join(top_5_logins)
print(top_5_logins_str)


dpryan79,wasserth,ravage84,elanmart,quadbiolab


In [None]:
#9
import pandas as pd
#Read
users_df = pd.read_csv('users.csv')
#Corellation
correlation = users_df['followers'].corr(users_df['public_repos'])
rounded_correlation = round(correlation, 3)

print(rounded_correlation)


0.345


In [None]:
#10
import pandas as pd
import statsmodels.api as sm
#Read
users_df = pd.read_csv('users.csv')
#X and Y
X = users_df['public_repos']
y = users_df['followers']
#Intercept
X = sm.add_constant(X)
#Linear Regression
model = sm.OLS(y, X).fit()
#Coefficient
repo_coefficient = model.params['public_repos']
print(f"{repo_coefficient:.3f}")


0.674


In [None]:
#11
import pandas as pd
#read
repositories_df = pd.read_csv('repositories.csv' , keep_default_na=False)
#Corellation
correlation = repositories_df['has_projects'].corr(repositories_df['has_wiki'])
print(round(correlation, 3))


0.262


In [None]:
#12
import pandas as pd
#read
users_df = pd.read_csv('users.csv')
#filter
hireable_users = users_df[users_df['hireable'] == 'true']
non_hireable_users = users_df[users_df['hireable'] != 'true']
#Average
avg_following_hireable = hireable_users['following'].mean()
avg_following_non_hireable = non_hireable_users['following'].mean()
#Diff
difference = round(avg_following_hireable - avg_following_non_hireable, 3)
print(difference)

45.899


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
#read
data = pd.read_csv('users.csv')
#Filter
data_with_bios = data.dropna(subset=['bio'])
#wordcount
data_with_bios.loc[:, 'bio_word_count'] = data_with_bios['bio'].apply(lambda x: len(x.split()))
#X and Y
X = data_with_bios['bio_word_count'].values.reshape(-1, 1)  # Independent variable
y = data_with_bios['followers'].values  # Dependent variable
regressor = LinearRegression()
regressor.fit(X, y)
#Slope
slope = regressor.coef_[0]
print(round(slope,3))


2.404


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_with_bios.loc[:, 'bio_word_count'] = data_with_bios['bio'].apply(lambda x: len(x.split()))


In [None]:
#14
import csv
from datetime import datetime
#weekend repo dict
weekend_repo_counts = {}
#read
with open('repositories.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        created_at = datetime.strptime(row['created_at'], '%Y-%m-%dT%H:%M:%SZ')
        if created_at.weekday() in (5, 6):
            login = row['login']
            if login in weekend_repo_counts:
                weekend_repo_counts[login] += 1
            else:
                weekend_repo_counts[login] = 1
#Sort
sorted_users = sorted(weekend_repo_counts.items(), key=lambda x: x[1], reverse=True)
#Top5
top_5_users = [user[0] for user in sorted_users[:5]]
print(','.join(top_5_users))


dpryan79,syzer,ioolkos,maysam,pvillega


In [None]:
#15
import pandas as pd
#read
users_df = pd.read_csv('users.csv')
#filter
hireable_users = users_df[users_df['hireable'] == 'true']
non_hireable_users = users_df[users_df['hireable'] != 'true']
#calculations
fraction_with_email_hireable = hireable_users['email'].notna().mean()
fraction_with_email_non_hireable = non_hireable_users['email'].notna().mean()
#diff
difference = fraction_with_email_hireable - fraction_with_email_non_hireable
print(f"{difference:.3f}")


0.066


In [None]:
#16
import csv
from collections import defaultdict
surname_counts = defaultdict(int)
#read
with open('users.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        name = row['name'].strip() if row['name'] else ''
        if name:
            surname = name.split()[-1]
            surname_counts[surname] += 1
#max count
max_count = max(surname_counts.values())
#surnames with max count
most_common_surnames = [surname for surname, count in surname_counts.items() if count == max_count]
#Sort
most_common_surnames.sort()
print(','.join(most_common_surnames))


Arnold,Brand,Christensen,Fink,GmbH,Group,Guggisberg,Landolt,Roth,Tan
