In [None]:
!pip install requests pandas python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1


In [None]:
import os
os.environ["GITHUB_TOKEN"] = "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"


In [None]:
import requests
import pandas as pd

# Set up headers with authentication
headers = {
    "Authorization": f"token {os.getenv('GITHUB_TOKEN')}"
}

def fetch_users(location="Barcelona", min_followers=100, per_page=100, max_pages=5):
    users = []
    url = "https://api.github.com/search/users"
    query = f"location:{location} followers:>{min_followers}"

    for page in range(1, max_pages + 1):
        params = {
            "q": query,
            "per_page": per_page,
            "page": page
        }
        response = requests.get(url, headers=headers, params=params)
        data = response.json()

        if "items" in data:
            users.extend(data["items"])
        else:
            break

    return users

# Fetch users and check the result
users = fetch_users()
print(f"Total users fetched: {len(users)}")


Total users fetched: 337


In [None]:
def fetch_user_details(usernames):
    user_data = []

    for username in usernames:
        url = f"https://api.github.com/users/{username}"
        response = requests.get(url, headers=headers)
        user_info = response.json()

        # Ensure 'company' is a string or default to an empty string
        company = user_info.get("company", "")
        if company is not None:
            company = company.strip().lstrip("@").upper()
        else:
            company = ""

        # Add only relevant fields
        user_data.append({
            "login": user_info.get("login", ""),
            "name": user_info.get("name", ""),
            "company": company,
            "location": user_info.get("location", ""),
            "email": user_info.get("email", ""),
            "hireable": user_info.get("hireable", ""),
            "bio": user_info.get("bio", ""),
            "public_repos": user_info.get("public_repos", 0),
            "followers": user_info.get("followers", 0),
            "following": user_info.get("following", 0),
            "created_at": user_info.get("created_at", "")
        })

    return user_data

# Example usage
usernames = [user["login"] for user in users]  # Extract usernames from fetched users
user_details = fetch_user_details(usernames)

# Save to CSV
user_df = pd.DataFrame(user_details)
user_df.to_csv("users.csv", index=False)
print("User details saved to users.csv")


User details saved to users.csv


In [None]:
def fetch_user_repos(username, max_repos=500):
    repos = []
    url = f"https://api.github.com/users/{username}/repos"
    params = {
        "sort": "pushed",  # Sort by most recently pushed repositories
        "per_page": 100
    }

    page = 1
    while len(repos) < max_repos:
        response = requests.get(url, headers=headers, params={**params, "page": page})
        page_repos = response.json()

        if not page_repos:
            break

        for repo in page_repos:
            # Ensure that nested 'license' field is accessed safely
            license_name = repo.get("license", {}).get("key") if repo.get("license") else ""

            repos.append({
                "login": username,
                "full_name": repo.get("full_name", ""),
                "created_at": repo.get("created_at", ""),
                "stargazers_count": repo.get("stargazers_count", 0),
                "watchers_count": repo.get("watchers_count", 0),
                "language": repo.get("language", ""),
                "has_projects": repo.get("has_projects", ""),
                "has_wiki": repo.get("has_wiki", ""),
                "license_name": license_name
            })

        if len(page_repos) < params["per_page"]:
            break

        page += 1

    return repos

# Fetch repositories for each user and save to CSV
repo_data = []
for username in usernames:
    repo_data.extend(fetch_user_repos(username))

# Save to CSV
repo_df = pd.DataFrame(repo_data)
repo_df.to_csv("repositories.csv", index=False)
print("Repository details saved to repositories.csv")


Repository details saved to repositories.csv


In [None]:
from google.colab import files

files.download("users.csv")
files.download("repositories.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

# Load data from CSV files
users_df = pd.read_csv("users.csv")
repos_df = pd.read_csv("repositories.csv")


In [None]:
top_5_followers = users_df.nlargest(5, 'followers')['login']
print("Top 5 users with the highest followers:", ", ".join(top_5_followers))


Top 5 users with the highest followers: midudev, ai, raysan5, vfarcic, spite


In [None]:
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
earliest_5_users = users_df.nsmallest(5, 'created_at')['login']
print("5 earliest registered users:", ", ".join(earliest_5_users))


5 earliest registered users: oleganza, gravityblast, fesplugas, fxn, pauek


In [None]:
# Drop any rows where license_name is empty
licenses = repos_df[repos_df['license_name'] != ""]
top_3_licenses = licenses['license_name'].value_counts().nlargest(3).index
print("3 most popular licenses:", ", ".join(top_3_licenses))


3 most popular licenses: mit, apache-2.0, other


In [None]:
most_common_company = users_df['company'].mode()[0]
print("Most common company:", most_common_company)


Most common company: FREELANCE


In [None]:
most_popular_language = repos_df['language'].mode()[0]
print("Most popular programming language:", most_popular_language)


Most popular programming language: JavaScript


In [None]:
# Filter users who joined after 2020 and get their repositories
after_2020 = users_df[users_df['created_at'] >= '2020-01-01']
repos_after_2020 = repos_df[repos_df['login'].isin(after_2020['login'])]

second_most_popular_language = repos_after_2020['language'].value_counts().nlargest(2).index[1]
print("Second most popular language after 2020:", second_most_popular_language)


Second most popular language after 2020: Python


In [None]:
# Calculate average stars per language
avg_stars_language = repos_df.groupby('language')['stargazers_count'].mean().idxmax()
print("Language with the highest average stars:", avg_stars_language)


Language with the highest average stars: Vim Script


In [None]:
# Calculate leader strength and find top 5
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])
top_5_leader_strength = users_df.nlargest(5, 'leader_strength')['login']
print("Top 5 users by leader strength:", ", ".join(top_5_leader_strength))


Top 5 users by leader strength: midudev, vfarcic, spite, amix, cfenollosa


In [None]:
correlation_followers_repos = users_df['followers'].corr(users_df['public_repos'])
print("Correlation between followers and public repos:", round(correlation_followers_repos, 3))


Correlation between followers and public repos: 0.071


In [None]:
from scipy.stats import linregress

slope, intercept, _, _, _ = linregress(users_df['public_repos'], users_df['followers'])
print("Regression slope of followers on repos:", round(slope, 3))


Regression slope of followers on repos: 1.031


In [None]:
correlation_projects_wiki = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print("Correlation between projects and wiki enabled:", round(correlation_projects_wiki, 3))


Correlation between projects and wiki enabled: 0.323


In [None]:
# Check for NaN values in the 'hireable' and 'following' columns
print("NaN values in 'hireable' column:", users_df['hireable'].isna().sum())
print("NaN values in 'following' column:", users_df['following'].isna().sum())


NaN values in 'hireable' column: 0
NaN values in 'following' column: 0


In [None]:
print(users_df.dtypes)

login                           object
name                            object
company                         object
location                        object
email                           object
hireable                          bool
bio                             object
public_repos                     int64
followers                        int64
following                        int64
created_at         datetime64[ns, UTC]
leader_strength                float64
dtype: object


In [None]:
# Check unique values in 'hireable' to see if there are any unexpected entries
print("Unique values in 'hireable':", users_df['hireable'].unique())


Unique values in 'hireable': [False]


In [None]:
print(users_df['hireable'].value_counts())


hireable
False    337
Name: count, dtype: int64


In [None]:
# Display all unique values in the 'hireable' column along with their counts
print(users_df['hireable'].value_counts(dropna=False))


hireable
False    337
Name: count, dtype: int64


In [None]:
# Display rows where hireable is "TRUE"
print(users_df[users_df['hireable'] == "TRUE"])


Empty DataFrame
Columns: [login, name, company, location, email, hireable, bio, public_repos, followers, following, created_at, leader_strength]
Index: []


In [None]:
# Add bio word count column
users_df['bio_word_count'] = users_df['bio'].fillna('').apply(lambda x: len(x.split()))

# Perform regression on bio word count and followers
slope, _, _, _, _ = linregress(users_df['bio_word_count'], users_df['followers'])
print("Regression slope of followers on bio word count:", round(slope, 3))


Regression slope of followers on bio word count: 19.02


In [None]:
# Parse created_at for repos and filter by weekend
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])
repos_df['is_weekend'] = repos_df['created_at'].dt.weekday >= 5

weekend_repos = repos_df[repos_df['is_weekend']].groupby('login').size().nlargest(5).index
print("Top 5 users with most repos created on weekends:", ", ".join(weekend_repos))


Top 5 users with most repos created on weekends: kinow, nilportugues, ajsb85, vfarcic, wlsf82


In [None]:
# Count the total number of users for hireable and non-hireable groups
total_hireable = users_df[users_df['hireable']].shape[0]
total_non_hireable = users_df[~users_df['hireable']].shape[0]

# Count users with email addresses for each group
email_hireable = users_df[users_df['hireable']]['email'].notna().sum()
email_non_hireable = users_df[~users_df['hireable']]['email'].notna().sum()

# Initialize fractions to 0
fraction_hireable = 0
fraction_non_hireable = 0

# Calculate the fractions if there are users in the respective groups
if total_hireable > 0:
    fraction_hireable = email_hireable / total_hireable

if total_non_hireable > 0:
    fraction_non_hireable = email_non_hireable / total_non_hireable

# Calculate the difference
email_difference = round(fraction_hireable - fraction_non_hireable, 3)

print("Fraction of users with email when hireable=true:", round(fraction_hireable, 3))
print("Fraction of users with email when hireable=false:", round(fraction_non_hireable, 3))
print("Difference in fraction of users with email (hireable - non-hireable):", email_difference)



Fraction of users with email when hireable=true: 0
Fraction of users with email when hireable=false: 0.496
Difference in fraction of users with email (hireable - non-hireable): -0.496


In [None]:
# Display the first few rows of the 'name' column
print(users_df['name'].head(10))


0    Miguel Ángel Durán
1         Andrey Sitnik
2                   Ray
3         Viktor Farcic
4         Jaume Sanchez
5    Julien Le Coupanec
6          Pedro Duarte
7                    JK
8        Davide Faconti
9         Carlos Cuesta
Name: name, dtype: object


In [None]:
# Check for NaN values in the 'name' column
print("NaN values in 'name' column:", users_df['name'].isna().sum())

# Check for empty strings
empty_names = users_df[users_df['name'].str.strip() == '']
print("Rows with empty 'name':", empty_names.shape[0])

NaN values in 'name' column: 7
Rows with empty 'name': 0


In [None]:
# Step 1: Extract the last names with debugging
def extract_surname(name):
    if isinstance(name, str) and len(name.strip()) > 0:
        surname = name.strip().split()[-1]
        print(f"Extracted surname: {surname}")  # Debug output
        return surname
    else:
        print("No valid name found.")
        return ''

users_df['surname'] = users_df['name'].apply(extract_surname)

# Display the first few extracted surnames
print(users_df['surname'].head(10))

Extracted surname: Durán
Extracted surname: Sitnik
Extracted surname: Ray
Extracted surname: Farcic
Extracted surname: Sanchez
Extracted surname: Coupanec
Extracted surname: Duarte
Extracted surname: JK
Extracted surname: Faconti
Extracted surname: Cuesta
Extracted surname: Csárdi
Extracted surname: Salihefendic
Extracted surname: Noria
Extracted surname: Iturbides
Extracted surname: Gómez
Extracted surname: Fenollosa
Extracted surname: Orlandi
Extracted surname: Fueris
Extracted surname: Walmyr
Extracted surname: Bastidas
Extracted surname: Buenosvinos
Extracted surname: Mario
Extracted surname: Rubanov
Extracted surname: González
Extracted surname: Héctor
Extracted surname: Agenjo
Extracted surname: Kaufmann
Extracted surname: GM
Extracted surname: Ansio
Extracted surname: Andreev
Extracted surname: Climent
Extracted surname: Lange
Extracted surname: Bartolome
Extracted surname: Pavlutin
Extracted surname: Toolkit
Extracted surname: Perry
Extracted surname: Cornellà
Extracted surname

In [None]:
# Remove blank surnames from the DataFrame
users_df_filtered = users_df[users_df['surname'] != '']

# Count occurrences of each surname from the filtered DataFrame
surname_counts = users_df_filtered['surname'].value_counts()

# Get the top 2 most common surnames
top_surnames = surname_counts.nlargest(2)

# Check if there is a second most common surname
if len(top_surnames) > 1:
    second_most_common_surname = top_surnames.index[1]
else:
    second_most_common_surname = "No second most common surname found"

# Print the result
print("Second most common surname:", second_most_common_surname)


Second most common surname: Ortiz


In [None]:
# Remove blank surnames from the DataFrame
users_df_filtered = users_df[users_df['surname'] != '']

# Count occurrences of each surname from the filtered DataFrame
surname_counts = users_df_filtered['surname'].value_counts()

# Get the top 2 most common surnames
top_surnames = surname_counts.nlargest(3)

# Check if there is a second most common surname
if len(top_surnames) > 2:
    second_most_common_surname = top_surnames.index[2]
else:
    second_most_common_surname = "No second most common surname found"

# Print the result
print("Second most common surname:", second_most_common_surname)


Second most common surname: Pérez


In [None]:
# Step 1: Filter out empty surnames
valid_surnames = users_df['surname'][users_df['surname'] != '']

# Step 2: Count the occurrences of each surname
surname_counts = valid_surnames.value_counts()

# Step 3: Get the maximum count
max_count = surname_counts.max()

# Step 4: Get all surnames with the maximum count
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Step 5: Sort the surnames alphabetically
most_common_surnames.sort()

# Step 6: Join the surnames for output
output_surnames = ', '.join(most_common_surnames)

print("Most common surname(s):", output_surnames)


Most common surname(s): Martínez, Ortiz


In [None]:
# Remove blank surnames from the DataFrame
users_df_filtered = users_df[users_df['surname'] != '']

# Count occurrences of each surname from the filtered DataFrame
surname_counts = users_df_filtered['surname'].value_counts()

# Get the top 5 most common surnames
top_surnames = surname_counts.nlargest(5)

# Print the results
print("Top 5 most common surnames with counts:")
for surname, count in top_surnames.items():
    print(f"{surname}: {count}")


Top 5 most common surnames with counts:
Martínez: 3
Ortiz: 3
Pérez: 2
Academy: 2
Sanchez: 2


In [None]:
# Assuming your repositories DataFrame is called repositories_df
correlation = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))
print("Correlation between projects and wiki enabled:", round(correlation, 3))


Correlation between projects and wiki enabled: 0.323


In [None]:
# Calculate the correlation
correlation_projects_wiki = repos_df['has_projects'].astype(int).corr(repos_df['has_wiki'].astype(int))

# Print the correlation rounded to 3 decimal places
print("Correlation between projects and wiki enabled:", round(correlation_projects_wiki, 3))


Correlation between projects and wiki enabled: 0.323


In [56]:
import pandas as pd
from scipy.stats import linregress

# Ensure you load your data into a DataFrame
# For example, if you're reading from a CSV:
# users_df = pd.read_csv('path_to_your_users.csv')

# OR, if you have a list of users dictionaries:
# users_df = pd.DataFrame(users)

# Check the DataFrame
print(users_df.head())  # Print the first few rows to verify data

# Filter out users with a non-empty bio
users_with_bio = users_df[users_df['bio'].notna()].copy().reset_index(drop=True)

# Calculate the length of each bio
users_with_bio['bio_length'] = users_with_bio['bio'].str.strip().str.split().str.len()

# Perform linear regression
slope = linregress(users_with_bio['bio_length'], users_with_bio['followers']).slope

# Print the slope rounded to 3 decimal places
print(f"Regression slope of followers on bio word count: {slope:.3f}")


     login                name          company            location  \
0  midudev  Miguel Ángel Durán              NaN           Barcelona   
1       ai       Andrey Sitnik     EVILMARTIANS    Barcelona, Spain   
2  raysan5                 Ray       RAYLIBTECH           Barcelona   
3  vfarcic       Viktor Farcic          UPBOUND    Barcelona, Spain   
4    spite       Jaume Sanchez  GOOGLE-DEEPMIND  London · Barcelona   

                      email  hireable  \
0          miduga@gmail.com     False   
1          andrey@sitnik.ru     False   
2         raysan5@gmail.com     False   
3         viktor@farcic.com     False   
4  hello@clicktorelease.com     False   

                                                 bio  public_repos  followers  \
0  Te enseño Programación y Desarrollo Web. Cread...           194      28304   
1  The creator of Autoprefixer, @postcss, @browse...            85       9158   
2  I make tools and technology for videogames dev...            26       3572   
3 

https://github.com/AkshatGupta327/TDS_proj_1
midudev,ai,raysan5,vfarcic,spite
oleganza,gravityblast,fesplugas,fxn,pauek
mit,apache-2.0,other
FREELANCE
JavaScript
Python
Vim Script
midudev,vfarcic,spite,amix,cfenollosa
0.071
1.031
NaN
NaN
13.733
kinow,nilportugues,ajsb85,vfarcic,wlsf82
NaN
Martínez,Ortiz