**Starting with importing required libraries**

In [17]:
import pandas as pd
import numpy as np
import csv
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt

**Mounting Drive and uploading both csv files to it**

In [2]:
# Colab-only step: mount the user's Google Drive at /content/drive.
# NOTE(review): the analysis cells below open 'users.csv' and 'repositories.csv'
# by relative path (current working directory), not via /content/drive —
# confirm the files are available there or adjust the paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Q1 Who are the top 5 users in Tokyo with the highest number of followers? List their login in order, comma-separated.


In [13]:
# Gather login / follower-count pairs for every user whose location mentions Tokyo
# (case-insensitive substring match on the trimmed location).
with open('users.csv', 'r', encoding='utf-8') as csv_handle:
    users_in_Tokyo = [
        {'login': record['login'], 'followers': int(record['followers'])}
        for record in csv.DictReader(csv_handle)
        if 'tokyo' in record['location'].strip().lower()
    ]

# Rank by follower count, largest first; the sort is stable, so ties keep file order.
top_users = sorted(users_in_Tokyo, key=lambda user: user['followers'], reverse=True)

# The answer is the first five logins, comma-separated.
top_5_logins = [entry['login'] for entry in top_users[:5]]
print(','.join(top_5_logins))


dennybritz,wasabeef,dai-shi,rui314,domenic


# Q2  Who are the 5 earliest registered GitHub users in Tokyo? List their login in ascending order of created_at, comma-separated.

In [16]:
# Gather login / registration-time pairs for every user whose location mentions Tokyo.
# created_at is parsed with the fixed 'YYYY-MM-DDTHH:MM:SSZ' layout.
with open('users.csv', 'r', encoding='utf-8') as csv_handle:
    users_in_tokyo = [
        {
            'login': record['login'],
            'created_at': datetime.strptime(record['created_at'], '%Y-%m-%dT%H:%M:%SZ'),
        }
        for record in csv.DictReader(csv_handle)
        if 'tokyo' in record['location'].strip().lower()
    ]

# Oldest accounts first.
sorted_users = sorted(users_in_tokyo, key=lambda user: user['created_at'])

# The answer is the first five logins, comma-separated.
top_5_earliest_logins = [entry['login'] for entry in sorted_users[:5]]
print(','.join(top_5_earliest_logins))

kana,kakutani,mootoh,lhl,walf443


# Q3  What are the 3 most popular licenses among these users? Ignore missing licenses. List the license_name in order, comma-separated.

In [18]:
# Collect every non-blank license_name from the repository listing.
with open('repositories.csv', 'r', encoding='utf-8') as csv_handle:
    trimmed_names = (
        record.get('license_name', '').strip()
        for record in csv.DictReader(csv_handle)
    )
    # Blank entries mean "no license" and are ignored.
    licenses = [name for name in trimmed_names if name]

# Frequency table of license names.
license_counts = Counter(licenses)

# Keep only the names of the three most frequent licenses.
top_3_licenses = [name for name, _ in license_counts.most_common(3)]
print(','.join(top_3_licenses))


mit,apache-2.0,other


# Q4  Which company do the majority of these developers work at?

In [19]:
# Collect every non-blank company value from the user listing.
with open('users.csv', 'r', encoding='utf-8') as csv_handle:
    trimmed_companies = (
        record.get('company', '').strip()
        for record in csv.DictReader(csv_handle)
    )
    # Users without a company are left out of the tally.
    companies = [name for name in trimmed_companies if name]

# Frequency table of company names.
company_counts = Counter(companies)

# most_common(1) is an empty list when no company was recorded at all.
most_common_company = company_counts.most_common(1)
print(most_common_company[0][0] if most_common_company else "No company data found.")


GOOGLE


# Q5  Which programming language is most popular among these users?

In [20]:
# Collect the language of every repository, skipping blank entries.
with open('repositories.csv', 'r', encoding='utf-8') as csv_handle:
    trimmed_languages = (
        record.get('language', '').strip()
        for record in csv.DictReader(csv_handle)
    )
    # filter(None, ...) drops the empty strings left by repositories with no language.
    languages = list(filter(None, trimmed_languages))

# Frequency table of languages.
language_counts = Counter(languages)

# most_common(1) is an empty list when no language was recorded at all.
most_common_language = language_counts.most_common(1)
if not most_common_language:
    print("No language data found.")
else:
    print(most_common_language[0][0])

JavaScript


# Q6  Which programming language is the second most popular among users who joined after 2020?

In [29]:
# Read both tables
users_df = pd.read_csv('users.csv')
repos_df = pd.read_csv('repositories.csv')

# Parse the registration timestamps, then keep accounts created on or after 2020-01-01
users_df['created_at'] = pd.to_datetime(users_df['created_at'])
joined_recently = users_df['created_at'] >= '2020-01-01'
recent_users = users_df[joined_recently]

# Inner-join on login so only repositories owned by those users remain,
# then count repositories per language (value_counts sorts by frequency, descending)
recent_user_repos = recent_users[['login']].merge(repos_df[['login', 'language']], on='login')
language_counts = recent_user_repos['language'].value_counts()

# Runner-up language, or None when fewer than two languages are present
if len(language_counts) > 1:
    second_most_popular_language = language_counts.index[1]
else:
    second_most_popular_language = None
second_most_popular_language






'Rust'

# Q7 Which language has the highest average number of stars per repository?

In [28]:
from collections import defaultdict

# Per-language running totals: summed stars and number of repositories seen
language_stats = defaultdict(lambda: {'stars': 0, 'repos': 0})

with open('repositories.csv', 'r', encoding='utf-8') as csv_handle:
    for record in csv.DictReader(csv_handle):
        language = record.get('language', '').strip()
        stars = record.get('stargazers_count', '0').strip()

        # Skip repositories with no language or a star count that is not a plain digit string
        if not language or not stars.isdigit():
            continue

        totals = language_stats[language]
        totals['stars'] += int(stars)
        totals['repos'] += 1

# Mean stars per repository for each language
average_stars_per_language = {}
for name, totals in language_stats.items():
    if totals['repos'] > 0:
        average_stars_per_language[name] = totals['stars'] / totals['repos']

# Report the language with the largest mean (first one wins on ties)
if not average_stars_per_language:
    print("No language data found.")
else:
    most_popular_language = max(average_stars_per_language.items(), key=lambda item: item[1])[0]
    print(most_popular_language)


Assembly


# Q8 Let's define leader_strength as followers / (1 + following). Who are the top 5 in terms of leader_strength? List their login in order, comma-separated.




In [31]:
def _parse_count(raw):
    """Convert a CSV count field to int, treating a missing or blank field as 0.

    csv.DictReader yields None for fields absent from a short row and '' for
    blank fields; both are mapped to 0, matching how the correlation and
    regression cells in this notebook treat empty counts.
    """
    return int((raw or '').strip() or 0)


# (login, leader_strength) for every user, where
# leader_strength = followers / (1 + following)
leader_strengths = []

# Read the CSV file with UTF-8 encoding
with open('users.csv', 'r', encoding='utf-8') as file:
    reader = csv.DictReader(file)

    for row in reader:
        followers = _parse_count(row.get('followers'))
        following = _parse_count(row.get('following'))

        # The +1 in the denominator keeps the ratio defined for users who follow nobody
        leader_strength = followers / (1 + following)

        # `or ''` keeps the later join() safe if the login field is missing
        leader_strengths.append((row.get('login') or '', leader_strength))

# Strongest leaders first (stable sort: ties keep file order)
leader_strengths.sort(key=lambda x: x[1], reverse=True)

# Get the top 5 users
top_5_leaders = [login for login, _ in leader_strengths[:5]]

# Print the result as a comma-separated list
print(','.join(top_5_leaders))

blueimp,dai-shi,asahilina,pilcrowonpaper,marcan


# Q9 What is the correlation between the number of followers and the number of public repositories among users in Tokyo?

Correlation between followers and repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [32]:
# (followers, public_repos) pairs for users whose location mentions Tokyo
tokyo_pairs = []

with open('users.csv', 'r', encoding='utf-8') as csv_handle:
    for record in csv.DictReader(csv_handle):
        # Case-insensitive substring match on the trimmed location
        if "tokyo" not in record.get('location', '').strip().lower():
            continue

        try:
            # Blank fields count as zero; both values must parse for the row to be kept
            pair = (
                int(record['followers'].strip() or 0),
                int(record['public_repos'].strip() or 0),
            )
        except ValueError:
            # Non-numeric value: drop the whole row
            continue

        tokyo_pairs.append(pair)

# Split the pairs back into two parallel lists
followers = [follower_count for follower_count, _ in tokyo_pairs]
public_repos = [repo_count for _, repo_count in tokyo_pairs]

# Pearson correlation needs at least two observations
if len(tokyo_pairs) > 1:
    correlation_matrix = np.corrcoef(followers, public_repos)
    # Off-diagonal entry is the followers/public_repos coefficient
    correlation = correlation_matrix[0, 1]
    print(f"{correlation:.3f}")
else:
    print("Insufficient data for correlation calculation.")

0.051


# Q10  Does creating more repos help users get more followers? Using regression, estimate how many additional followers a user gets per additional public repository.

Regression slope of followers on repos (to 3 decimal places, e.g. 0.123 or -0.123)

In [33]:
from scipy.stats import linregress

# Parallel lists: follower count and public repository count per user
followers, public_repos = [], []

with open('users.csv', 'r', encoding='utf-8') as csv_handle:
    for record in csv.DictReader(csv_handle):
        try:
            # Blank fields count as zero; both values must parse for the row to be kept
            n_followers = int(record['followers'].strip() or 0)
            n_repos = int(record['public_repos'].strip() or 0)
        except ValueError:
            # Non-numeric value: drop the whole row
            continue

        followers.append(n_followers)
        public_repos.append(n_repos)

# A regression line needs at least two observations
if min(len(followers), len(public_repos)) > 1:
    # x = public repositories, y = followers, so the slope is followers per extra repository
    slope, intercept, r_value, p_value, std_err = linregress(public_repos, followers)
    print(f"{slope:.3f}")
else:
    print("Insufficient data for regression calculation.")

0.279


# Q11  Do people typically enable projects and wikis together? What is the correlation between a repo having projects enabled and having wiki enabled?


Correlation between projects and wiki enabled (to 3 decimal places, e.g. 0.123 or -0.123)

In [35]:
def _as_flag(column):
    """Return `column` with object-dtype entries parsed into True / False / None.

    Non-object columns (e.g. the bool column read_csv produces for clean
    true/false data) are returned unchanged. In an object column, real bools are
    kept, 'true'/'false' strings are matched case-insensitively after trimming,
    and anything else (including missing values) becomes None.
    """
    if column.dtype != 'object':
        return column

    def parse(value):
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        if isinstance(value, str):
            return {'true': True, 'false': False}.get(value.strip().lower())
        return None

    return column.map(parse)


def analyze_repo_features(csv_file):
    """Measure how often repositories enable projects and wikis together.

    Parameters
    ----------
    csv_file : str or path-like
        CSV file with ``has_projects`` and ``has_wiki`` columns.

    Returns
    -------
    tuple
        ``(correlation, stats)``. ``correlation`` is the Pearson correlation of
        the two flags rounded to 3 decimals, computed over rows where both flags
        are known (NaN if it cannot be computed). ``stats`` is a dict with
        ``total_repos`` (all rows), ``projects_enabled`` / ``wiki_enabled``
        (rows where that flag is known and true) and ``both_enabled`` /
        ``neither_enabled`` (rows where both flags are known).
    """
    df = pd.read_csv(csv_file)

    # A flag column containing missing values arrives as object dtype, so it is
    # parsed value by value instead of being mapped from string keys only.
    df['has_projects'] = _as_flag(df['has_projects'])
    df['has_wiki'] = _as_flag(df['has_wiki'])

    projects = df['has_projects']
    wiki = df['has_wiki']

    # Joint statistics only make sense on rows where both flags are known
    both_known = projects.notna() & wiki.notna()
    known_projects = projects[both_known].astype(bool)
    known_wiki = wiki[both_known].astype(bool)

    correlation = known_projects.astype(float).corr(known_wiki.astype(float))

    stats = {
        'total_repos': len(df),
        'projects_enabled': projects.dropna().astype(bool).sum(),
        'wiki_enabled': wiki.dropna().astype(bool).sum(),
        'both_enabled': (known_projects & known_wiki).sum(),
        'neither_enabled': (~known_projects & ~known_wiki).sum()
    }

    return round(correlation, 3), stats

# Run the analysis on the repository listing; only the rounded coefficient is reported
correlation, stats = analyze_repo_features('repositories.csv')
print("Correlation coefficient: {}".format(correlation))

Correlation coefficient: 0.426


# Q12 Do hireable users follow more people than those who are not hireable?


Average of following per user for hireable=true minus the average following for the rest (to 3 decimal places, e.g. 12.345 or -12.345)

In [36]:
def analyze_following_difference(users_csv_path='users.csv'):
    """Compare the mean `following` of hireable users with that of everyone else.

    Reads the CSV at `users_csv_path`, treats rows whose `hireable` value equals
    True as hireable and all remaining rows (False or missing) as the other
    group, prints the group sizes and means, and returns
    mean(hireable) - mean(rest) rounded to 3 decimals.
    """
    users = pd.read_csv(users_csv_path)

    # Element-wise comparison: a missing hireable value compares unequal to True,
    # so those rows fall into the "rest" group.
    is_hireable = users['hireable'].eq(True)
    hireable_group = users[is_hireable]
    other_group = users[~is_hireable]

    hireable_following = hireable_group['following'].mean()
    non_hireable_following = other_group['following'].mean()

    # Diagnostic summary of both groups
    print(f"Number of hireable users: {len(hireable_group)}")
    print(f"Number of non-hireable users: {len(other_group)}")
    print(f"Average following for hireable users: {hireable_following:.3f}")
    print(f"Average following for non-hireable users: {non_hireable_following:.3f}")

    return round(hireable_following - non_hireable_following, 3)

# Run the comparison on the default users.csv and report it to 3 decimals
result = analyze_following_difference()
print("\nDifference in average following: {:.3f}".format(result))

Number of hireable users: 182
Number of non-hireable users: 363
Average following for hireable users: 180.467
Average following for non-hireable users: 259.909

Difference in average following: -79.442


# Q13 Some developers write long bios. Does that help them get more followers? What's the impact of the length of their bio (in Unicode words, split by whitespace) with followers? (Ignore people without bios)


Regression slope of followers on bio word count (to 3 decimal places, e.g. 12.345 or -12.345)

In [39]:
from scipy.stats import linregress

# Load the users.csv file
users_df = pd.read_csv('users.csv')

# Build the regression table as a fresh frame (users_df itself is left untouched):
#  * followers is coerced with to_numeric so the check also works when the column
#    is float-typed; values that cannot be parsed become NaN and are dropped
#  * users without a bio are dropped
#  * bio length is the number of whitespace-separated words
#  * bios that contain only whitespace have zero words and count as "no bio"
filtered_users = (
    users_df
    .assign(followers=pd.to_numeric(users_df['followers'], errors='coerce'))
    .dropna(subset=['bio', 'followers'])
    .assign(
        followers=lambda frame: frame['followers'].astype(int),
        bio_word_count=lambda frame: frame['bio'].astype(str).str.split().str.len(),
    )
    .query('bio_word_count > 0')
)

# Perform linear regression: x = bio word count, y = followers
slope, intercept, r_value, p_value, std_err = linregress(
    filtered_users['bio_word_count'], filtered_users['followers']
)

# Output the regression slope rounded to 3 decimal places
slope_rounded = round(slope, 3)
slope_rounded

19.037

# Q14 Who created the most repositories on weekends (UTC)? List the top 5 users' login in order, comma-separated

In [40]:
# Number of repositories each user created on a Saturday or Sunday
weekend_repo_counts = Counter()

with open('repositories.csv', 'r', encoding='utf-8') as csv_handle:
    for record in csv.DictReader(csv_handle):
        timestamp = record.get('created_at', '')
        if not timestamp:
            continue

        # Drop the final character (expected to be the trailing 'Z') before parsing
        created_date = datetime.fromisoformat(timestamp[:-1])

        # weekday(): Monday is 0, so 5 and 6 are Saturday and Sunday
        if created_date.weekday() >= 5:
            weekend_repo_counts[record['login']] += 1

# Five most prolific weekend creators as (login, count) pairs
top_users = weekend_repo_counts.most_common(5)

# Keep only the logins
top_logins = [login for login, _ in top_users]

# Output the top users' logins as a comma-separated string
print(','.join(top_logins))

azu,suzuki-shunsuke,yuiseki,xuwei-k,zchee


# Q15 Do people who are hireable share their email addresses more often?


[fraction of users with email when hireable=true] minus [fraction of users with email for the rest] (to 3 decimal places, e.g. 0.123 or -0.123)

In [43]:
# Predicate used to decide whether a user shared an email address
def has_email(email):
    """Return True when `email` is present and not just whitespace."""
    if pd.isna(email):
        return False
    return email.strip() != ''

# Split users_df (loaded by an earlier cell) into hireable users and everyone else;
# a missing hireable value compares unequal to True and lands in the second group
hireable_mask = users_df['hireable'].eq(True)
hireable_users = users_df[hireable_mask]
non_hireable_users = users_df[~hireable_mask]

# The mean of the boolean has_email results is the share of users with an email
hireable_with_email_fraction = hireable_users['email'].apply(has_email).mean()
non_hireable_with_email_fraction = non_hireable_users['email'].apply(has_email).mean()

# Hireable share minus the share for the rest, to 3 decimals
email_fraction_difference = round(
    hireable_with_email_fraction - non_hireable_with_email_fraction,
    3,
)
email_fraction_difference


0.128

# Q16 Let's assume that the last word in a user's name is their surname (ignore missing names, trim and split by whitespace.) What's the most common surname? (If there's a tie, list them all, comma-separated, alphabetically)

Most common surname(s)

In [42]:
# Tally surnames, taking the last whitespace-separated word of each non-blank name
with open('users.csv', 'r', encoding='utf-8') as csv_handle:
    trimmed_names = (
        record.get('name', '').strip()
        for record in csv.DictReader(csv_handle)
    )
    # Blank names are skipped, so split() always yields at least one word
    surname_counter = Counter(name.split()[-1] for name in trimmed_names if name)

if not surname_counter:
    print("No names found.")
else:
    # Highest frequency seen for any surname
    max_count = max(surname_counter.values())
    # Every surname tied at that frequency, in alphabetical order
    most_common_surnames = sorted(
        surname for surname, count in surname_counter.items() if count == max_count
    )
    print(','.join(most_common_surnames))

Kato,Tanaka
