In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import linregress

# Read the user and repository tables from CSV files in the working directory
# (edit the two file names below if the data lives somewhere else).
users, repos = map(pd.read_csv, ('users.csv', 'repository.csv'))

# Preview the leading rows of each table to see which columns are available.
print(users.head(), repos.head(), sep='\n')


In [None]:
# Keep users whose location mentions "Melbourne" (case-insensitive);
# rows with a missing location are treated as non-matching.
in_melbourne = users['location'].str.contains('Melbourne', case=False, na=False)
melbourne_users = users[in_melbourne]

# Logins of the five Melbourne users with the largest follower counts.
most_followed = melbourne_users.nlargest(5, 'followers')
top_5_melbourne = list(most_followed['login'])
print("Top 5 users in Melbourne with the highest followers:", ', '.join(top_5_melbourne))

In [None]:
# Order the Melbourne users by their `created_at` value, smallest first,
# and keep the logins of the first five.
# NOTE(review): `created_at` is sorted as stored; presumably ISO-8601 text — confirm.
oldest_first = melbourne_users.sort_values(by='created_at')
earliest_users = list(oldest_first['login'].iloc[:5])
print("5 earliest registered users in Melbourne:", ', '.join(earliest_users))

In [None]:
# Count repositories per license name (missing licenses are not counted)
# and keep the three most frequent names.
license_counts = repos['license_name'].value_counts()
top_licenses = list(license_counts.index[:3])
print("3 most popular licenses:", ', '.join(top_licenses))

In [None]:
# `mode()` returns every value tied for most frequent, in sorted order;
# the entry labelled 0 is the first of them.
company_modes = users['company'].mode()
most_common_company = company_modes.loc[0]
print("Most common company:", most_common_company)

In [None]:
# Most frequent repository language; on a tie, the first value returned by `mode()` wins.
language_modes = repos['language'].mode()
most_popular_language = language_modes.loc[0]
print("Most popular programming language:", most_popular_language)

In [None]:
# Users whose account creation year is later than 2020 (i.e. 2021 onwards).
users_after_2020 = users[pd.to_datetime(users['created_at']).dt.year > 2020]

# Restrict to repositories OWNED by those users. The previous version matched
# `repos['language']` against user logins, which compares language names to
# usernames and therefore selected (almost) no rows.
recent_user_repos = repos[repos['owner_login'].isin(users_after_2020['login'])]

# Languages ranked by repository count (missing languages are not counted).
recent_language_counts = recent_user_repos['language'].value_counts()

# Position 1 is the runner-up; fall back to None when fewer than two
# languages are present instead of raising IndexError.
second_popular_language = (
    recent_language_counts.index[1] if len(recent_language_counts) > 1 else None
)
print("Second most popular language:", second_popular_language)

In [None]:
# Mean stargazer count per language, then the language whose mean is largest.
mean_stars_by_language = repos.groupby('language')['stargazers_count'].mean()
avg_stars_language = mean_stars_by_language.idxmax()
print("Language with highest average stars:", avg_stars_language)

In [None]:
# Score each user as followers plus twice the number of accounts they follow.
# This adds a `follower_score` column to `users` in place.
users['follower_score'] = users['followers'].add(users['following'].mul(2))

# Logins of the five users with the largest score.
score_ranked = users.nlargest(5, 'follower_score')
top_5_follow_score = list(score_ranked['login'])
print("Top 5 users based on follower score:", ', '.join(top_5_follow_score))

In [None]:
# Build the 2x2 correlation matrix (pandas default method) and read the
# off-diagonal followers/public_repos entry by label.
follow_repo_corr = users[['followers', 'public_repos']].corr()
correlation_follow_repos = follow_repo_corr.loc['followers', 'public_repos']
print("Correlation between followers and public repos:", round(correlation_follow_repos, 3))

In [None]:
# Simple linear regression with public_repos as x and followers as y;
# only the fitted slope is reported.
repos_fit = linregress(users['public_repos'], users['followers'])
slope, intercept = repos_fit.slope, repos_fit.intercept
print("Regression slope of followers on repos:", round(slope, 3))

In [None]:
# Correlation matrix of the two repository flags; the off-diagonal entry is
# the projects/wiki correlation.
flag_corr = repos[['has_projects', 'has_wiki']].corr()
project_wiki_corr = flag_corr.loc['has_projects', 'has_wiki']
print("Correlation between projects and wiki enabled:", round(project_wiki_corr, 3))

In [None]:
# Split users into "explicitly hireable" and "everyone else".
# A missing `hireable` value never equals False, so comparing with `== False`
# silently dropped those users from the comparison group (and gave NaN when the
# column contains only True/blank). `.eq(True)` is False for missing values, so
# its complement covers both explicit False and blanks.
# NOTE(review): assumes a blank `hireable` means "not hireable" — confirm
# against the data source.
is_hireable = users['hireable'].eq(True)

avg_following_hireable = users.loc[is_hireable, 'following'].mean()
avg_following_non_hireable = users.loc[~is_hireable, 'following'].mean()

# Positive result: hireable users follow more accounts on average.
difference = round(avg_following_hireable - avg_following_non_hireable, 3)
print("Difference in average following (hireable - non-hireable):", difference)

In [None]:
# Whitespace-delimited word count of each bio; a missing bio counts as 0 words.
# This adds a `bio_word_count` column to `users` in place.
users['bio_word_count'] = [len(bio_text.split()) for bio_text in users['bio'].fillna('')]

# Regress followers (y) on bio word count (x) and report the slope only.
slope = linregress(users['bio_word_count'], users['followers']).slope
print("Regression slope of followers on bio word count:", round(slope, 3))

In [None]:
# Parse creation timestamps in place so the `.dt` accessor is available.
repos['created_at'] = pd.to_datetime(repos['created_at'])

# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday.
repos['is_weekend'] = repos['created_at'].dt.dayofweek.isin([5, 6])

# Count weekend-created repositories per owner and keep the five largest counts.
weekend_repos = repos[repos['is_weekend']]
weekend_repos_count = weekend_repos.groupby('owner_login').size()
top_5_weekend_users = list(weekend_repos_count.nlargest(5).index)
print("Top 5 users with most repos on weekends:", ', '.join(top_5_weekend_users))

In [None]:
# Surname = last whitespace-separated token of `name`; missing names stay missing.
# This adds a `last_name` column to `users` in place.
users['last_name'] = users['name'].str.split().str.get(-1)

# Most frequent surname (first of the sorted modes on a tie) and how many
# users carry it.
surname_modes = users['last_name'].mode()
common_surname = surname_modes.loc[0]
common_surname_count = (users['last_name'] == common_surname).sum()
print("Number of users with the most common surname:", common_surname_count)