In [None]:
pip install PyGithub

Collecting PyGithub
  Downloading PyGithub-2.4.0-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from PyGithub)
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.4.0-py3-none-any.whl (362 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m362.6/362.6 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, PyGithub
Successfully installed PyGithub-2.4.0 pynacl-1.5.0


In [None]:
pip install requests



Scraper script

In [None]:
import csv
import os
import time

import requests
from github import Github

# GitHub API token.
# Read from the GITHUB_TOKEN environment variable so the real secret never has
# to be pasted into (and accidentally committed with) the notebook. The
# placeholder string is kept as a fallback for backward compatibility.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "github token")
# Authorization header sent with every raw REST request made via `requests`.
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"}

# Helper function to clean up company names
def clean_company_name(company):
    """Normalise a company string so equal employers compare equal.

    Surrounding whitespace is trimmed, leading ``@`` characters are removed
    and the remainder is upper-cased. Falsy inputs (``None`` or an empty
    string) are handed back untouched.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

# Function to fetch users from the GitHub API
def fetch_users(city="Melbourne", min_followers=100):
    """Collect profile details for GitHub users in a city above a follower threshold.

    Pages through the GitHub user-search endpoint (100 hits per page) and, for
    every hit, requests the full profile from the URL given in the hit.

    Args:
        city: Value for the ``location:`` search qualifier. It is wrapped in
            double quotes when it contains whitespace so multi-word places
            form a single qualifier.
        min_followers: Only users with strictly more followers than this are
            matched (``followers:>N``).

    Returns:
        A list of dicts with the keys ``login``, ``name``, ``company``
        (normalised by ``clean_company_name``), ``location``, ``email``,
        ``hireable``, ``bio``, ``public_repos``, ``followers``, ``following``
        and ``created_at``.

    Raises:
        requests.HTTPError: If GitHub answers a request with an error status
            (bad token, rate limit, ...). Previously such a response either
            ended the loop silently or surfaced as a ``KeyError``.
    """
    per_page = 100
    max_results = 1000  # the search API only serves the first 1000 matches
    location = f'"{city}"' if any(ch.isspace() for ch in city) else city
    query = f"location:{location} followers:>{min_followers}"

    users = []
    page = 1

    while True:
        # Let requests build and escape the query string.
        response = requests.get(
            "https://api.github.com/search/users",
            headers=HEADERS,
            params={"q": query, "page": page, "per_page": per_page},
            timeout=30,
        )
        response.raise_for_status()
        items = response.json().get('items') or []

        for user in items:
            # The search hit is only a summary; fetch the full profile.
            user_response = requests.get(user['url'], headers=HEADERS, timeout=30)
            user_response.raise_for_status()
            user_data = user_response.json()

            # Extract required fields
            users.append({
                'login': user_data['login'],
                'name': user_data['name'],
                'company': clean_company_name(user_data['company']),
                'location': user_data['location'],
                'email': user_data['email'],
                'hireable': user_data['hireable'],
                'bio': user_data['bio'],
                'public_repos': user_data['public_repos'],
                'followers': user_data['followers'],
                'following': user_data['following'],
                'created_at': user_data['created_at'],
            })

        # A short page is the last page; also stop at the search cap instead
        # of requesting a page GitHub would reject.
        if len(items) < per_page or page * per_page >= max_results:
            break

        page += 1
        time.sleep(1)  # Avoid hitting API rate limits

    return users

# Function to fetch repositories for a user
def fetch_repositories(user_login):
    """Fetch every repository listed for a user, following pagination.

    Args:
        user_login: GitHub login whose ``/users/<login>/repos`` listing is read.

    Returns:
        A list of dicts with the keys ``login``, ``full_name``, ``created_at``,
        ``stargazers_count``, ``watchers_count``, ``language``,
        ``has_projects``, ``has_wiki`` and ``license_name`` (the license's
        ``key``, or ``None`` when the repository has no license).

    Raises:
        requests.HTTPError: If GitHub answers with an error status. An error
            payload is a JSON object rather than a list, which previously
            caused an opaque ``TypeError`` while iterating it.
    """
    per_page = 100
    url = f"https://api.github.com/users/{user_login}/repos"
    repositories = []
    page = 1

    while True:
        response = requests.get(
            url,
            headers=HEADERS,
            params={"per_page": per_page, "page": page},
            timeout=30,
        )
        response.raise_for_status()
        repo_data = response.json()

        # Break if no more repositories
        if not repo_data:
            break

        for repo in repo_data:
            repositories.append({
                'login': user_login,
                'full_name': repo['full_name'],
                'created_at': repo['created_at'],
                'stargazers_count': repo['stargazers_count'],
                'watchers_count': repo['watchers_count'],
                'language': repo['language'],
                'has_projects': repo['has_projects'],
                'has_wiki': repo['has_wiki'],
                'license_name': repo['license']['key'] if repo['license'] else None,
            })

        # A short page means this was the last one; skip the extra request.
        if len(repo_data) < per_page:
            break

        page += 1  # Move to the next page
        time.sleep(1)  # Avoid hitting API rate limits

    return repositories

# Save users to CSV
def save_users_to_csv(users, filename="users.csv"):
    """Write user records to a UTF-8 CSV file.

    Args:
        users: List of dicts that share the same keys. The keys of the first
            record become the header row.
        filename: Destination path; an existing file is overwritten.

    An empty ``users`` list produces an empty file instead of raising
    ``IndexError`` on ``users[0]``.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not users:
            # No record to derive the header from; leave the file empty.
            return
        writer = csv.DictWriter(file, fieldnames=list(users[0].keys()))
        writer.writeheader()
        writer.writerows(users)

# Save repositories to CSV
def save_repositories_to_csv(repositories, filename="repositories.csv"):
    """Write repository records to a UTF-8 CSV file.

    Args:
        repositories: List of dicts that share the same keys. The keys of the
            first record become the header row.
        filename: Destination path; an existing file is overwritten.

    An empty ``repositories`` list produces an empty file instead of raising
    ``IndexError`` on ``repositories[0]``.
    """
    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        if not repositories:
            # No record to derive the header from; leave the file empty.
            return
        writer = csv.DictWriter(file, fieldnames=list(repositories[0].keys()))
        writer.writeheader()
        writer.writerows(repositories)

# Create README.md
def create_readme():
    """Write the project's README.md into the current working directory."""
    readme_text = '''
* This project scrapes GitHub users in Melbourne with over 100 followers and their repositories.
* The most interesting fact found was the diversity of programming languages used.
* Developers should consider contributing to open-source projects to increase their visibility.

# GitHub Users and Repositories in Melbourne

This project uses the GitHub API to scrape users in Melbourne with over 100 followers and their repositories. The data is saved in `users.csv` and `repositories.csv`.

## Files

- `users.csv`: Contains information about each user.
- `repositories.csv`: Contains information about each user's repositories.

## How to Run

1. Set up a GitHub personal access token.
2. Run the script to fetch data and generate CSV files.
3. Upload the files to a GitHub repository.
'''
    with open('README.md', 'w') as readme:
        readme.write(readme_text)

# Create a new GitHub repository and upload files
def create_github_repo():
    """Create the ``MELB_USERS`` repository and upload the generated files.

    Authenticates with ``GITHUB_TOKEN``, creates the repository under the
    authenticated user and commits ``users.csv``, ``repositories.csv`` and
    ``README.md`` from the current working directory, one commit per file.

    The files are read as UTF-8, matching how the CSV writers in this script
    encode them. Relying on the platform default encoding could fail or
    corrupt non-ASCII names and bios on systems whose default is not UTF-8.
    """
    g = Github(GITHUB_TOKEN)
    user = g.get_user()
    repo = user.create_repo("MELB_USERS")

    # Upload files to the repository
    for path in ('users.csv', 'repositories.csv', 'README.md'):
        with open(path, 'r', encoding='utf-8') as file:
            content = file.read()
        repo.create_file(path, 'Initial commit', content)

def main():
    """Run the whole pipeline: scrape users and repos, write CSVs and README, publish."""
    # Stage 1: users
    print("Fetching users...")
    users = fetch_users()
    save_users_to_csv(users)
    print(f"Saved {len(users)} users to users.csv")

    # Stage 2: repositories of every collected user
    print("Fetching repositories...")
    all_repositories = []
    for login in (entry["login"] for entry in users):
        repos_for_login = fetch_repositories(login)
        all_repositories += repos_for_login
        print(f"Fetched {len(repos_for_login)} repositories for user {login}")

    save_repositories_to_csv(all_repositories)
    print(f"Saved {len(all_repositories)} repositories to repositories.csv")

    # Stage 3: documentation and upload
    create_readme()
    create_github_repo()
    print("Created GitHub repository MELB_USERS and uploaded files.")

if __name__ == "__main__":
    main()

Fetching users...
Saved 331 users to users.csv
Fetching repositories...
Fetched 27 repositories for user mosh-hamedani
Fetched 35 repositories for user TheCherno
Fetched 372 repositories for user haileys
Fetched 562 repositories for user rstacruz
Fetched 60 repositories for user jesseduffield
Fetched 442 repositories for user basarat
Fetched 240 repositories for user markdalgleish
Fetched 78 repositories for user JakeLin
Fetched 54 repositories for user binarythistle
Fetched 229 repositories for user radar
Fetched 124 repositories for user mishmanners
Fetched 50 repositories for user timacdonald
Fetched 2 repositories for user TuPayChain
Fetched 156 repositories for user md-5
Fetched 171 repositories for user thomasdavis
Fetched 249 repositories for user geelen
Fetched 46 repositories for user MugunthKumar
Fetched 122 repositories for user tseemann
Fetched 316 repositories for user loftwah
Fetched 14 repositories for user rogerclarkmelbourne
Fetched 185 repositories for user jxom
Fetch

In [None]:
#Q1
import pandas as pd

# Users table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Rank by follower count, largest first, and keep the five leading rows
top_5_users = users_df.sort_values(by='followers', ascending=False).iloc[:5]

# Logins joined with commas, in rank order
top_5_logins = ','.join(top_5_users['login'])

print(f"Top 5 users in Melbourne with the highest number of followers: {top_5_logins}")

Top 5 users in Melbourne with the highest number of followers: mosh-hamedani,TheCherno,haileys,rstacruz,jesseduffield


In [None]:
#q2
import pandas as pd

# Users table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Parse registration timestamps so the ordering is chronological
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Oldest accounts first; keep the first five
earliest_5_users = users_df.sort_values(by='created_at').iloc[:5]

# Logins joined with commas, oldest first
earliest_5_logins = ','.join(earliest_5_users['login'])

print(f"5 earliest registered GitHub users in Melbourne: {earliest_5_logins}")

5 earliest registered GitHub users in Melbourne: toolmantim,crafterm,dgoodlad,Sutto,mdub


In [None]:
#q3
import pandas as pd

# Repositories table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/repositories.csv'
repos_df = pd.read_csv(url)

# Quick look at the raw table
print("First few rows of the DataFrame:")
print(repos_df.head())

print("\nColumns in the DataFrame:")
print(repos_df.columns)

# How many repositories carry no license at all
print("\nMissing values in 'license_name' column:")
print(repos_df['license_name'].isna().sum())

# Keep only repositories that declare a license
has_license = repos_df['license_name'].notna()
repos_df = repos_df.loc[has_license]

print("\nFirst few rows after filtering out missing licenses:")
print(repos_df.head())

# Frequency of each license key, most common first
license_counts = repos_df['license_name'].value_counts()

print("\nLicense counts:")
print(license_counts)

# Three most frequent license keys
top_3_licenses = license_counts.iloc[:3]
top_3_license_names = ','.join(top_3_licenses.index)

print(f"\n3 most popular licenses among these users: {top_3_license_names}")


First few rows of the DataFrame:
           login                          full_name            created_at  \
0  mosh-hamedani    mosh-hamedani/angular-education  2017-08-10T05:30:30Z   
1  mosh-hamedani      mosh-hamedani/angular2-course  2016-02-15T22:59:37Z   
2  mosh-hamedani         mosh-hamedani/angularfire2  2016-09-06T00:06:45Z   
3  mosh-hamedani  mosh-hamedani/AngularJS2-Learning  2017-08-10T05:41:03Z   
4  mosh-hamedani      mosh-hamedani/awesome-angular  2017-08-10T05:45:49Z   

   stargazers_count  watchers_count    language  has_projects  has_wiki  \
0                22              22         NaN          True      True   
1               130             130  TypeScript          True      True   
2                15              15  TypeScript          True      True   
3                28              28         NaN          True      True   
4                35              35        HTML          True      True   

  license_name  
0          NaN  
1          NaN  
2 

In [None]:
#q4
import pandas as pd

# Users table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

def clean_company(company):
    """Return the company trimmed, without leading '@' and upper-cased ('' when missing)."""
    return '' if pd.isna(company) else company.strip().lstrip('@').upper()

# Normalise every company value in place
users_df['company'] = users_df['company'].map(clean_company)

# Users that actually state a company
filtered_users_df = users_df[users_df['company'].ne('')]

# Head-count per company
company_counts = filtered_users_df['company'].value_counts()

# Report the company with the largest head-count
if company_counts.empty:
    print("No non-empty company names found.")
else:
    most_common_company = company_counts.idxmax()
    most_common_company_count = company_counts.max()
    print(f"The majority of these developers work at: {most_common_company} with {most_common_company_count} developers")

The majority of these developers work at: MONASH UNIVERSITY with 8 developers


In [None]:
#q5
import pandas as pd

# Repositories table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/repositories.csv'
repos_df = pd.read_csv(url)

# Quick look at the raw table
print("First few rows of the DataFrame:")
print(repos_df.head())

# Repositories without a detected language cannot be counted
repos_df = repos_df.dropna(subset=['language'])

# Repositories per language, most common first
language_counts = repos_df['language'].value_counts()

print("\nLanguage counts:")
print(language_counts)

# Report the language with the most repositories
if language_counts.empty:
    print("\nNo programming languages found.")
else:
    most_popular_language = language_counts.idxmax()
    most_popular_language_count = language_counts.max()
    print(f"\nThe most popular programming language among these users is: {most_popular_language} with {most_popular_language_count} repositories")

First few rows of the DataFrame:
           login                          full_name            created_at  \
0  mosh-hamedani    mosh-hamedani/angular-education  2017-08-10T05:30:30Z   
1  mosh-hamedani      mosh-hamedani/angular2-course  2016-02-15T22:59:37Z   
2  mosh-hamedani         mosh-hamedani/angularfire2  2016-09-06T00:06:45Z   
3  mosh-hamedani  mosh-hamedani/AngularJS2-Learning  2017-08-10T05:41:03Z   
4  mosh-hamedani      mosh-hamedani/awesome-angular  2017-08-10T05:45:49Z   

   stargazers_count  watchers_count    language  has_projects  has_wiki  \
0                22              22         NaN          True      True   
1               130             130  TypeScript          True      True   
2                15              15  TypeScript          True      True   
3                28              28         NaN          True      True   
4                35              35        HTML          True      True   

  license_name  
0          NaN  
1          NaN  
2 

In [None]:
#q6
import pandas as pd

# Load the data from the CSV files in the GitHub repository
users_url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
repos_url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/repositories.csv'
users_df = pd.read_csv(users_url)
repos_df = pd.read_csv(repos_url)

# Convert the 'created_at' column to datetime format
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Filter users who joined after 2020, i.e. from 1 Jan 2021 onwards.
# Comparing against '2020-12-31' (midnight) would also admit everyone who
# registered during 31 Dec 2020, so compare the calendar year instead.
users_after_2020 = users_df[users_df['created_at'].dt.year > 2020]

# Get the logins of these users
logins_after_2020 = users_after_2020['login'].tolist()

# Filter repositories by these users
repos_after_2020 = repos_df[repos_df['login'].isin(logins_after_2020)]

# Filter out missing languages
repos_after_2020 = repos_after_2020[repos_after_2020['language'].notna()]

# Count the occurrences of each language (sorted most common first)
language_counts = repos_after_2020['language'].value_counts()

# Check the language counts to ensure they are calculated correctly
print("\nLanguage counts:")
print(language_counts)

# Find the second most popular language: position 1 of the sorted counts
if len(language_counts) > 1:
    second_most_popular_language = language_counts.index[1]
    second_most_popular_language_count = language_counts.iloc[1]
    print(f"\nThe second most popular programming language among users who joined after 2020 is: {second_most_popular_language} with {second_most_popular_language_count} repositories")
else:
    print("\nNot enough data to determine the second most popular programming language.")


Language counts:
language
Python              35
JavaScript          18
HTML                 6
Astro                5
C++                  4
C#                   4
TypeScript           4
Java                 3
CSS                  2
Rust                 2
Vue                  2
MATLAB               1
SCSS                 1
Matlab               1
PHP                  1
Ruby                 1
Kotlin               1
Makefile             1
Go                   1
Shell                1
ShaderLab            1
Jupyter Notebook     1
Name: count, dtype: int64

The second most popular programming language among users who joined after 2020 is: JavaScript with 18 repositories


In [None]:
#q7
import pandas as pd

# Repositories table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/repositories.csv'
repos_df = pd.read_csv(url)

# Repositories without a detected language are left out of the averages
repos_df = repos_df.dropna(subset=['language'])

# Mean star count for each language
average_stars_per_language = repos_df.groupby('language')['stargazers_count'].mean()

# Report the language whose repositories average the most stars
if average_stars_per_language.empty:
    print("No programming languages found.")
else:
    highest_avg_stars_language = average_stars_per_language.idxmax()
    highest_avg_stars_value = average_stars_per_language.max()
    print(f"The language with the highest average number of stars per repository is: {highest_avg_stars_language} with an average of {highest_avg_stars_value:.2f} stars")

The language with the highest average number of stars per repository is: D with an average of 2521.50 stars


In [None]:
#q8
import pandas as pd

# Load the data from the CSV file in the GitHub repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Calculate leader_strength as followers / (1 + following)
users_df['leader_strength'] = users_df['followers'] / (1 + users_df['following'])

# Sort the DataFrame by leader_strength in descending order
sorted_users_df = users_df.sort_values(by='leader_strength', ascending=False)

# Extract the top 5 users
top_5_users = sorted_users_df.head(5)

# List their logins in a comma-separated format
top_5_logins = ','.join(top_5_users['login'].tolist())

print(f"Top 5 users in terms of leader_strength: {top_5_logins}")

Top 5 users in terms of leader_strength: mosh-hamedani,binarythistle,TheCherno,TuPayChain,rogerclarkmelbourne


In [None]:

#q9
import pandas as pd

# Users table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Report how many values are missing in each column of interest
for column in ('followers', 'public_repos'):
    print(f"Missing values in '{column}' column:", users_df[column].isna().sum())

# Rows lacking either value cannot take part in the correlation
users_df = users_df.dropna(subset=['followers', 'public_repos'])

# Quick look at the two columns that will be correlated
print("\nFirst few rows of the DataFrame after dropping missing values:")
print(users_df[['followers', 'public_repos']].head())

# Correlation of followers with public repository count (Series.corr default method)
followers = users_df['followers']
public_repos = users_df['public_repos']
correlation = followers.corr(public_repos)

# Three decimal places
print(f"\nCorrelation between followers and public repositories: {correlation:.3f}")

Missing values in 'followers' column: 0
Missing values in 'public_repos' column: 0

First few rows of the DataFrame after dropping missing values:
   followers  public_repos
0      12786            27
1      11234            35
2       9391           372
3       6435           562
4       4578            60

Correlation between followers and public repositories: 0.187


In [None]:
#q10
import pandas as pd
from sklearn.linear_model import LinearRegression

# Users table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Report how many values are missing in each column of interest
for column in ('followers', 'public_repos'):
    print(f"Missing values in '{column}' column:", users_df[column].isna().sum())

# Rows lacking either value cannot be used for the fit
users_df = users_df.dropna(subset=['followers', 'public_repos'])

# Predictor must be 2-D (one column); the target is 1-D
X = users_df[['public_repos']].to_numpy()
y = users_df['followers'].to_numpy()

# Fit followers as a linear function of public_repos
model = LinearRegression()
model.fit(X, y)

# Single predictor, so the slope is the only coefficient
slope = model.coef_[0]

# Three decimal places
print(f"Regression slope of followers on repos: {slope:.3f}")

Missing values in 'followers' column: 0
Missing values in 'public_repos' column: 0
Regression slope of followers on repos: 2.239


In [None]:
#q11
import pandas as pd

# Repositories table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/repositories.csv'
repos_df = pd.read_csv(url)

# Coerce both feature flags to boolean dtype before correlating them
for flag in ('has_projects', 'has_wiki'):
    repos_df[flag] = repos_df[flag].astype(bool)

# Correlation between the two flags
projects_enabled = repos_df['has_projects']
wiki_enabled = repos_df['has_wiki']
correlation = projects_enabled.corr(wiki_enabled)

# Three decimal places
print(f"Correlation between having projects enabled and having wiki enabled: {correlation:.3f}")

Correlation between having projects enabled and having wiki enabled: 0.378


In [None]:
#q12
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv')

# Normalise 'hireable' to a strict boolean mask. After a CSV round trip the
# column may hold booleans, strings or only missing values, so an `== True`
# comparison is fragile; compare the lower-cased text form instead.
# Anything that is not "true" (including missing) counts as non-hireable.
is_hireable = users_df['hireable'].astype(str).str.strip().str.lower().eq('true')

# The two groups are exact complements of each other
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[~is_hireable]

if hireable_users.empty or non_hireable_users.empty:
    # The mean of an empty group is NaN; say why instead of printing 'nan'.
    difference = float('nan')
    print(f'Cannot compare groups: {len(hireable_users)} hireable and {len(non_hireable_users)} non-hireable users')
else:
    # Calculate average following for both groups
    average_hireable_following = hireable_users['following'].mean()
    average_non_hireable_following = non_hireable_users['following'].mean()

    # Calculate the difference
    difference = average_hireable_following - average_non_hireable_following

    # Print the result rounded to three decimal places
    print(f'Difference in average following (hireable - non-hireable): {difference:.3f}')

Difference in average following: nan


In [None]:
#q13
import pandas as pd

# Load the data from the CSV file in the GitHub repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Filter out users without bios. Take an explicit copy so that adding a
# column below writes to an independent frame and does not trigger pandas'
# SettingWithCopyWarning.
users_with_bios = users_df[users_df['bio'].notna() & (users_df['bio'] != '')].copy()

# Length of each bio in whitespace-separated words
users_with_bios['bio_length'] = users_with_bios['bio'].str.split().str.len()

# Calculate the correlation between the length of the bio and the number of followers
correlation = users_with_bios['bio_length'].corr(users_with_bios['followers'])

# Print the correlation value rounded to three decimal places
print(f"Correlation between bio length and followers: {correlation:.3f}")

Correlation between bio length and followers: 0.048


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_bios['bio_length'] = users_with_bios['bio'].apply(lambda x: len(x.split()))


In [None]:
#q14
import pandas as pd

# Repositories table published in the MELB_USERS repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/repositories.csv'
repos_df = pd.read_csv(url)

# Parse creation timestamps so the weekday can be derived
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'])

# Monday is 0 ... Sunday is 6, so 5 and above means Saturday or Sunday
repos_df['weekday'] = repos_df['created_at'].dt.weekday
is_weekend = repos_df['weekday'] >= 5
weekend_repos = repos_df.loc[is_weekend]

# Weekend-created repositories per owner, most prolific first
weekend_repo_counts = weekend_repos['login'].value_counts()

# Five most prolific weekend creators
top_5_users = weekend_repo_counts.iloc[:5]

# Logins joined with commas, in rank order
top_5_logins = ','.join(top_5_users.index)

print(f"Top 5 users who created the most repositories on weekends: {top_5_logins}")

Top 5 users who created the most repositories on weekends: roachhd,wolfeidau,karkranikhil,rstacruz,plutext


In [None]:
#q15
import pandas as pd

# Load the users data from the CSV file
users_df = pd.read_csv('https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv')

# Normalise 'hireable' to a strict boolean mask. After a CSV round trip the
# column may hold booleans, strings or only missing values, so an `== True`
# comparison is fragile; compare the lower-cased text form instead.
# Anything that is not "true" (including missing) counts as non-hireable.
is_hireable = users_df['hireable'].astype(str).str.strip().str.lower().eq('true')

# The two groups are exact complements of each other
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[~is_hireable]

if hireable_users.empty or non_hireable_users.empty:
    # The mean of an empty group is NaN; say why instead of printing 'nan'.
    difference = float('nan')
    print(f'Cannot compare groups: {len(hireable_users)} hireable and {len(non_hireable_users)} non-hireable users')
else:
    # Fraction of users with a public email in each group
    fraction_hireable_with_email = hireable_users['email'].notna().mean()
    fraction_non_hireable_with_email = non_hireable_users['email'].notna().mean()

    # Calculate the difference (hireable minus non-hireable)
    difference = fraction_hireable_with_email - fraction_non_hireable_with_email

    # Print the result rounded to three decimal places
    print(f'Difference in fraction of users with email: {difference:.3f}')

Difference in fraction of users with email: nan


In [None]:
#q16
import pandas as pd

# Load the data from the CSV file in the GitHub repository
url = 'https://raw.githubusercontent.com/AlexStark110/MELB_USERS/refs/heads/main/users.csv'
users_df = pd.read_csv(url)

# Filter out users without names. Whitespace-only names are treated as
# missing too (they have no last word to extract), and an explicit copy is
# taken so that adding a column below does not trigger pandas'
# SettingWithCopyWarning.
has_name = users_df['name'].fillna('').str.strip() != ''
users_with_names = users_df[has_name].copy()

# Extract the last whitespace-separated word from each name as the surname
users_with_names['surname'] = users_with_names['name'].str.split().str[-1]

# Count the occurrences of each surname
surname_counts = users_with_names['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Find the most common surname(s); ties are all reported
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Sort the most common surnames alphabetically
most_common_surnames.sort()

# Print the most common surname(s) in a comma-separated format
print(f"Most common surname(s): {', '.join(most_common_surnames)}")

Most common surname(s): Jackson, Wang


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].apply(lambda x: x.strip().split()[-1])
