In [None]:
import os
import time

import pandas as pd
import requests

# GitHub API credentials.
# The token is read from the GITHUB_TOKEN environment variable so that the secret
# never lives in the notebook itself. Any token that was ever pasted into a shared
# notebook must be treated as compromised and revoked.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
# Without a token no Authorization header is sent at all (unauthenticated access),
# instead of sending a malformed "token " header.
HEADERS = {'Authorization': f'token {GITHUB_TOKEN}'} if GITHUB_TOKEN else {}


In [None]:
# Fetch users in Hyderabad with more than 50 followers
def fetch_users():
    """Collect profile details for GitHub users in Hyderabad with more than 50 followers.

    Pages through the GitHub user-search endpoint (100 results per page) and
    looks up every hit with ``fetch_user_details``; hits whose details cannot
    be fetched are skipped.

    Returns:
        pandas.DataFrame: one row per user whose details were fetched.
    """
    users_data = []
    page = 1

    while True:
        url = f'https://api.github.com/search/users?q=location:Hyderabad+followers:>50&page={page}&per_page=100'
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=HEADERS, timeout=30).json()

        if 'items' not in response:
            # Error payload (e.g. rate limit): report it so a truncated result
            # set does not go unnoticed, then stop paging.
            if isinstance(response, dict) and 'message' in response:
                print(f"Stopping user search on page {page}: {response['message']}")
            break

        if not response['items']:
            # Empty page: every match has been seen. Without this check the loop
            # would keep requesting pages until the API answers with an error.
            break

        for user in response['items']:
            user_details = fetch_user_details(user['login'])
            if user_details:
                users_data.append(user_details)
        page += 1
        time.sleep(2)  # Respect GitHub's rate limits
    return pd.DataFrame(users_data)

# Fetch individual user details
def fetch_user_details(username):
    """Look up one GitHub profile and flatten it into a plain dict.

    Args:
        username: GitHub login of the user to look up.

    Returns:
        dict with the profile fields written to users.csv (company passed
        through ``clean_company_name``), or None when the API response
        contains a 'message' key.
    """
    profile = requests.get(f'https://api.github.com/users/{username}', headers=HEADERS).json()

    # A 'message' key marks an API error payload rather than a profile.
    if 'message' in profile:
        return None

    record = {}
    # Text-like fields fall back to an empty string when the key is absent.
    for field in ('login', 'name', 'company', 'location', 'email', 'hireable', 'bio'):
        record[field] = profile.get(field, '')
    # Re-assigning keeps 'company' at its original position in the dict.
    record['company'] = clean_company_name(record['company'])
    # Counters fall back to zero when the key is absent.
    for field in ('public_repos', 'followers', 'following'):
        record[field] = profile.get(field, 0)
    record['created_at'] = profile.get('created_at', '')
    return record

# Clean the company names
def clean_company_name(company):
    """Normalise a company string: trim whitespace, drop leading '@', upper-case.

    Falsy values (None, empty string) are returned unchanged.
    """
    if not company:
        return company
    trimmed = company.strip()
    without_at = trimmed.lstrip('@')
    return without_at.upper()

# Run the user scrape and persist the resulting table as users.csv
users_csv_path = '/content/users.csv'  # Colab's file system
users_df = fetch_users()
users_df.to_csv(users_csv_path, index=False)
print("Users data saved to users.csv")


Users data saved to users.csv


In [None]:
# Fetch repositories for each user in users.csv
def fetch_repositories(username, max_pages=5):
    """Fetch the public repositories of one user from the GitHub API.

    Args:
        username: GitHub login whose repositories are listed.
        max_pages: maximum number of 100-item pages to request
            (default 5, i.e. at most 500 repositories).

    Returns:
        list[dict]: one record per repository with the columns written to
        repositories.csv. The list is empty (or partial) when the API returns
        an empty page or an error payload.
    """
    per_page = 100
    repos_data = []

    for page in range(1, max_pages + 1):
        url = f'https://api.github.com/users/{username}/repos?page={page}&per_page={per_page}'
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, headers=HEADERS, timeout=30).json()

        if not response or 'message' in response:
            break  # No more repositories or API limit hit

        for repo in response:
            # 'license' may be missing or null; treat both as "no license".
            license_info = repo.get('license') or {}
            repos_data.append({
                'login': username,
                'full_name': repo.get('full_name', ''),
                'created_at': repo.get('created_at', ''),
                'stargazers_count': repo.get('stargazers_count', 0),
                'watchers_count': repo.get('watchers_count', 0),
                'language': repo.get('language', ''),
                'has_projects': repo.get('has_projects', False),
                'has_wiki': repo.get('has_wiki', False),
                'license_name': license_info.get('key', '')
            })

        if len(response) < per_page:
            break  # Short page: this was the last one, skip the extra request
    return repos_data


# Collect the repositories of every user gathered in users_df
repositories_data = []
for login in users_df['login']:
    repositories_data += fetch_repositories(login)
    time.sleep(1)  # Pause between users to respect rate limits

# Persist the combined repository records as CSV
repos_df = pd.DataFrame(repositories_data)
repos_df.to_csv('/content/repositories.csv', index=False)
print("Repositories data saved to repositories.csv")


Repositories data saved to repositories.csv


In [None]:
# Text of the README that accompanies the two CSV files
readme_content = """
# Hyderabad GitHub Users Analysis

- **Data Collection**: Used GitHub API to scrape user profiles in Hyderabad with over 50 followers, retrieving user and repository data.
- **Interesting Finding**: Hyderabad GitHub users frequently contribute to open-source repositories in various languages, with many showing high follower counts, indicating active communities.
- **Recommendation**: Developers in Hyderabad can increase visibility by leveraging open-source contributions to gain followers and engage in active projects.

This project includes Python scripts and CSV files `users.csv` and `repositories.csv` for the gathered data.
"""

# Write the README next to the CSV files; the context manager closes the handle
with open('/content/README.md', 'w') as readme_file:
    readme_file.write(readme_content)

print("README.md created successfully.")


README.md created successfully.


#Q1

In [1]:
import pandas as pd

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

# Rank users by follower count (highest first) and keep the first five rows
top_users = users_df.sort_values('followers', ascending=False).iloc[:5]

# Comma-separated logins of those five users
top_user_logins = ', '.join(top_users['login'].tolist())

print("Top 5 users in Hyderabad with the highest number of followers:", top_user_logins)


Top 5 users in Hyderabad with the highest number of followers: iam-veeramalla, in28minutes, stacksimplify, thenaveensaggam, MadhavBahl


#Q2

In [None]:
import pandas as pd

# Read the user profiles and parse 'created_at' so that sorting is chronological
users_df = pd.read_csv('/content/users.csv').assign(
    created_at=lambda frame: pd.to_datetime(frame['created_at'])
)

# Oldest accounts first; keep the first five
earliest_users = users_df.sort_values('created_at', ascending=True).iloc[:5]

# Comma-separated logins of those five users
earliest_logins = ', '.join(earliest_users['login'].tolist())

print("Earliest registered GitHub users in Hyderabad:", earliest_logins)


Earliest registered GitHub users in Hyderabad: shabda, sitaramc, bagwanpankaj, srikanthlogic, kulbirsaini


#Q3

In [None]:
import pandas as pd

# Read the scraped repository records
repos_df = pd.read_csv('/content/repositories.csv')

# Keep only repositories whose license is neither missing nor an empty string
license_column = repos_df['license_name']
has_license = license_column.notna() & license_column.ne('')
repos_with_licenses = repos_df.loc[has_license]

# value_counts() sorts by frequency, so the first three entries are the most common
top_licenses = repos_with_licenses['license_name'].value_counts().iloc[:3]

# Comma-separated license identifiers
top_license_names = ', '.join(top_licenses.index)

print("Top 3 most popular licenses:", top_license_names)


Top 3 most popular licenses: mit, apache-2.0, other


#Q4

In [None]:
import pandas as pd

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

def clean_company_name(company):
    """Trim whitespace, drop leading '@' and upper-case; missing values pass through unchanged."""
    if pd.isna(company):
        return company
    return company.strip().lstrip('@').upper()

# Normalise every company entry
users_df['company'] = users_df['company'].apply(clean_company_name)

# Drop rows whose company is an empty string. Missing (NaN) companies pass this
# comparison, but value_counts() below ignores missing values by default.
has_company_text = users_df['company'].ne('')
users_with_company = users_df.loc[has_company_text]

# Company that appears most often
company_counts = users_with_company['company'].value_counts()
top_company = company_counts.idxmax()

print("The company with the most developers:", top_company)


The company with the most developers: IIIT HYDERABAD


#Q5

In [None]:
import pandas as pd

# Read the scraped repository records
repos_df = pd.read_csv('/content/repositories.csv')

# Keep only repositories whose language is neither missing nor an empty string
language_column = repos_df['language']
has_language = language_column.notna() & language_column.ne('')
repos_with_language = repos_df.loc[has_language]

# Language that appears most often
language_frequency = repos_with_language['language'].value_counts()
most_popular_language = language_frequency.idxmax()

print("The most popular programming language among these users:", most_popular_language)


The most popular programming language among these users: JavaScript


#Q6

In [None]:
import pandas as pd

# Read both scraped tables
users_df = pd.read_csv('/content/users.csv')
repos_df = pd.read_csv('/content/repositories.csv')

# Parse the account creation time so it can be compared against a date
users_df['created_at'] = pd.to_datetime(users_df['created_at'])

# Users whose account was created later than 2020-01-01
joined_recently = users_df['created_at'] > '2020-01-01'
recent_users = users_df[joined_recently]

# Repositories that belong to those users
recent_user_logins = recent_users['login'].unique()
owned_by_recent_user = repos_df['login'].isin(recent_user_logins)
recent_repos = repos_df[owned_by_recent_user]

# Keep only repositories whose language is neither missing nor an empty string
has_language = recent_repos['language'].notna() & recent_repos['language'].ne('')
recent_repos_with_language = recent_repos[has_language]

# Frequency of each language, most common first
language_counts = recent_repos_with_language['language'].value_counts()

# Runner-up language, or None when fewer than two languages are present
if len(language_counts) > 1:
    second_most_popular_language = language_counts.index[1]
else:
    second_most_popular_language = None

print("The second most popular programming language among users who joined after 2020:", second_most_popular_language)


The second most popular programming language among users who joined after 2020: HTML


#Q7

In [None]:
import pandas as pd

# Read the scraped repository records
repos_df = pd.read_csv('/content/repositories.csv')

# Keep rows that have a non-empty language and a star count
language_column = repos_df['language']
usable_rows = (
    language_column.notna()
    & language_column.ne('')
    & repos_df['stargazers_count'].notna()
)
filtered_repos = repos_df.loc[usable_rows]

# Mean star count per language
stars_by_language = filtered_repos.groupby('language')['stargazers_count']
average_stars_per_language = stars_by_language.mean()

# Language with the largest mean, and that mean
highest_avg_stars_language = average_stars_per_language.idxmax()
highest_avg_stars_value = average_stars_per_language.max()

print("The language with the highest average number of stars per repository is:", highest_avg_stars_language)
print("Average number of stars:", highest_avg_stars_value)


The language with the highest average number of stars per repository is: Perl
Average number of stars: 200.06976744186048


#Q8

In [None]:
import pandas as pd

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

# leader_strength = followers / (1 + following); the +1 avoids division by zero
following_plus_one = users_df['following'].add(1)
users_df['leader_strength'] = users_df['followers'].div(following_plus_one)

# Strongest leaders first; keep the first five
top_leaders = users_df.sort_values('leader_strength', ascending=False).iloc[:5]

# Comma-separated logins of those five users
top_leader_logins = ', '.join(top_leaders['login'].tolist())

print("Top 5 users in terms of leader_strength:", top_leader_logins)


Top 5 users in terms of leader_strength: in28minutes, iam-veeramalla, stacksimplify, ashokitschool, thenaveensaggam


#Q9

In [None]:
import pandas as pd

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

# Correlation (Series.corr default method) between follower and public-repo counts
follower_counts = users_df['followers']
public_repo_counts = users_df['public_repos']
correlation = follower_counts.corr(public_repo_counts)

print("Correlation between the number of followers and the number of public repositories:", correlation)


Correlation between the number of followers and the number of public repositories: 0.0062097018017158645


#Q10

In [None]:
import pandas as pd
import statsmodels.api as sm

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

# Response: followers. Predictor: public_repos, plus a constant column for the intercept.
y = users_df['followers']
X = sm.add_constant(users_df['public_repos'])

# Ordinary least squares fit of followers on public_repos
model = sm.OLS(y, X).fit()

# Full regression report
print(model.summary())

# Slope of 'public_repos': estimated change in followers per additional public repository
additional_followers_per_repo = model.params.loc['public_repos']
print("Estimated additional followers per additional public repository:", additional_followers_per_repo)


                            OLS Regression Results                            
Dep. Variable:              followers   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                   0.01932
Date:                Wed, 30 Oct 2024   Prob (F-statistic):              0.890
Time:                        13:37:18   Log-Likelihood:                -4179.8
No. Observations:                 503   AIC:                             8364.
Df Residuals:                     501   BIC:                             8372.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          201.6671     56.681      3.558   

#Q11

In [None]:
#Q11
import pandas as pd

# Read the scraped repository records
repositories_df = pd.read_csv('/content/repositories.csv')

# Turn the two boolean flags into 0/1 integers so they can be correlated
projects_enabled = repositories_df['has_projects'].astype(int)
wiki_enabled = repositories_df['has_wiki'].astype(int)

# Correlation between the two flags
correlation = projects_enabled.corr(wiki_enabled)

print(f"The correlation between having projects enabled and having a wiki enabled is: {correlation:.3f}")


The correlation between having projects enabled and having a wiki enabled is: 0.173


#Q12

In [4]:
import pandas as pd

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

# Split on the hireable flag: exactly True versus everything else
# (so users with a missing flag land in the non-hireable group)
is_hireable = users_df['hireable'] == True
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[~is_hireable]

# Mean number of accounts followed in each group
avg_following_hireable = hireable_users['following'].mean()
avg_following_non_hireable = non_hireable_users['following'].mean()

# Hireable minus non-hireable, rounded to 3 decimal places
difference = round(avg_following_hireable - avg_following_non_hireable, 3)

print("Difference in average following (hireable - non-hireable):", difference)


Difference in average following (hireable - non-hireable): 33.681


#Q13

In [8]:
import pandas as pd
import statsmodels.api as sm

# Path of the scraped user profiles (relative to the current working directory)
csv_file = 'users.csv'  # Ensure this path is correct

# Load the CSV into a DataFrame
df = pd.read_csv(csv_file)

# Quick look at the first rows and the column dtypes
print("DataFrame Overview:")
print(df.head())
print("\nDataFrame Info:")
df.info()  # info() prints its report itself and returns None, so it is not wrapped in print()

# Keep only users that have a bio. The explicit copy makes the filtered frame
# independent, so adding a column below does not raise SettingWithCopyWarning.
df = df[df['bio'].notnull()].copy()

# Length of each bio in whitespace-separated words
df['bio_word_count'] = df['bio'].str.split().str.len()

# Independent variable (X) and dependent variable (y)
X = df['bio_word_count']
y = df['followers']

# Add a constant column so the model has an intercept
X = sm.add_constant(X)
# Ordinary least squares fit of followers on bio word count
model = sm.OLS(y, X).fit()

# Slope: coefficient of bio_word_count
slope = model.params['bio_word_count']

# Print the regression slope rounded to three decimal places
print(f"\nRegression slope of followers on bio word count: {slope:.3f}")

DataFrame Overview:
             login                 name                   company  \
0   iam-veeramalla  Abhishek Veeramalla                   RED HAT   
1      in28minutes                  NaN               IN28MINUTES   
2    stacksimplify        STACKSIMPLIFY             STACKSIMPLIFY   
3  thenaveensaggam        NAVEEN SAGGAM  HTTPS://WWW.UIBRAINS.COM   
4       MadhavBahl          MADHAV BAHL                 MICROSOFT   

           location                      email hireable  \
0  Hyderabad, India                        NaN      NaN   
1  Hyderabad, India      in28minutes@gmail.com     True   
2         Hyderabad    stacksimplify@gmail.com      NaN   
3         Hyderabad  thenaveensaggam@gmail.com      NaN   
4  Hyderabad, India     madhavbahl10@gmail.com     True   

                                                 bio  public_repos  followers  \
0  Keep learning, sharing and growing || Principa...            45      16216   
1  Helping 1 Million Learners learn Programming,

#Q14

In [None]:
import pandas as pd

# Read the scraped repository records
repos_df = pd.read_csv('/content/repositories.csv')

# Parse creation timestamps as UTC
repos_df['created_at'] = pd.to_datetime(repos_df['created_at'], utc=True)

# Weekday number of each creation date (Monday=0 ... Sunday=6)
repos_df['day_of_week'] = repos_df['created_at'].dt.dayofweek

# Saturday (5) and Sunday (6) count as the weekend
created_on_weekend = repos_df['day_of_week'].ge(5)
weekend_repos = repos_df[created_on_weekend]

# Number of weekend-created repositories per user, most prolific first
weekend_repos_count = weekend_repos['login'].value_counts()

# Logins of the five most prolific weekend creators
top_5_weekend_creators = list(weekend_repos_count.index[:5])

# Comma-separated output
top_5_weekend_creators_str = ', '.join(top_5_weekend_creators)
print("Top 5 users who created the most repositories on weekends:", top_5_weekend_creators_str)


Top 5 users who created the most repositories on weekends: hemanth22, anjijava16, wahidKhan74, elevenpassin, Shekharrajak


#Q15

In [7]:
import pandas as pd

# Read the scraped user profiles
users_df = pd.read_csv('/content/users.csv')

# Split on the hireable flag: exactly True versus everything else
is_hireable = users_df['hireable'] == True
hireable_users = users_df[is_hireable]
non_hireable_users = users_df[~is_hireable]

# Share of each group with an email on file; an empty group counts as 0.0
fraction_with_email_hireable = (
    hireable_users['email'].notna().mean() if len(hireable_users) > 0 else 0.0
)
fraction_with_email_non_hireable = (
    non_hireable_users['email'].notna().mean() if len(non_hireable_users) > 0 else 0.0
)

# Hireable minus non-hireable, rounded to 3 decimal places
difference = round(fraction_with_email_hireable - fraction_with_email_non_hireable, 3)

print("Difference in fraction of users with email (hireable - non-hireable):", difference)


Difference in fraction of users with email (hireable - non-hireable): 0.256


#Q16

In [6]:
import pandas as pd

# Load users.csv
users_df = pd.read_csv('/content/users.csv')

# Keep only users that have a name. The explicit copy makes the filtered frame
# independent of users_df, so adding the 'surname' column below does not raise
# SettingWithCopyWarning.
users_with_names = users_df[users_df['name'].notna()].copy()

# Last whitespace-separated word of each trimmed name (assumed to be the surname)
users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]

# Count occurrences of each surname
surname_counts = users_with_names['surname'].value_counts()

# Find the maximum count
max_count = surname_counts.max()

# Get all surnames with the maximum count (to handle ties)
most_common_surnames = surname_counts[surname_counts == max_count].index.tolist()

# Sort the surnames alphabetically and join with commas
most_common_surnames_str = ', '.join(sorted(most_common_surnames))
print("Most common surname(s):", most_common_surnames_str)


Most common surname(s): Kumar


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  users_with_names['surname'] = users_with_names['name'].str.strip().str.split().str[-1]
