# Data Scraping

In [None]:
import requests
import pandas as pd
import time
import logging
from typing import List, Dict, Optional

class GitHubScraper:
    """Collect GitHub user profiles and their repositories via the REST API.

    Requests are authenticated with a personal access token. Rate-limit
    responses are retried a bounded number of times; any other failure is
    logged and reported to the caller as ``None``.
    """

    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Maximum number of rate-limit waits for one request (prevents endless loops).
    MAX_RATE_LIMIT_RETRIES = 5
    # Pause used when a rate-limit response carries no usable timing header.
    DEFAULT_BACKOFF_SECONDS = 60
    # Page size used for every paginated endpoint.
    PER_PAGE = 100
    # The search API only exposes the first 1000 matches of any query.
    SEARCH_RESULT_CAP = 1000

    def __init__(self, token: str):
        """
        Initializes GitHub API access with the provided token.

        Args:
            token (str): GitHub Personal Access Token.
        """
        self.auth_headers = {
            'Authorization': f'Bearer {token}',
            'Accept': 'application/vnd.github.v3+json'
        }
        self.base_api_url = 'https://api.github.com'

        # Set up logging for process tracking
        logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
        self.logger = logging.getLogger(self.__class__.__name__)

    def _rate_limit_wait(self, response) -> Optional[int]:
        """
        Decide how long to pause after a 403/429 response.

        Args:
            response: The HTTP response object returned by ``requests``.

        Returns:
            int or None: Seconds to sleep before retrying, or None when the
            response does not look like a rate-limit rejection.
        """
        headers = response.headers

        # Secondary rate limits announce the pause directly.
        retry_after = headers.get('Retry-After')
        if retry_after is not None:
            try:
                return max(int(retry_after), 0) + 1
            except ValueError:
                return self.DEFAULT_BACKOFF_SECONDS

        # Primary rate limit: quota exhausted until the advertised reset time.
        if headers.get('X-RateLimit-Remaining') == '0':
            now = int(time.time())
            try:
                reset_time = int(headers.get('X-RateLimit-Reset', now))
            except ValueError:
                reset_time = now
            return max(reset_time - now, 0) + 1

        # Rate-limit message without timing headers: back off conservatively.
        if 'rate limit' in response.text.lower():
            return self.DEFAULT_BACKOFF_SECONDS

        # A plain "forbidden" (bad token, blocked resource) must not be retried.
        return None

    def request_data(self, endpoint: str, query_params: Optional[Dict] = None) -> Optional[Dict]:
        """
        Handle requests with retry logic for GitHub's rate limits.

        Args:
            endpoint (str): API endpoint URL.
            query_params (dict, optional): Parameters for GET request.

        Returns:
            dict or None: Parsed JSON response, or None on error (network
            failure, non-rate-limit HTTP error, or retries exhausted).
        """
        for _ in range(self.MAX_RATE_LIMIT_RETRIES + 1):
            try:
                response = requests.get(
                    endpoint,
                    headers=self.auth_headers,
                    params=query_params,
                    timeout=self.REQUEST_TIMEOUT,
                )
            except requests.RequestException as exc:
                self.logger.error(f"Request failed: {exc}")
                return None

            if response.ok:
                return response.json()

            wait_time = None
            if response.status_code in (403, 429):
                wait_time = self._rate_limit_wait(response)

            if wait_time is None:
                self.logger.error(f"Request failed: {response.status_code} - {response.text}")
                return None

            self.logger.warning(f"Rate limit exceeded. Waiting for {wait_time} seconds.")
            time.sleep(wait_time)

        self.logger.error(f"Request failed: still rate limited after {self.MAX_RATE_LIMIT_RETRIES} retries - {endpoint}")
        return None

    def normalize_company_name(self, company_name: Optional[str]) -> str:
        """
        Normalize a company name: trim whitespace, drop '@' and upper-case it.

        Args:
            company_name (str): Original company name (may be None or empty).

        Returns:
            str: Normalized company name, or "" when no name was given.
        """
        if not company_name:
            return ""
        # Trim whitespace first so a padded value such as " @acme " still
        # loses its '@', then trim again in case '@' was followed by a space.
        return company_name.strip().strip('@').strip().upper()

    @staticmethod
    def _build_search_query(location: str, min_followers: int) -> str:
        """
        Build the user-search query string.

        Args:
            location (str): Search location.
            min_followers (int): Minimum follower threshold (exclusive).

        Returns:
            str: Query for the ``q`` parameter of the user search endpoint.
        """
        place = location.strip()
        # Qualifier values containing whitespace must be quoted, otherwise
        # only the first word is bound to the `location:` qualifier.
        if any(ch.isspace() for ch in place) and not place.startswith('"'):
            place = f'"{place}"'
        return f"location:{place} followers:>{min_followers}"

    def find_users(self, location: str, min_followers: int) -> List[Dict]:
        """
        Find GitHub users by location and minimum follower count.

        Each search hit is followed by a profile request so that the full
        profile fields are available.

        Args:
            location (str): Search location.
            min_followers (int): Minimum follower threshold.

        Returns:
            list[dict]: List of user data dictionaries.
        """
        users_found, current_page = [], 1
        search_url = f"{self.base_api_url}/search/users"
        query = self._build_search_query(location, min_followers)

        while True:
            self.logger.info(f"Searching users, page {current_page}")
            params = {'q': query, 'per_page': self.PER_PAGE, 'page': current_page}
            result = self.request_data(search_url, params)

            if not result or 'items' not in result:
                break

            for user in result['items']:
                user_detail = self.request_data(user['url'])
                if user_detail:
                    users_found.append({
                        'login': user_detail.get('login', ''),
                        'name': user_detail.get('name', ''),
                        'company': self.normalize_company_name(user_detail.get('company')),
                        'location': user_detail.get('location', ''),
                        'email': user_detail.get('email', ''),
                        'hireable': user_detail.get('hireable', False),
                        'bio': user_detail.get('bio', ''),
                        'public_repos': user_detail.get('public_repos', 0),
                        'followers': user_detail.get('followers', 0),
                        'following': user_detail.get('following', 0),
                        'created_at': user_detail.get('created_at', '')
                    })

            if len(result['items']) < self.PER_PAGE:
                break
            if current_page * self.PER_PAGE >= self.SEARCH_RESULT_CAP:
                # Further pages are rejected by the API; stop instead of
                # issuing a request that is known to fail.
                self.logger.warning(
                    f"Search results are capped at {self.SEARCH_RESULT_CAP}; remaining matches are not retrievable."
                )
                break
            current_page += 1

        return users_found

    def fetch_user_repos(self, username: str, limit: int = 500) -> List[Dict]:
        """
        Fetch a user's repositories, most recently pushed first.

        Args:
            username (str): GitHub username.
            limit (int, optional): Max number of repos to fetch.

        Returns:
            list[dict]: List of repository data dictionaries.
        """
        user_repos, page = [], 1
        repo_url = f"{self.base_api_url}/users/{username}/repos"

        while len(user_repos) < limit:
            self.logger.info(f"Fetching repositories for {username}, page {page}")
            params = {'sort': 'pushed', 'direction': 'desc', 'per_page': self.PER_PAGE, 'page': page}
            repositories = self.request_data(repo_url, params)

            if not repositories:
                break

            # Take only as many repositories as are still needed.
            for repo in repositories[:limit - len(user_repos)]:
                # `license` is null for unlicensed repositories.
                license_info = repo.get('license') or {}
                user_repos.append({
                    'login': username,
                    'full_name': repo.get('full_name', ''),
                    'created_at': repo.get('created_at', ''),
                    'stargazers_count': repo.get('stargazers_count', 0),
                    'watchers_count': repo.get('watchers_count', 0),
                    'language': repo.get('language', ''),
                    'has_projects': repo.get('has_projects', False),
                    'has_wiki': repo.get('has_wiki', False),
                    'license_name': license_info.get('key', '')
                })

            if len(repositories) < self.PER_PAGE:
                break
            page += 1

        return user_repos

def main():
    """
    Interactively scrape GitHub users and their repositories into CSV files.

    Prompts for a token, a location and a minimum follower count, then writes
    the matching users to 'users.csv' and their repositories to
    'repositories.csv' in the current working directory. Returns early with a
    message when any input is missing or invalid.
    """
    # Prompt for GitHub token
    token = input("Please provide your GitHub token: ").strip()
    if not token:
        print("Token required to proceed.")
        return

    scraper = GitHubScraper(token)

    # Prompt for location
    location = input("Please provide the location(make sure to give correct spelling): ").strip()
    if not location:
        print("Location required to proceed.")
        return

    # Prompt for followers. Validate the raw text before converting so that
    # a blank or non-numeric answer produces a message instead of a traceback,
    # and so that 0 is accepted as a legitimate threshold.
    raw_min_followers = input("Please provide the minimum number of followers: ").strip()
    if not raw_min_followers:
        print("Minimum followers required to proceed.")
        return
    try:
        min_followers = int(raw_min_followers)
    except ValueError:
        print("Minimum followers must be a whole number.")
        return
    if min_followers < 0:
        print("Minimum followers cannot be negative.")
        return

    # Search for users in the requested location above the follower threshold
    users = scraper.find_users(location, min_followers)
    users_df = pd.DataFrame(users)
    users_df.to_csv('users.csv', index=False)

    # Gather repositories for each user found
    all_repositories = []
    for user in users:
        all_repositories.extend(scraper.fetch_user_repos(user['login']))

    repos_df = pd.DataFrame(all_repositories)
    repos_df.to_csv('repositories.csv', index=False)

    print(f"Collected {len(users)} users and {len(all_repositories)} repositories.")

if __name__ == "__main__":
    main()


# Data Cleaning

In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load the scraped users and repositories tables.
users, repos = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/users.csv', "/content/repositories.csv")
)

In [4]:
# Serialize `hireable` as lower-case text. False is mapped as well so the
# column uses the same 'true'/'false' spelling as the repository boolean
# columns; a missing value becomes an empty string.
users['hireable'] = users['hireable'].replace({True: 'true', False: 'false', np.nan: ''})
# Blank out every remaining missing value so empty cells are written as ''.
users.replace(np.nan, '', inplace=True)
# Overwrite the raw file with the cleaned table.
users.to_csv('/content/users.csv', index=False)

In [6]:
# Blank out missing values, then spell the boolean flags in lower case.
repos.replace(np.nan, '', inplace=True)
flag_columns = ['has_projects', 'has_wiki']
lowercase_flags = {True: 'true', False: 'false'}
repos[flag_columns] = repos[flag_columns].replace(lowercase_flags)
# Overwrite the raw file with the cleaned table.
repos.to_csv('/content/repositories.csv', index=False)

# Assignment

#Ques 1

In [7]:
# Logins of the five users with the most followers, highest first.
sorted_users = users.sort_values(by='followers', ascending=False)
top_five_logins = sorted_users["login"].head(5)
",".join(top_five_logins.tolist())

'AlexGyver,carlcastanas,sergeyshaykhullin,alexey-goloburdin,richardroberti'

#Ques2

In [8]:
# Logins of the five users with the smallest `created_at` values.
sorted_users = users.sort_values(by='created_at')
earliest_logins = sorted_users["login"].head(5)
",".join(earliest_logins.tolist())

'maxlapshin,veged,alexeyr,alec-c4,alno'

#Ques3

In [10]:
# Missing licenses were blanked to '' during cleaning, so dropna() alone
# keeps them. Filter out both NaN and '' explicitly instead of assuming the
# blank value is the most frequent one and skipping it by position.
has_license = repos['license_name'].notna() & (repos['license_name'] != '')
users_with_licenses = repos[has_license]
license_counts = users_with_licenses['license_name'].value_counts()
top_3_licenses = license_counts.nlargest(3).index.tolist()

print(','.join(top_3_licenses))

mit,other,apache-2.0


#Ques4

In [11]:
# Most frequent non-blank company. Blanks (users without a company) are
# removed explicitly rather than skipped by position, which only worked
# while the blank value happened to rank first.
company_names = users["company"].dropna()
top_comp = company_names[company_names != ''].value_counts().index
top_comp[0]

'YANDEX'

#Ques5

In [12]:
# Most frequent non-blank repository language; blanks are filtered out
# explicitly instead of being skipped by position.
languages = repos['language'].dropna()
languages[languages != ''].value_counts().index[0]


'JavaScript'

#Ques6

In [13]:
# Parse the creation timestamps so the year can be compared numerically.
users['created_at'] = pd.to_datetime(users['created_at'])
# Accounts created in 2020 or later.
after2020 = users[users['created_at'].dt.year > 2019]
# Languages of the repositories owned by those users. Blanks are removed
# explicitly, so position 1 is the second most frequent real language
# regardless of how often the language is missing.
recent_languages = repos.loc[repos['login'].isin(after2020['login']), 'language'].dropna()
recent_languages[recent_languages != ''].value_counts().index[1]

'Python'

#Ques7

In [14]:
# Average stars per language. Repositories without a language were blanked
# to '' during cleaning; exclude them so the blank group cannot be reported
# as a language.
repos_with_language = repos[repos['language'].notna() & (repos['language'] != '')]
language_avg_stars = repos_with_language.groupby('language')['stargazers_count'].mean()
highest_avg_stars_language = language_avg_stars.idxmax()
highest_avg_stars_language

'Pascal'

#Ques8

In [15]:
# Strength = followers / (1 + following); the +1 avoids division by zero.
users['user_strength'] = users['followers'].div(1 + users['following'])
# nlargest() returns index *labels*, so select with label-based .loc
# (positional .iloc only gave the right rows while the index was 0..n-1).
strongest_logins = users.loc[users['user_strength'].nlargest(5).index, 'login']
",".join(strongest_logins.tolist())

'AlexGyver,alexey-goloburdin,yandex,yandexdataschool,esokolov'

#Ques9

In [16]:
# Pairwise correlation between follower count and public repository count.
numeric_columns = ['followers', 'public_repos']
cor = users.loc[:, numeric_columns].corr()
cor

Unnamed: 0,followers,public_repos
followers,1.0,0.051138
public_repos,0.051138,1.0


In [17]:
# Off-diagonal entry of the correlation matrix, rounded to three decimals.
cor.loc['public_repos', 'followers'].round(3)

0.051

#Ques10

In [18]:
import statsmodels.formula.api as sm

# Ordinary least squares fit of followers on public_repos; the coefficient of
# public_repos is the fitted change in followers per additional repository.
regression = sm.ols(formula='followers ~ public_repos', data=users)
model = regression.fit()
additional_followers_per_repo = model.params['public_repos']
additional_followers_per_repo.round(3)

0.209

#Ques11

In [19]:
# Turn the 'true'/'false' text flags into 1/0 and correlate the two columns.
flag_to_int = {'true': 1, 'false': 0}
has_projects = repos['has_projects'].map(flag_to_int)
has_wiki = repos['has_wiki'].map(flag_to_int)
cor = has_projects.corr(has_wiki)
cor.round(3)


0.345

#Ques12

In [20]:
# Mean `following` of hireable users minus that of everyone else.
is_hireable = users['hireable'] == 'true'
avg_following_hireable = users.loc[is_hireable, 'following'].mean()
avg_following_non_hireable = users.loc[~is_hireable, 'following'].mean()
difference = avg_following_hireable - avg_following_non_hireable
difference.round(3)

-29.407

#Ques13

In [21]:
# Count whitespace-separated words in each bio (missing bios count as 0).
bio_text = users['bio'].fillna('')
users['bio_word_count'] = bio_text.str.split().str.len()
# Only users who actually wrote a bio take part in the fit.
users_with_bios = users.loc[users['bio_word_count'] > 0]
correlation = users_with_bios['bio_word_count'].corr(users_with_bios['followers'])
# Slope of followers regressed on bio word count.
model = sm.ols('followers ~ bio_word_count', data=users_with_bios).fit()
slope = model.params['bio_word_count']

slope.round(3)

0.523

#Ques14

In [22]:
# Repositories whose creation timestamp falls on day-of-week 5 or 6
# (Saturday or Sunday), counted per owner; keep the five largest counts.
repo_created = pd.to_datetime(repos['created_at'])
on_weekend = repo_created.dt.dayofweek.isin([5, 6])
weekend_counts = repos.loc[on_weekend, 'login'].value_counts()
top5 = weekend_counts.nlargest(5).index
','.join(top5)

'VN0,kuggaa,reverse-ex,Mirocow,vslinko'

#Ques15

In [23]:

# Share of users with a non-empty email, hireable versus the rest.
hireable_mask = users['hireable'] == 'true'

hireable_users = users[hireable_mask]
fraction_email_hireable = (hireable_users['email'] != '').mean()

non_hireable_users = users[~hireable_mask]
fraction_email_non_hireable = (non_hireable_users['email'] != '').mean()

difference = fraction_email_hireable - fraction_email_non_hireable
print(round(difference, 3))

0.189


#Ques16

In [24]:

# Surname = last whitespace-separated word of the trimmed name.
name_parts = users['name'].str.strip().str.split()
users['surname'] = name_parts.str[-1]
surname_counts = users['surname'].value_counts()
max_count = surname_counts.max()
# Every surname tied for the highest count, in alphabetical order.
tied_for_first = surname_counts[surname_counts == max_count]
most_common_surnames = sorted(tied_for_first.index.tolist())
print(','.join(most_common_surnames))




Romanov


# Answers

In [None]:
#1    AlexGyver,carlcastanas,sergeyshaykhullin,alexey-goloburdin,richardroberti
#2    maxlapshin,veged,alexeyr,alec-c4,alno
#3    mit,other,apache-2.0
#4    YANDEX
#5    JavaScript
#6    Python
#7    Pascal
#8    AlexGyver,alexey-goloburdin,yandex,yandexdataschool,esokolov
#9    0.051
#10   0.209
#11   0.345
#12   -29.407
#13   0.523
#14   VN0,kuggaa,reverse-ex,Mirocow,vslinko
#15   0.189
#16   Romanov