In [None]:
# Goals: to find 800 repository and get ML to classify its language

# ACQUIRE

In [30]:
# import
import requests
import json
from bs4 import BeautifulSoup

In [31]:
# set up API request headers
from env import github_token, github_username

headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username
}

In [32]:
# make API requests to get repository names
language = "python"
api_url = f"https://api.github.com/search/repositories?q=language:{language}&sort=stars&order=desc&per_page=100"
response = requests.get(api_url, headers=headers)

if response.status_code == 200:
    repositories = response.json()["items"]
else:
    print(f"Failed to fetch repository list. Status code: {response.status_code}")
    exit()

Goal: add all these results to a list, then find a way to go through and request another 10 results with always adding them to a list. clean up emphasis text from entries

In [33]:
import requests
import json
from env import github_token, github_username

# Define the headers with your token and username
headers = {
    "Authorization": f"token {github_token}",
    "User-Agent": github_username
}

# Define a function to get repositories based on a query and page
def get_repositories(query, sort="stars", order="desc", page=1):
    base_url = "https://api.github.com/search/repositories"
    params = {
        "q": query,
        "sort": sort,
        "order": order,
        "per_page": 10,  # You can get up to 100 results per page.
        "page": page  # Specify the page number
    }
    response = requests.get(base_url, headers=headers, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        return None

# Search for breast cancer related repositories and handle pagination
query = "breast cancer"
page = 1

# Create a list to collect the repository data
repository_data = []

while page <= 10:
    response_json = get_repositories(query, page=page)
    if response_json is None:
        print(f"Failed to fetch page {page} of repositories.")
        break

    # Extract and collect repository names
    for repo in response_json["items"]:
        repository_data.append({
            "full_name": repo["full_name"]
        })

    # Check if there are more pages
    if "Link" in response.headers:
        next_link = response.headers["Link"]
        if 'rel="next"' not in next_link:
            break
    else:
        break
    page += 1


In [34]:
# ... (Previous code remains the same)

# Extract only the repository names from the dictionaries
repository_names = [repo_data["full_name"] for repo_data in repository_data]

# Save the repository names list to a JSON file
with open("breast_cancer_repository.json", "w") as json_file:
    json.dump(repository_names, json_file, indent=2)

print("Data saved to data2.json")


Data saved to data2.json


In [35]:
df = pd.read_json('data2.json')
df

Unnamed: 0,0
0,nyukat/breast_cancer_classifier
1,lishen/end2end-all-conv
2,ImagingLab/ICIAR2018
3,Jean-njoroge/Breast-cancer-risk-prediction
4,abhinavsagar/breast-cancer-classification
...,...
95,BishalDali/Breast_Cancer_Prediction
96,IndianAIProduction-Channel/Breast-Cancer-Detec...
97,gmineo/Breast-Cancer-Prediction-Project
98,iharnoor/BreastCancer-Kmeans


# Prep phase

In [36]:
import unicodedata
import re
import json


import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd



In [51]:
df = df.rename(columns={0: 'repository'})


In [50]:
df['repository'] = df['repository'].str.lower()
df

Unnamed: 0,repository
0,nyukat/breast_cancer_classifier
1,lishen/end2end-all-conv
2,imaginglab/iciar2018
3,jean-njoroge/breast-cancer-risk-prediction
4,abhinavsagar/breast-cancer-classification
...,...
95,bishaldali/breast_cancer_prediction
96,indianaiproduction-channel/breast-cancer-detec...
97,gmineo/breast-cancer-prediction-project
98,iharnoor/breastcancer-kmeans


In [53]:
# Apply the normalization and cleaning operations to the 'repository' column
df['repository'] = df['repository'].apply(lambda x: unicodedata.normalize('NFKD', x)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore'))

# Now, the 'repository' column contains cleaned text with non-ASCII characters removed
print(df)

                                           repository
0                     nyukat/breast_cancer_classifier
1                             lishen/end2end-all-conv
2                                imaginglab/iciar2018
3          jean-njoroge/breast-cancer-risk-prediction
4           abhinavsagar/breast-cancer-classification
..                                                ...
95                bishaldali/breast_cancer_prediction
96  indianaiproduction-channel/breast-cancer-detec...
97            gmineo/breast-cancer-prediction-project
98                       iharnoor/breastcancer-kmeans
99            sagnikghoshcr7/breast-cancer-prediction

[100 rows x 1 columns]


In [56]:
# Apply the regex substitution to the 'text' column
df['repository'] = df['repository'].apply(lambda x: re.sub(r"[^a-z0-9'\s]", '', x))

# Now, the 'text' column contains cleaned text with unwanted characters removed
print(df)

                                           repository
0                        nyukatbreastcancerclassifier
1                                lishenend2endallconv
2                                 imaginglabiciar2018
3               jeannjorogebreastcancerriskprediction
4              abhinavsagarbreastcancerclassification
..                                                ...
95                   bishaldalibreastcancerprediction
96  indianaiproductionchannelbreastcancerdetectionapp
97                gmineobreastcancerpredictionproject
98                         iharnoorbreastcancerkmeans
99               sagnikghoshcr7breastcancerprediction

[100 rows x 1 columns]


In [57]:
# tokenization

tokenizer = nltk.tokenize.ToktokTokenizer()

print(tokenizer.tokenize(df, return_str=True)[0:500])

repository
0 nyukatbreastcancerclassifier
1 lishenend2endallconv
2 imaginglabiciar2018
3 jeannjorogebreastcancerriskprediction
4 abhinavsagarbreastcancerclassification
 .. ... 
95 bishaldalibreastcancerprediction
96 indianaiproductionchannelbreastcancerdetectionapp
97 gmineobreastcancerpredictionproject
98 iharnoorbreastcancerkmeans
99 sagnikghoshcr7breastcancerprediction

[ 100 rows x 1 columns ]
