# Downloading the HTML content of a web page using the requests library

In [102]:
import requests

In [103]:
# Address of the GitHub topic page that the rest of the notebook downloads and parses.
url = 'https://github.com/topics/machine-learning'
url

'https://github.com/topics/machine-learning'

In [104]:
# Download the topic page.
# - timeout: without it requests waits indefinitely on a stalled connection.
# - raise_for_status(): stops here on a 4xx/5xx reply instead of letting an
#   error page be written to disk and parsed by the cells below.
response = requests.get(url, timeout=30)
response.raise_for_status()
response.status_code

200

In [105]:
# Keep a local copy of the page so the parsing cells can be re-run
# without downloading it again.
contents = response.text
with open('ml.html', mode='w', encoding='utf-8') as file:
    file.write(contents)

# Extracting the required information from the HTML content using BeautifulSoup
- Repo User name
- Repo Name
- Repo Description
- Repo Url
- Repo Stars
- Repo image

In [106]:
# The parsing cell below instantiates `BeautifulSoup`, so that is the class
# that has to be imported here.
from bs4 import BeautifulSoup

In [107]:
# Load the saved page back from disk (text mode is open()'s default).
with open('./ml.html', encoding='utf-8') as file:
    html_content = file.read()

In [110]:
# Parse the page with Python's built-in HTML parser.
soup = BeautifulSoup(html_content, features='html.parser')

- fetching username of repos

In [126]:
# Class attribute of the <h3> elements that the username cell reads from.
user_heading_class = 'f3 color-fg-muted text-normal lh-condensed'
h3_tag_user_name = soup.find_all('h3', class_=user_heading_class)

In [137]:
# First <a> inside each heading; headings without a link are skipped.
user_links = (heading.find('a') for heading in h3_tag_user_name)
usernames = [link.text.strip() for link in user_links if link]

usernames

['tensorflow',
 'huggingface',
 'pytorch',
 'netdata',
 'microsoft',
 'Developer-Y',
 'keras-team',
 'tesseract-ocr',
 'd2l-ai',
 'scikit-learn',
 'binhnguyennus',
 'ageitgey',
 'labmlai',
 'deepfakes',
 'ultralytics',
 'iperov',
 'JuliaLang',
 'Avik-Jain',
 'aymericdamien',
 'LAION-AI']

In [143]:
# Sanity check: how many usernames were collected.
len(usernames)

20

- fetching repo name

In [138]:
# Class attribute of the <a> elements whose text is read as the repository name.
repo_link_class = 'Link text-bold wb-break-word'
a_tag_repo_name = soup.find_all('a', class_=repo_link_class)

In [142]:
# Link text, with surrounding whitespace removed, is the repository name.
repo_names = [link.text.strip() for link in a_tag_repo_name]

repo_names

['tensorflow',
 'transformers',
 'pytorch',
 'netdata',
 'ML-For-Beginners',
 'cs-video-courses',
 'keras',
 'tesseract',
 'd2l-zh',
 'scikit-learn',
 'awesome-scalability',
 'face_recognition',
 'annotated_deep_learning_paper_implementations',
 'faceswap',
 'yolov5',
 'DeepFaceLab',
 'julia',
 '100-Days-Of-ML-Code',
 'TensorFlow-Examples',
 'Open-Assistant']

In [144]:
# Sanity check: how many repository names were collected.
len(repo_names)

20

- fetching repo description

In [145]:
# Class attribute of the <p> elements whose text is read as the description.
description_class = 'color-fg-muted mb-0'
p_tag_repo_desc = soup.find_all('p', class_=description_class)

In [150]:
# Paragraph text, with surrounding whitespace removed, is the description.
repo_description = [paragraph.text.strip() for paragraph in p_tag_repo_desc]

repo_description

['An Open Source Machine Learning Framework for Everyone',
 '🤗 Transformers: State-of-the-art Machine Learning for Pytorch, TensorFlow, and JAX.',
 'Tensors and Dynamic neural networks in Python with strong GPU acceleration',
 'The open-source observability platform everyone needs!',
 '12 weeks, 26 lessons, 52 quizzes, classic Machine Learning for all',
 'List of Computer Science courses with video lectures.',
 'Deep Learning for humans',
 'Tesseract Open Source OCR Engine (main repository)',
 '《动手学深度学习》：面向中文读者、能运行、可讨论。中英文版被70多个国家的500多所大学用于教学。',
 'scikit-learn: machine learning in Python',
 'The Patterns of Scalable, Reliable, and Performant Large-Scale Systems',
 "The world's simplest facial recognition api for Python and the command line",
 '🧑\u200d🏫 60 Implementations/tutorials of deep learning papers with side-by-side notes 📝; including transformers (original, xl, switch, feedback, vit, ...), optimizers (adam, adabelief, sophia, ...), gans(cyclegan, stylegan2, ...), 🎮 reinforcement

In [148]:
# Sanity check: how many descriptions were collected.
len(repo_description)

20

- fetching repo url

In [149]:
# Same <a> elements that carry the repository name; their href is used for the URL.
repo_url_link_class = 'Link text-bold wb-break-word'
a_tag_repo_url = soup.find_all('a', class_=repo_url_link_class)

In [200]:
# Prefix that is concatenated with each link's href to form a full URL.
base_url = 'https://github.com'

In [201]:
# Build full repository URLs by prefixing each link's href with base_url.
# The iteration variable is deliberately not named `url`: that name holds the
# topic-page address used by the download cell, and rebinding it here would
# make a later re-run of that cell pass a tag object to requests.get().
# A comprehension also keeps the iteration variable out of the notebook namespace.
repo_urls = [base_url + repo_link['href'] for repo_link in a_tag_repo_url]

repo_urls

['https://github.com/tensorflow/tensorflow',
 'https://github.com/huggingface/transformers',
 'https://github.com/pytorch/pytorch',
 'https://github.com/netdata/netdata',
 'https://github.com/microsoft/ML-For-Beginners',
 'https://github.com/Developer-Y/cs-video-courses',
 'https://github.com/keras-team/keras',
 'https://github.com/tesseract-ocr/tesseract',
 'https://github.com/d2l-ai/d2l-zh',
 'https://github.com/scikit-learn/scikit-learn',
 'https://github.com/binhnguyennus/awesome-scalability',
 'https://github.com/ageitgey/face_recognition',
 'https://github.com/labmlai/annotated_deep_learning_paper_implementations',
 'https://github.com/deepfakes/faceswap',
 'https://github.com/ultralytics/yolov5',
 'https://github.com/iperov/DeepFaceLab',
 'https://github.com/JuliaLang/julia',
 'https://github.com/Avik-Jain/100-Days-Of-ML-Code',
 'https://github.com/aymericdamien/TensorFlow-Examples',
 'https://github.com/LAION-AI/Open-Assistant']

- fetching repo stars

In [157]:
# Class attribute of the <span> elements whose text is read as the star count.
star_counter_class = 'Counter js-social-count'
span_tag_repo_star = soup.find_all('span', class_=star_counter_class)

In [191]:
# Raw counter text, exactly as it appears in the page (not yet numeric).
repo_stars = [counter.text for counter in span_tag_repo_star]
    
def convert_stars(star):
    """Convert a star-count label such as '184k' into an integer.

    Parameters
    ----------
    star : str
        Counter text, e.g. '950', '1,234', '80.4k' or '1.2M'. Surrounding
        whitespace, thousands separators and the letter case of the suffix
        are ignored.

    Returns
    -------
    int
        The star count as a whole number.

    Raises
    ------
    ValueError
        If the text is empty or is not a number with an optional k/M suffix.
    """
    multipliers = {'k': 1000, 'm': 1000000}
    text = star.strip().replace(',', '')
    if not text:
        # Indexing text[-1] on an empty string would raise an unhelpful IndexError.
        raise ValueError('empty star count')
    factor = multipliers.get(text[-1].lower())
    if factor is None:
        return int(text)
    # round() rather than int(): the float product can land a hair below the
    # intended whole number, and int() would truncate it to one less.
    return round(float(text[:-1]) * factor)
    
# Numeric star counts, in the same order as repo_stars.
converted_repo_stars = list(map(convert_stars, repo_stars))
converted_repo_stars

[184000,
 129000,
 80400,
 69600,
 68300,
 66000,
 61300,
 59700,
 59300,
 58900,
 56900,
 52400,
 51900,
 49900,
 48600,
 46300,
 45000,
 44000,
 43300,
 36900]

- fetching repo images

In [169]:
# Class attribute of the <img> elements whose src is read as the repository image.
repo_image_class = 'd-block width-full'
img_tag_repo_image = soup.find_all('img', class_=repo_image_class)

In [175]:
# Collect the src of every matched image, skipping tags with no (or empty) src.
# NOTE(review): this list only has entries for images that were actually found,
# so it can be shorter than the repository lists and its positions may not line
# up with them - confirm before pairing it with the other columns.
image_sources = (image.get('src') for image in img_tag_repo_image)
repo_images = [source for source in image_sources if source]

repo_images

['https://repository-images.githubusercontent.com/155220641/a16c4880-a501-11ea-9e8f-646cf611702e',
 'https://repository-images.githubusercontent.com/10744183/8d08ea53-6359-45fe-bc4d-067cfe1673a1',
 'https://repository-images.githubusercontent.com/343965132/549b1a80-c897-11eb-9436-918072d2e0f8',
 'https://repository-images.githubusercontent.com/115478820/109a8e00-283a-11ea-8891-ad7215b06a4c',
 'https://repository-images.githubusercontent.com/290091948/ac5a4b00-3e4b-11eb-948f-8e1ff5bdcc63',
 'https://repository-images.githubusercontent.com/264818686/c9bae91d-ad2d-491c-876f-b6948f1a7c66',
 'https://repository-images.githubusercontent.com/1644196/ddfc1e00-6638-11e9-9b80-0fe7b9aedd72']

# Converting to csv

In [206]:
import pandas as pd

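**The image list has only 7 entries while every other list has 20, and pandas raises an error when a DataFrame is built from lists of unequal length. So we check the length of each list and truncate all of them to the shortest one. Note that this keeps only the first 7 repositories and assumes the 7 images belong to exactly those repositories, which is not guaranteed.**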

In [248]:
# Check lengths of each list
print(f'Usernames length: {len(usernames)}')
print(f'Repo names length: {len(repo_names)}')
print(f'Repo descriptions length: {len(repo_description)}')
print(f'Repo URLs length: {len(repo_urls)}')
print(f'Repo stars length: {len(converted_repo_stars)}')
print(f'Repo images length: {len(repo_images)}')

Usernames length: 20
Repo names length: 20
Repo descriptions length: 20
Repo URLs length: 20
Repo stars length: 20
Repo images length: 7


In [250]:
# Determine the minimum length among all lists
min_length = min(len(usernames), len(repo_names), len(repo_description), len(repo_urls), len(converted_repo_stars), len(repo_images))
min_length

7

In [251]:
# Truncate all lists to the minimum length
usernames = usernames[:min_length]
repo_names = repo_names[:min_length]
repo_description = repo_description[:min_length]
repo_urls = repo_urls[:min_length]
converted_repo_stars = converted_repo_stars[:min_length]
repo_images = repo_images[:min_length]

In [252]:
# creating a dictionary of repo details
repo_info_dict = {'Repo Name': repo_names, 
                  'User Name': usernames, 
                  'Repo Description': repo_description, 
                  'Repo Url': repo_urls, 
                  'Repo Stars': converted_repo_stars, 
                  'Repo Profile': repo_images
                }

In [253]:
# converting repo details into a pandas DataFrame
repo_info = pd.DataFrame.from_dict(repo_info_dict)
repo_info

Unnamed: 0,Repo Name,User Name,Repo Description,Repo Url,Repo Stars,Repo Profile
0,tensorflow,tensorflow,An Open Source Machine Learning Framework for ...,https://github.com/tensorflow/tensorflow,184000,https://repository-images.githubusercontent.co...
1,transformers,huggingface,🤗 Transformers: State-of-the-art Machine Learn...,https://github.com/huggingface/transformers,129000,https://repository-images.githubusercontent.co...
2,pytorch,pytorch,Tensors and Dynamic neural networks in Python ...,https://github.com/pytorch/pytorch,80400,https://repository-images.githubusercontent.co...
3,netdata,netdata,The open-source observability platform everyon...,https://github.com/netdata/netdata,69600,https://repository-images.githubusercontent.co...
4,ML-For-Beginners,microsoft,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",https://github.com/microsoft/ML-For-Beginners,68300,https://repository-images.githubusercontent.co...
5,cs-video-courses,Developer-Y,List of Computer Science courses with video le...,https://github.com/Developer-Y/cs-video-courses,66000,https://repository-images.githubusercontent.co...
6,keras,keras-team,Deep Learning for humans,https://github.com/keras-team/keras,61300,https://repository-images.githubusercontent.co...


In [257]:
# converting pandas DataFrame to csv
repo_info.to_csv('Ml Topic Repo Info\'s.csv', index=None)

In [258]:
# Read the file back to confirm it was written correctly; show the first five rows.
df = pd.read_csv("./Ml Topic Repo Info's.csv")
df.head(5)

Unnamed: 0,Repo Name,User Name,Repo Description,Repo Url,Repo Stars,Repo Profile
0,tensorflow,tensorflow,An Open Source Machine Learning Framework for ...,https://github.com/tensorflow/tensorflow,184000,https://repository-images.githubusercontent.co...
1,transformers,huggingface,🤗 Transformers: State-of-the-art Machine Learn...,https://github.com/huggingface/transformers,129000,https://repository-images.githubusercontent.co...
2,pytorch,pytorch,Tensors and Dynamic neural networks in Python ...,https://github.com/pytorch/pytorch,80400,https://repository-images.githubusercontent.co...
3,netdata,netdata,The open-source observability platform everyon...,https://github.com/netdata/netdata,69600,https://repository-images.githubusercontent.co...
4,ML-For-Beginners,microsoft,"12 weeks, 26 lessons, 52 quizzes, classic Mach...",https://github.com/microsoft/ML-For-Beginners,68300,https://repository-images.githubusercontent.co...
