In [None]:
#!pip install toml flask flask_cors requests beautifulsoup4 PyGithub retry 
import toml
import pandas as pd
import numpy as np
import time

import os
import re
from datetime import datetime
from github import Github, RateLimitExceededException
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException
from retry import retry

from flask import Flask, request, jsonify
from flask_cors import CORS

from pathlib import Path
from transformers import AutoTokenizer

### Get a clean list of GitHub Repositories of Near code

In [None]:
# Load the NEAR ecosystem definition published by Electric Capital:
# https://github.com/electric-capital/crypto-ecosystems/blob/master/data/ecosystems/n/near.toml
data = toml.load("near.toml")

# Printing the whole parsed document floods the cell output (the 'repo' list
# alone holds thousands of entries), so show a compact summary instead:
# scalar values as-is, containers by their length.
print({
    key: len(value) if isinstance(value, (list, dict)) else value
    for key, value in data.items()
})

In [None]:
# Turn the entries stored under the 'repo' key into a DataFrame (one row per
# entry). Note that `data` changes from the parsed TOML dict to a DataFrame
# here, so this cell cannot be re-run without re-loading the TOML first.
repo_entries = data['repo']
data = pd.DataFrame(repo_entries)
data.head()

In [None]:
# Count the missing values in every column.
# no interesting info in 'missing' and 'tags' columns
data.isnull().sum()

In [None]:
# Show the column labels of the repository DataFrame.
data.columns

In [None]:
# Drop the two columns that carry no useful information.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because 'missing' and 'tags' have already been removed.
data = data.drop(columns=['missing', 'tags'], errors='ignore')
data.head()

In [None]:
# First step of cleaning duplicated repos: create a 'name' column that starts
# out as a copy of the URL. Later cells break the URL down to the repo name
# and delete the duplicated names.
data = data.assign(name=data['url'])
data.head()

In [None]:
# Reduce every URL to its last path segment, i.e. the repository name.
# Vectorised string operations replace the former row-by-row loop, which wrote
# through enumerate() counters used as row labels and was therefore only
# correct while the index was exactly 0..n-1.
# rstrip('/') prevents a URL with a trailing slash from yielding an empty name.
data['name'] = data['url'].str.rstrip('/').str.split('/').str[-1]

print(len(data))
data.head()

In [None]:
# Are there duplicated repos? Any count above 1 means the name occurs more than once.
name_counts = data['name'].value_counts()
name_counts

In [None]:
# Total number of rows whose name repeats that of an earlier row.
is_repeated_name = data.duplicated(subset=['name'])
print(is_repeated_name.sum())

In [None]:
# Number of rows expected to remain once duplicated names are removed:
# every row that is the first occurrence of its name.
is_first_occurrence = ~data['name'].duplicated()
print(is_first_occurrence.sum())

In [None]:
# Number of distinct values in the 'name' column
# (dropna=False so a missing value is counted too, as unique() would).
print(data['name'].nunique(dropna=False))

In [None]:
# Delete duplicated repo names while keeping the original repo from Near (maintained)
rootUrl = 'https://github.com/near/'

# Rows whose URL sits under the root URL
filtered_data = data[data['url'].str.startswith(rootUrl)]

# One row per name among those rows. keep='last' reproduces the former
# row-by-row loop, in which a later matching row overwrote an earlier one.
unique_names = filtered_data.drop_duplicates(subset=['name'], keep='last')
near_url_by_name = unique_names.set_index('name')['url']

# Keep the first occurrence of every name, then swap in the root-URL variant
# wherever one exists. A single map() lookup replaces the iterrows() loop that
# compared the whole 'name' column once per filtered row; names without a
# root-URL variant keep the URL they already had.
data = (
    data
    .drop_duplicates(subset=['name'])
    .assign(url=lambda frame: frame['name'].map(near_url_by_name).fillna(frame['url']))
)

print(len(data))

# verify by printing the url for name 'near-sdk-rs'
print(data[data['name'] == 'near-sdk-rs'])

In [None]:
# Reset the index so that row labels equal row positions again after the
# de-duplication removed rows. reset_index() returns a new DataFrame, so the
# result must be assigned back; the bare call left `data` unchanged, which
# made label lookups (data.index[...]) disagree with positional .iloc slices.
data = data.reset_index(drop=True)
# The tail shows the highest labels, confirming they now run up to len(data) - 1.
data.tail()

In [None]:
# Walk over the URL column and print each row label next to its repository URL
# (these are the URLs that get scraped further down).
for row_label, repo_url in zip(data.index, data['url']):
    print(row_label, repo_url)

In [None]:
# Look up the row label(s) of one specific repository URL.
# NOTE(review): the result is a label, not a position; the two only coincide
# when the index is a plain 0..n-1 range.
is_target_url = data['url'].eq('https://github.com/howmarketing/fluxus-app-front')
data.index[is_target_url].tolist()

In [None]:
# Preview the rows from position 5242 onward.
# NOTE(review): 5242 is presumably where a previous scraping run stopped - confirm.
remaining_urls = data['url'].iloc[5242:]
for row_label, repo_url in remaining_urls.items():
    print(row_label, repo_url)

### Launch the RepoToText app locally, then scrape the GitHub repositories listed in the DataFrame
https://github.com/JeremiahPetersen/RepoToText

In [None]:
# Send every remaining repository URL to the locally running RepoToText app.
# The scraped text is deliberately not copied into the DataFrame any more
# (per the original note, the file is saved locally), so only the outcome of
# each request is tracked here.
#data['repototext'] = ''

SCRAPE_ENDPOINT = 'http://localhost:5000/scrape'
# (connect, read) timeouts in seconds. Without a timeout a single stalled
# request blocks the whole run forever; raise the read timeout if very large
# repositories legitimately need longer.
SCRAPE_TIMEOUT = (10, 600)
# Row position to resume from.
# NOTE(review): presumably where the previous run stopped - confirm before re-running.
START_POSITION = 5243
# File extensions to request ("all files"); built once instead of once per URL.
SELECTED_FILE_TYPES = ['.apk','.rs','.toml','.txt','.py','.js','.sql','.env','.json','.html','.css','.md','.ts','.java','.cpp','.c','.cs','.php','.rb','.xml','.yml','.sh','.swift','.h','.pyw','.asm','.bat','.cmd','.cls','.coffee','.erb','.go','.groovy','.htaccess','.java','.jsp','.lua','.make','.matlab','.pas','.perl','.pl','.ps1','.r','.scala','.scm','.sln','.svg','.vb','.vbs','.xhtml','.xsl',]

# URLs that could not be scraped, kept so they can be retried later.
failed_urls = []

for index, url in data['url'].iloc[START_POSITION:].items():
    # Construct the payload
    payload = {
        'repoUrl': url,
        'docUrl': '',  # Optional: Documentation URL
        'selectedFileTypes': SELECTED_FILE_TYPES,
    }

    # Send POST request to the Flask app
    print(index, url)
    try:
        response = requests.post(SCRAPE_ENDPOINT, json=payload, timeout=SCRAPE_TIMEOUT)
    except RequestException as e:
        # Connection problems and timeouts: report, remember, move on.
        print(f"Error occurred while scraping {url}: {e}")
        failed_urls.append(url)
        continue

    if response.status_code != 200:
        print(f"Failed to scrape {url}. Status code: {response.status_code}")
        failed_urls.append(url)

    # Sleep for a few seconds to avoid hitting the server too frequently
    #time.sleep(3)

# The DataFrame is NOT modified by this cell (no 'repototext' column is
# created); failed_urls lists the repositories that still need a retry.
print(f"{len(failed_urls)} repositories failed to scrape")

In [None]:
# Quick look at the first five rows after the scraping step.
data.head(n=5)

In [None]:
# Inspect the scraped text of one repository (row position 1418).
# The 'repototext' column is only present when the scraping cell stores the
# text in the DataFrame; that code is currently commented out, so an
# unguarded lookup raises KeyError.
if 'repototext' in data.columns:
    print(data['repototext'].iloc[1418])
else:
    print("No 'repototext' column: the scraped text is not stored in the DataFrame.")

In [None]:
# Persist the cleaned repository list as CSV, leaving the index out of the file.
output_csv = Path('nearCode.csv')
data.to_csv(output_csv, index=False)

In [None]:
# TODO: remove duplicated code from the scraped files
# TODO: split the DataFrame into successfully scraped and failed repositories
# TODO: loop over the failed repositories again to collect more apps