In [1]:
import os
import json
from typing import Dict, List, Optional, Union, cast
from env import github_token, github_username
import acquire
import pandas as pd
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import regex as re
import time
import numpy as np
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud
import prepare_repos
import collections
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
def create_urls(num=5000):
    ''' this function scrapes the cryptography repositories from github and returns a list of urls
    '''
    num_of_repos=num

    page_numbers = [i for i in range(0,101)]
    print(page_numbers)
    urls = [f'https://github.com/search?p={i}&q=%23defi&type=Repositories&per_page=100' for i in page_numbers]

    print(urls)
    return urls

##### create_urls()

In [None]:
def get_endpoints(url):
    ''' This function gets the endpoints from the list of above urls
    '''

    headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}
    
    while True:
        response = requests.get(url, headers=headers)
        if response.ok:
            break
        else:
            print('sleeping')
            time.sleep(20)
            continue
    soup = BeautifulSoup(response.text)
    
    print(response.ok)

    endpoints = []
    subgroups = soup.find_all('div', {"class":"f4 text-normal"})

    for group in subgroups:
        endpoints.append(re.search('href=".*"', str(group))[0][6:-1])

    return endpoints

In [None]:
get_endpoints('https://github.com/search?p=100&q=%23defi&type=Repositories&per_page=100')

In [None]:
def make_all_endpoints():
    ''' This function returns all of the endpoints
    '''
    urls = create_urls()
    for url in urls:
        print(url)
    all_endpoints = []

    for i, page in enumerate(urls):
        all_endpoints.append(get_endpoints(page))
        print(page)

    print(len(all_endpoints))

    return all_endpoints

In [None]:
make_all_endpoints()

In [None]:
def acquire_endpoints():
    ''' This function acquires all endpoints and writes them to a csv.
    '''
    our_endpoints = pd.Series(make_all_endpoints(), name='endpoints')
    our_endpoints.to_csv('endpoints.csv', index=False)

    return our_endpoints

In [None]:
x=acquire_endpoints()

In [None]:
def flatten_endpoints():
    ''' This function flattens a 2d array into a 1d array
    '''
    end_points = pd.read_csv('endpoints.csv')
    all_values = []
    for value in end_points.values:
        for ep in value:
            all_values.append(ep)

    final_values = []
    #print(all_values)
    for value in all_values:
        for val in value.split("'"):
            if len(val) > 3:
                final_values.append(val)
                print(val)

    return pd.Series(final_values, name='endpoints')

In [None]:
endpoints = flatten_endpoints()

In [None]:
REPOS = list(endpoints)

In [None]:
REPOS[5:]

In [None]:

headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    ''' This function makes requests from github and raises an error code if a specific error code is received.
    '''
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data


def get_repo_language(repo: str) -> str:
    ''' This function fetches the language associated with a repository
    '''
    url = f"https://api.github.com/repos{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        if "language" not in repo_info:
            raise Exception(
                "'language' key not round in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )

def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    ''' This function fetches the contents associated with a repo
    '''
    url = f"https://api.github.com/repos{REPOS}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )

def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""

def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)
    if readme_download_url == "":
        readme_contents = ""
    else:
        response = requests.get(readme_download_url)
        print(readme_download_url)
        print(response.status_code)
        readme_contents = requests.get(readme_download_url).text
        
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


In [None]:
def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    
    output = []
    for repo in REPOS:
        
        url = f"https://api.github.com/repos{repo}/contents/"
        print(repo)
        response = requests.get(url, headers=headers)
        
        if response.status_code != 200:
            print(f"Skipping {repo} because its HTTP status code is {response.status_code}")
            continue
        
        contents = response.json()
        readme_download_url = get_readme_download_url(contents)
        
        if readme_download_url == "":
            readme_contents = ""
        else:
            response = requests.get(readme_download_url)
            if response.status_code != 200:
                print(f"Skipping {repo} because its HTTP status code is {response.status_code}")
                continue
            readme_contents = requests.get(readme_download_url).text
        print(repo)
        result = {
            "repo": repo,
            "language": get_repo_language(repo),
            "readme_contents": readme_contents,
        }
    
        output.append(result)
        
    return output

In [None]:
data = scrape_github_data()

In [None]:
data[:5]

In [None]:
import csv
to_csv = data
keys = to_csv[0].keys()

with open('data.csv', 'w', newline='') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(to_csv)

In [2]:
df = pd.read_csv('data.csv') 
df.head()

Unnamed: 0,repo,language,readme_contents
0,/OffcierCia/DeFi-Developer-Road-Map,,# DeFi Developer Road Map\n\n**Here we collect...
1,/smartcontractkit/full-blockchain-solidity-cou...,,<!-- [YouTube Video](https://www.youtube.com/w...
2,/rainbow-me/rainbow,TypeScript,![](https://pbs.twimg.com/profile_banners/1103...
3,/Bytom/bytom,Go,Bytom\n======\n\n[![Build Status](https://trav...
4,/DimensionDev/Maskbook,TypeScript,<!-- cspell:disable -->\n<!-- markdownlint-dis...


In [None]:
df.language.value_counts()

### Prepare

In [None]:
df.isnull().sum()

In [4]:
df = df.dropna()
df.shape

(976, 3)

In [3]:
df = prepare_repos.fill_nulls(df)
df.shape

(1010, 3)

In [5]:
df['readme_contents'] = df.readme_contents.apply(prepare_repos.basic_clean)
df.head()

Unnamed: 0,repo,language,readme_contents
0,/OffcierCia/DeFi-Developer-Road-Map,Not Specified,defi developer road map\n\n here we collect...
1,/smartcontractkit/full-blockchain-solidity-cou...,Not Specified,youtube video https www youtube com w...
2,/rainbow-me/rainbow,TypeScript,https pbs twimg com profile banners 1103...
3,/Bytom/bytom,Go,bytom\n \n\n build status https trav...
4,/DimensionDev/Maskbook,TypeScript,cspell disable \n markdownlint dis...


In [6]:
df['readme_contents'] = df.readme_contents.apply(prepare_repos.tokenize).apply(prepare_repos.lemmatize).apply(prepare_repos.remove_stopwords)
df.head()

Unnamed: 0,repo,language,readme_contents
0,/OffcierCia/DeFi-Developer-Road-Map,Not Specified,defi developer road map collect discus best de...
1,/smartcontractkit/full-blockchain-solidity-cou...,Not Specified,youtube video http www youtube com watch v m57...
2,/rainbow-me/rainbow,TypeScript,http pb twimg com profile banner 1103191459409...
3,/Bytom/bytom,Go,bytom build status http travis ci org bytom by...
4,/DimensionDev/Maskbook,TypeScript,cspell disable markdownlint disable inline htm...


In [None]:
df['word_count'] = df['readme_contents'].apply(lambda x : len(x.split()))

In [None]:
df['num_unique_words'] = df['readme_contents'].apply(lambda x: len(set(w for w in x.split())))
df['num_repeated_words'] = df['readme_contents'].apply(lambda x: len([w for w in collections.Counter(x.split()).values() if w > 1]))
df['character_count'] = df['readme_contents'].apply(lambda x : len(x.replace(" ","")))
df.head()

In [None]:
df.isnull().sum()

In [None]:
df.word_count.plot()

In [None]:
df.to_csv('clean_df.csv', index = False)

In [None]:
fig = px.bar(df, x='language', y='word_count', title='Counts of words', template='plotly_white', labels={'ngram': 'Bigram', 'count': 'Count'})
fig.show()

In [None]:
fig = px.histogram(df, x='language', template='plotly_white', title='Complain counts by length')
fig.update_xaxes(categoryorder='total descending', title='Number of words (clipped at 1000 words)').update_yaxes(title='Number of complaints')
fig.show()

In [None]:
df.head()

In [25]:
# Limit languagues with less than 4 spots in a repo
remaining_languages = df.language.value_counts()[df.language.value_counts() > 4].index

# Removing languages not kept
df = df[df.language.isin(remaining_languages)]

In [None]:
df.shape

In [None]:
labels = pd.concat([df.language.value_counts(),
                    df.language.value_counts(normalize=True)], axis=1)
labels.columns = ['n', 'percent']
labels

In [None]:
df.language.value_counts()

In [None]:
java_words = prepare_repos.clean(' '.join(df[df.language == 'JavaScript'].readme_contents))
Tscript_words = prepare_repos.clean(' '.join(df[df.language == 'TypeScript'].readme_contents))
all_words = prepare_repos.clean(' '.join(df.readme_contents))
idk_words = prepare_repos.clean(' '.join(df[df.language == 'Not Specified'].readme_contents))
solidity_words = prepare_repos.clean(' '.join(df[df.language == 'Solidity'].readme_contents))
python_words = prepare_repos.clean(' '.join(df[df.language == 'Python'].readme_contents))

In [None]:
java_words

In [None]:
java_freq = pd.Series(java_words).value_counts()
Tscript_freq = pd.Series(Tscript_words).value_counts()
all_freq = pd.Series(all_words).value_counts()
idk_freq = pd.Series(idk_words).value_counts()
solidity_freq = pd.Series(solidity_words).value_counts()
python_freq = pd.Series(python_words).value_counts()
java_freq.head()

In [None]:
Tscript_freq.head()

In [None]:
word_counts = (pd.concat([all_freq, java_freq, Tscript_freq, idk_freq, solidity_freq, python_freq], axis=1, sort=True)
                .set_axis(['all', 'java', 'TypeScript', 'Not_Specified', 'Solidity', 'Python'], axis=1, inplace=False)
                .fillna(0)
                .apply(lambda s: s.astype(int)))

word_counts.head(15)

In [None]:
word_counts.sort_values(by='TypeScript', ascending=False).head(10)

In [None]:
all_cloud = WordCloud(background_color='white', height=1000, width=400).generate(' '.join(all_words))
java_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(java_words))
Tscript_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(Tscript_words))
idk_cloud = WordCloud(background_color='white', height=1000, width=400).generate(' '.join(idk_words))
Solidity_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(solidity_words))
python_cloud = WordCloud(background_color='white', height=600, width=800).generate(' '.join(python_words))

plt.figure(figsize=(10, 8))
axs = [plt.axes([0, 0, .5, 1]), plt.axes([.5, .5, .5, .5]), plt.axes([.5, 0, .5, .5]), plt.axes([.5, .5, .5, .5]), plt.axes([.5, .5, .5, .5]), plt.axes([.5, .5, .5, .5])]

axs[0].imshow(all_cloud)
axs[1].imshow(java_cloud)
axs[2].imshow(Tscript_cloud)
axs[3].imshow(idk_cloud)
axs[4].imshow(Solidity_cloud)
axs[5].imshow(python_cloud)

axs[0].set_title('All Words')
axs[1].set_title('java')
axs[2].set_title('Tscript_words')
axs[3].set_title('Not Specified')
axs[4].set_title('solidity')
axs[5].set_title('python')

for ax in axs: ax.axis('off')

In [None]:
plt.imshow(java_cloud)

In [None]:
word_counts.head()

In [None]:
(word_counts
 .assign(p_java=word_counts.java / word_counts['all'],
         p_TypeScript=word_counts.TypeScript / word_counts['all'],
         p_idk=word_counts.Not_Specified / word_counts['all'],
         p_Solidity=word_counts.Solidity / word_counts['all'],
         p_python=word_counts.Python / word_counts['all'])
 .sort_values(by='all')
 [['java', 'TypeScript', 'Solidity', 'Not_Specified', 'Python']]
 .tail(20)
 .sort_values('java')
 .plot.barh(stacked=True))

plt.title('Proportion of language for the 20 most common words')

In [23]:
df.head()

Unnamed: 0,repo,language,readme_contents
0,/OffcierCia/DeFi-Developer-Road-Map,Not Specified,defi developer road map collect discus best de...
1,/smartcontractkit/full-blockchain-solidity-cou...,Not Specified,youtube video http www youtube com watch v m57...
2,/rainbow-me/rainbow,TypeScript,http pb twimg com profile banner 1103191459409...
3,/Bytom/bytom,Go,bytom build status http travis ci org bytom by...
4,/DimensionDev/Maskbook,TypeScript,cspell disable markdownlint disable inline htm...


In [27]:

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df.readme_contents)
y = df.language

X_train, X_validate, X_test, y_train, y_validate, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))
train.head()

ValueError: not enough values to unpack (expected 6, got 4)

In [7]:
def split(df, stratify_by='language'):
    # split df into train_validate test
    train_validate, test = train_test_split(df, test_size=.20, random_state=13)
    train, validate = train_test_split(train_validate, test_size=.3, random_state=13)

    X_train = train.drop(columns=['language'])
    y_train = train[['language']]

    X_validate = validate.drop(columns=['language'])
    y_validate = validate[['language']]

    X_test = test.drop(columns=['language'])
    y_test = test[['language']]

    return train, X_train, X_validate, X_test, y_train, y_validate, y_test

In [8]:
from sklearn.model_selection import train_test_split
train, X_train, X_validate, X_test, y_train, y_validate, y_test = split(df, stratify_by='language') #split data
X_train.head()

Unnamed: 0,repo,readme_contents
217,/xBidi/PancakeSwapBot,update v2 pancakeswap prediction bot pancakesw...
332,/Lucas-Kohorst/awesome-defi,div align center title lint ignore dead url aw...
82,/TP-Lab/tp-js-sdk,tp j sdk tokenpocket http tokenpocket gz bcebo...
914,/FIREDAO/firedao-protocol,firedao protocol logo http github com firedao ...
744,/taraldefi/taral,readme ha generated file blueprint md p align ...


In [9]:
X_train = X_train.drop(columns=['repo'])
X_validate = X_validate.drop(columns=['repo'])
X_test = X_test.drop(columns=['repo'])
X_train.head()

Unnamed: 0,readme_contents
217,update v2 pancakeswap prediction bot pancakesw...
332,div align center title lint ignore dead url aw...
82,tp j sdk tokenpocket http tokenpocket gz bcebo...
914,firedao protocol logo http github com firedao ...
744,readme ha generated file blueprint md p align ...


## baseline 

In [None]:
df.language.value_counts()

In [10]:
# new df for predictions
predictions = pd.DataFrame({ 
    'actual': train.language
})


In [11]:
predictions['baseline'] = train[train['language'] == 'JavaScript'].shape[0] /train.shape[0]

In [12]:
predictions.head()

Unnamed: 0,actual,baseline
217,TypeScript,0.239927
332,Not Specified,0.239927
82,JavaScript,0.239927
914,TypeScript,0.239927
744,TypeScript,0.239927


In [None]:
df.info()

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(df['readme_contents'])
tfidfs

<976x44953 sparse matrix of type '<class 'numpy.float64'>'
	with 198672 stored elements in Compressed Sparse Row format>

In [21]:
features= pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names())
features



Unnamed: 0,00,000,0000,000000,00000000,000000000000000000,0000000000000000000000000000000000000000000000000000000000000001,000000000000000002,000000000000000282,00000000000009764515173366604499968328796917891,...,zzutbyhibihwi3qvkmone1dgl03vtm1w8oddusjedkbpbvaa8,zzuy29j4eoph7crhhnsaoqjid15ui3xwkxcxk5v5zxlqpez8rnuspt08meyoyovvfxpgql,zzvgq2npaunvb63owt33x,zzw,zzwmavvvbp3yapsrmfp4nswdwuaqiecbagqiaagrdivtjzx0zt85z9zf8a8rtn,zzxthngejsu8vr,zzxyl,zzy,zzy55b7axooubbgcvqn297r4p,zzzs
0,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.010906,0.000,0.000000,0.001573,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
971,0.183109,0.053,0.007972,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
972,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
973,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
974,0.000000,0.000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
features.info()

In [None]:
df['readme_contents'].head()

In [None]:
 #From the Series we can extract the value_counts, which is our raw count
# for term frequency. Once we have the raw counts, we can calculate the
# other measures.
df2 = (pd.DataFrame({'raw_count': word_counts.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [20]:
X_train.head()

Unnamed: 0,readme_contents
217,update v2 pancakeswap prediction bot pancakesw...
332,div align center title lint ignore dead url aw...
82,tp j sdk tokenpocket http tokenpocket gz bcebo...
914,firedao protocol logo http github com firedao ...
744,readme ha generated file blueprint md p align ...


In [19]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 217 to 757
Data columns (total 1 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   readme_contents  546 non-null    object
dtypes: object(1)
memory usage: 8.5+ KB


In [15]:
clf = DecisionTreeClassifier(max_depth=3, random_state=123)
clf = clf.fit(X_train, y_train)

plt.figure(figsize=(13, 7))
plot_tree(clf, feature_names=X_train.columns, class_names=clf.classes_, rounded=True)

ValueError: could not convert string to float: "update v2 pancakeswap prediction bot pancakeswap prediction game bot includes backtesting code try new strategy changing env variable risk free environment work pancakepredictionv2 http pancakeswap finance prediction use 1 provide private key env private key field 2 install dependency npm 4 start app npm run start 5 enjoy winning export private key metamask open account click three point top right corner account detail export private key backtesting pancakeswap data 1 update initialepoch current epoch pancake 5 example current epoch 25890 25885 2 update finalepoch current epoch pancake 500 example current epoch 25890 25390 3 install dependency npm 4 run npm run download data needed code contains history json file 5 run npm run backtest 6 open file chart png alt pancakeswap prediction bot winner image ppw image png alt pancakeswap prediction bot winner screenshot image ppw image 2 png alt candle genie bot winner screenshot image ppw image 3 png strategy bot strategy found src bot http github com xbidi pancakeswapbot blob main src bot l73 bet biggest bull bear payout const bet roundbullamount roundbearamount ' bull ' ' bear ' increase bet amount bot us strategy check env modify multiplier initial bet amount http en wikipedia org wiki martingale probability theory beware fork give guarantee fork may turn scam disclaimer ' coding stuff pure open source every time bot win donates small portion winning developer account continue improving bot 0xc3c531be09102e84d4273984e29e827d71e28ae8 investment strategy investment involve risk loss nothing contained program script code repository construed investment advice reference investment ' past potential performance construed recommendation guarantee specific outcome profit using program accept liability claim made developer others connected program"