In [1]:
import re
import base64
import json
import requests
import os

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from scipy.sparse import vstack,csr_matrix,save_npz

def hash(text:str):
    """Deterministic 32-bit rolling hash of a string.

    Note: this intentionally shadows the built-in ``hash`` so that the value
    of a token is the same in every run of the notebook.

    Parameters
    ----------
    text : str
        The string to hash.

    Returns
    -------
    int
        A value in the range [0, 0xFFFFFFFF]; 0 for the empty string.
    """
    digest = 0
    for code_point in map(ord, text):
        # Multiply-then-xor step; `*` binds tighter than `^`.
        mixed = (digest * 281) ^ (code_point * 997)
        # Keep only the low 32 bits.
        digest = mixed & 0xFFFFFFFF
    return digest
# Because the number of GitHub API calls is limited, each run only processes the repositories at
# positions bg (inclusive) to nd (exclusive) of the repository list.
# If the API limit is reached before the whole range has been read, the crawl stops early: the data
# gathered so far is saved to .npz files whose names carry the range that was actually read, and the
# index at which the crawl stopped is printed.
bg = 5667
nd = 5750

In [2]:
# GitHub API tokens are read from the GITHUB_TOKENS environment variable
# (comma-separated) so that no credential is stored in the notebook.
# When the variable is unset or empty the list is [None], which makes every
# request go out unauthenticated.
# SECURITY: the personal access tokens that used to be hardcoded here have been
# exposed and must be revoked.
def load_github_tokens(raw):
    """Split a comma-separated string of tokens into a list.

    Parameters
    ----------
    raw : str or None
        Comma-separated tokens; surrounding whitespace and empty items are ignored.

    Returns
    -------
    list
        The tokens in order, or ``[None]`` when there are none (unauthenticated use).
    """
    tokens = [item.strip() for item in (raw or "").split(",") if item.strip()]
    return tokens or [None]


github_tokens = load_github_tokens(os.environ.get("GITHUB_TOKENS"))
ptr = 0  # index of the token currently in use; rotated by the crawl loop
github_token = github_tokens[ptr]
# Number of hash buckets: every token hash is reduced modulo MOD.
MOD = 10007
# Space-separated string "0 1 2 ... MOD-1" listing every bucket id once.
MOD_space = ' '.join(str(bucket) for bucket in range(MOD))

# Crawl state. stop_procedure is switched to True by get_filenames when crawling
# must be aborted; stop_index is advanced by the crawl loop once per repository
# and starts one position before the first repository of the range.
stop_procedure = False
stop_index = bg - 1

# File extensions (without the dot) that mark a file as source code.
source_extentions = [
    'py', 'ipynb', 'cpp', 'c', 'cfg', 'js', 'json',
    'vue', "xml", "java", "sh", "php", "rb", "ts",
]
# Accumulators filled by recur(): file entries of the repository being scanned.
text, source = [], []

def github_read_file(username, repository_name, file_path, timeout=30):
    """Fetch one file from a GitHub repository through the contents API.

    Parameters
    ----------
    username : str
        Owner of the repository.
    repository_name : str
        Name of the repository.
    file_path : str
        Path of the file inside the repository.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The file content. An empty string is returned when the request fails,
        when the response is not valid JSON, or when the response does not
        describe a single file (no "content" field, e.g. a directory listing).

    Raises
    ------
    UnicodeDecodeError, binascii.Error
        When the base64 payload cannot be decoded as UTF-8 text. This is left
        to propagate so callers can skip such files.
    """
    global github_token
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"

    url = f'https://api.github.com/repos/{username}/{repository_name}/contents/{file_path}'
    try:
        # A timeout keeps a stalled connection from hanging the whole crawl.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        data = r.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP failure or undecodable JSON body. Unlike a bare
        # `except`, this lets KeyboardInterrupt stop a long crawl.
        return ""
    if not isinstance(data, dict) or 'content' not in data:
        return ""
    file_content = data['content']
    file_content_encoding = data.get('encoding')
    if file_content_encoding == 'base64':
        file_content = base64.b64decode(file_content).decode()
    return file_content


def get_files(username,repository_name,file_path):
    """Return the content of one repository file.

    Thin alias of ``github_read_file``; arguments and return value are passed
    through unchanged.
    """
    return github_read_file(username, repository_name, file_path)


def get_contents(username, repository_name, timeout=30):
    """List the top-level contents of a repository through the GitHub API.

    Parameters
    ----------
    username : str
        Owner of the repository.
    repository_name : str
        Name of the repository.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The decoded JSON body of the response, or an empty list when the request
    fails (network error, HTTP error status, or undecodable JSON).
    """
    global github_token
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"

    url = f'https://api.github.com/repos/{username}/{repository_name}/contents'
    try:
        # The request itself is inside the try so that connection errors take
        # the same "return []" path as HTTP errors.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError):
        return []

def recur(user,repo,r):
    """Walk a repository listing recursively and collect its file entries.

    Entries whose name starts with '.' are skipped. File entries ending in
    'md' or 'txt' are appended to the global ``text`` list; file entries whose
    extension is in ``source_extentions`` are appended to the global ``source``
    list. Directory entries are listed through their 'url' and walked in turn.

    Parameters
    ----------
    user, repo : str
        Owner and name of the repository (only passed on to nested calls).
    r : requests.Response
        Response of a contents-API listing request; ``r.json()`` is iterated.

    Notes
    -----
    A sub-directory that cannot be listed or walked is skipped (best effort),
    but the failure is reported instead of being silently dropped.
    """
    global github_token,text,source
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"
    for entry in r.json():
        name = entry["name"]
        if name.startswith('.'):
            continue
        kind = entry['type']
        if kind == 'file':
            # Text after the last dot; the whole name when there is no dot.
            extension = name.split('.')[-1]
            if extension == 'md' or extension == 'txt':
                text.append(entry)
            elif extension in source_extentions:
                source.append(entry)
        elif kind == 'dir':
            url = entry['url']
            try:
                # Use a separate name so the parameter `r` is not rebound
                # while its listing is being iterated.
                listing = requests.get(url, headers=headers, timeout=30)
                listing.raise_for_status()
                recur(user,repo,listing)
            except Exception as err:
                # Still best-effort, but visible; KeyboardInterrupt is no
                # longer swallowed as it was by the bare `except`.
                print("error listing directory",name,type(err).__name__)
        
def get_filenames(user, repo):
    """Collect the text and source file entries of one repository.

    Resets the global ``text`` and ``source`` lists, requests the top-level
    listing of the repository and lets ``recur`` fill both lists.

    On failure the error is printed. The global ``stop_procedure`` is set to
    True when the API error message is not one of the known harmless ones
    ('This repository is empty.', 'Repository access blocked', 'Not Found'),
    or when no response was received at all.

    Parameters
    ----------
    user : str
        Owner of the repository.
    repo : str
        Name of the repository.
    """
    global github_token,source,text,stop_procedure
    source = []
    text = []
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"
    url = f'https://api.github.com/repos/{user}/{repo}/contents'
    harmless_messages = ['This repository is empty.' , "Repository access blocked" , "Not Found"]
    response = None
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        recur(user,repo,response)
    except Exception as err:
        print("error @ get_filenames",user,repo)
        if response is None:
            # The request itself failed (connection error, timeout, ...), so
            # there is no response to inspect. Previously this path raised
            # from inside the handler because `r` was never assigned.
            print("no response received:",type(err).__name__)
            stop_procedure = True
            return
        try:
            payload = response.json()
            print(payload)
            if payload['message'] not in harmless_messages:
                stop_procedure = True
        except Exception:
            # Body is not JSON, or it is not an error object with a 'message'.
            print("Dual Error")
            print(response)
 
def extract_words(user,repo,text,stem=False):
    """Download the given files and turn their words into hashed bucket ids.

    For every file the content is fetched, digits and punctuation are replaced
    by spaces, words of one or two characters are removed, whitespace runs are
    collapsed, the text is lower-cased and every space-separated token is
    replaced by ``hash(token) % MOD``.

    Parameters
    ----------
    user, repo : str
        Owner and name of the repository.
    text : list of dict
        File entries to read; each needs a 'path' key (and 'name', which is
        used in the error message).
    stem : bool, optional
        When True, every token is stemmed with PorterStemmer before hashing.
        The default False reproduces the previous output exactly: the original
        code called ``ps.stem(data)`` on the whole document and discarded the
        result, so stemming was never applied. Files produced with different
        ``stem`` settings are not comparable.

    Returns
    -------
    str
        Space-separated bucket ids of all readable files, with a trailing
        space after each file. Files that fail to load or process are
        reported and skipped.

    Notes
    -----
    ``hash`` must be the deterministic function defined in this notebook; the
    built-in ``hash`` of str is not stable across interpreter runs.
    """
    puntuation = '1234567890-=!@#$%^&*()+[]{};:"\'|\\<>~/?`<>.,'
    # One translation table instead of one str.replace pass per character.
    to_space = str.maketrans({ch: ' ' for ch in puntuation})
    stemmer = PorterStemmer() if stem else None
    out = ""
    # The loop variable is no longer reused by an inner loop, so the error
    # handler below always sees the file entry (it used to see a character
    # and fail with TypeError on i["name"]).
    for entry in text:
        try:
            data = get_files(user,repo,entry['path'])
            data = data.translate(to_space)
            data = data.replace("\n",' ')
            data = re.sub(r'\b\w{1,2}\b', '', data)
            data += ' '
            data = re.sub(r"\s\s+" , " ", data)
            if len(data)>0 and data[0]==' ':
                data = data[1:]
            data = data.lower()
            words = data.split(' ')
            if stemmer is not None:
                words = [stemmer.stem(word) for word in words]
            data = ' '.join([str(hash(word)%MOD) for word in words])
            out += data + ' '
        except Exception:
            print("error reading",entry["name"])
    return out

In [3]:
# Code to import all significant repos of users from users.txt file
# Possible errors if the github token is invalid
def get_repos(path):
    """Write the repositories of the listed users to ./repos.txt.

    Parameters
    ----------
    path : str
        Text file with one GitHub user name per line; blank lines are ignored.

    Side effects
    ------------
    Writes ./repos.txt with one 'owner/name' per line, without duplicates and
    in order of first appearance. Users whose request fails are reported and
    counted; the scan stops once more than 30 users have failed.

    NOTE(review): a single request is made per user, so only the first page
    of that user's repositories is read. Confirm whether paging is needed.
    """
    global github_token
    with open(path,'r') as f:
        users = [line.strip() for line in f]
    headers = {}
    if github_token:
        headers['Authorization'] = f"token {github_token}"
    repos = []
    seen = set()  # O(1) duplicate check; `repos` keeps the order
    ec = 0
    Error_Limit = 30
    for user in users:
        if not user:
            continue
        try:
            r = requests.get("https://api.github.com/users/{}/repos".format(user),headers = headers,timeout = 30)
            r.raise_for_status()
            for e in r.json():
                full_name = e['full_name']
                if full_name not in seen:
                    seen.add(full_name)
                    repos.append(full_name)
        except Exception:
            ec+=1
            print("error @ ",user)
        if ec>Error_Limit:
            break
    # The context manager closes the file even if writing fails.
    with open("./repos.txt",'w') as out:
        out.writelines([name+'\n' for name in repos])
#get_repos("../input/githubrecsys1/users.txt")

In [4]:
# Load the repository list: one "owner/name" per line.
with open("../input/githubrecsys1/repos.txt","r") as f:
    out = [line.strip() for line in f]

# Reverse lookup from repository full name to its position in `out`
# (for a repeated name the last position wins).
ind = {full_name: position for position, full_name in enumerate(out)}
    
# Crawl the repositories out[bg:nd] and collect, per repository, the hashed
# words of its text files and of its source files.
Repo_text_hashed = []
Repo_source_hashed = []
for repo_full_name in out[bg:nd]:
    print(repo_full_name)
    stop_index += 1
    # Rotate to the next token before every repository.
    ptr = (ptr + 1) % len(github_tokens)
    github_token = github_tokens[ptr]
    owner_and_repo = repo_full_name.split("/")
    get_filenames(*owner_and_repo)
    if stop_procedure:
        # stop_index now points at the repository that could not be read.
        break
    Repo_text_hashed.append(extract_words(*owner_and_repo, text))
    Repo_source_hashed.append(extract_words(*owner_and_repo, source))

# After a complete run stop_index is moved one past the last repository read.
if not stop_procedure:
    stop_index += 1
    
# Vocabulary: bucket id as string -> column index (the identity on 0..MOD-1).
vocab = {str(bucket): bucket for bucket in range(MOD)}


def get_mapper(voc):
    """Build a token -> column lookup; tokens missing from `voc` map to voc['0']."""
    def mapper(key):
        if key not in voc:
            key = '0'
        return voc[key]
    return mapper


mapper = get_mapper(vocab)


def vectorise_documents(documents):
    """Turn hashed documents into two sparse matrices with one row per document.

    Parameters
    ----------
    documents : list of str
        Space-separated bucket ids, as produced by extract_words.

    Returns
    -------
    (tf, presence) : tuple of scipy sparse matrices, each with MOD columns
        tf[r, c]       = occurrences of bucket c in document r divided by the
                         number of space-separated tokens of that document.
        presence[r, c] = 1.0 if bucket c occurs in document r, else 0.0.
        Both have zero rows when `documents` is empty; calling vstack on an
        empty list would fail instead.
    """
    tf_rows = []
    presence_rows = []
    for doc in documents:
        # str.split(' ') always returns at least one token, so the division
        # below never divides by zero. Empty tokens map to column voc['0'].
        tokens = doc.split(' ')
        columns = np.fromiter((mapper(tok) for tok in tokens), dtype=np.int64, count=len(tokens))
        counts = np.bincount(columns, minlength=MOD)
        tf_rows.append(csr_matrix(counts / np.float64(len(tokens)), dtype=np.float64))
        presence = np.zeros(MOD)
        presence[columns] = 1
        presence_rows.append(csr_matrix(presence, dtype=np.float64))
    if not tf_rows:
        empty = csr_matrix((0, MOD), dtype=np.float64)
        return empty, empty.copy()
    return vstack(tf_rows), vstack(presence_rows)


Repo_text_points, idf_count_text = vectorise_documents(Repo_text_hashed)
Repo_source_points, idf_count_source = vectorise_documents(Repo_source_hashed)

# The hashed strings are no longer needed once the matrices are built.
del Repo_source_hashed,Repo_text_hashed

AngelicaTheran/prueba
AngelicaTheran/repasojs
AngelicaTheran/Trivia
y3rb1t4/Algoritmos-y-Estructuras-de-Datos
y3rb1t4/banco-gd
y3rb1t4/htb-arg
y3rb1t4/kali-clean
y3rb1t4/learn_csharp
y3rb1t4/learn_devops
y3rb1t4/learn_python
y3rb1t4/lsrootkit
y3rb1t4/ml-python-utn
y3rb1t4/my-notes
y3rb1t4/neovim
y3rb1t4/nextjs-course-app
y3rb1t4/nodejs
y3rb1t4/Portfolio
y3rb1t4/production-grade-nextjs
y3rb1t4/pwi-heroku-deployment
y3rb1t4/react-2021
y3rb1t4/utn-react-2021
y3rb1t4/y3rb1t4
miguel9903/API-REST-NodeJS-Express-MongoDB
miguel9903/AppClima-NodeJS
miguel9903/AppListaTareas-NodeJS
miguel9903/Basic-CRUD-Node-Express
miguel9903/Basic-Server-Node-Express
miguel9903/CRUD-Firebase-Angular
miguel9903/Curso-CSS-Ed-2018
error reading LICENSE.txt
miguel9903/Curso-JS-Ed-2018-2020
miguel9903/Ejemplo-Responsive-Flex
miguel9903/Ejercicio-SASS
miguel9903/Ejercicios-Maquetacion-HTML-CSS-JS
miguel9903/Formularios-Angular
miguel9903/Frontend-I-CTD
miguel9903/LoginApp-Angular
miguel9903/MEAN-STACK-Backend
miguel

In [5]:
# Save the four matrices; the file names carry the first and the last
# repository index that were read successfully.
print(stop_index)
# stop_index is one past the last repository read. A local value is used
# instead of decrementing stop_index in place, so re-running this cell writes
# the same file names instead of shifting the range by one each time.
last_index = stop_index - 1
for name_pattern, matrix in (
    ("./Repo_TextFeature_{}:{}", Repo_text_points),
    ("./Repo_SourceFeature_{}:{}", Repo_source_points),
    ("./Repo_IDF_count_text_{}:{}", idf_count_text),
    ("./Repo_IDF_count_source_{}:{}", idf_count_source),
):
    save_npz(name_pattern.format(bg, last_index), matrix)

5750


In [6]:
# Disabled cell: remove the surrounding triple quotes to plot two heatmaps of pairwise
# cosine similarity between repositories (one from the text features, one from the
# source features) and save the figure as "sample_similarity_matrix".
'''from sklearn.metrics.pairwise import cosine_similarity
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
Repo_similarity_text = cosine_similarity(Repo_text_points,Repo_text_points)
Repo_similarity_source = cosine_similarity(Repo_source_points,Repo_source_points)
Repo_similarity_text = pd.DataFrame(Repo_similarity_text,columns = out[bg:nd],index = out[bg:nd])
Repo_similarity_source = pd.DataFrame(Repo_similarity_source,columns = out[bg:nd],index = out[bg:nd])
plt.figure(figsize = (15,30))
plt.subplot(211)
sns.heatmap(Repo_similarity_text)
plt.subplot(212)
sns.heatmap(Repo_similarity_source)
plt.savefig("sample_similarity_matrix")'''

'from sklearn.metrics.pairwise import cosine_similarity\nimport seaborn as sns\nimport pandas as pd\nimport matplotlib.pyplot as plt\nRepo_similarity_text = cosine_similarity(Repo_text_points,Repo_text_points)\nRepo_similarity_source = cosine_similarity(Repo_source_points,Repo_source_points)\nRepo_similarity_text = pd.DataFrame(Repo_similarity_text,columns = out[bg:nd],index = out[bg:nd])\nRepo_similarity_source = pd.DataFrame(Repo_similarity_source,columns = out[bg:nd],index = out[bg:nd])\nplt.figure(figsize = (15,30))\nplt.subplot(211)\nsns.heatmap(Repo_similarity_text)\nplt.subplot(212)\nsns.heatmap(Repo_similarity_source)\nplt.savefig("sample_similarity_matrix")'

In [7]:
# Disabled cell: remove the surrounding triple quotes to display the raw GitHub API response
# for one repository, e.g. to see which error message the API returns for it.
# NOTE: `headers` is built but not passed to requests.get below, so this request is sent
# without the Authorization header.
'''headers = {}
if github_token:
    headers['Authorization'] = f"token {github_token}"
requests.get("https://api.github.com/repos/filhodanuvem/7languages7weeks").json()'''

'headers = {}\nif github_token:\n    headers[\'Authorization\'] = f"token {github_token}"\nrequests.get("https://api.github.com/repos/filhodanuvem/7languages7weeks").json()'

In [8]:
# Disabled cell: remove the surrounding triple quotes to display the raw GitHub API response
# for one repository, e.g. to see which error message the API returns for it.
# The Authorization header is sent when github_token is set.
'''headers = {}
if github_token:
    headers['Authorization'] = f"token {github_token}"
requests.get("https://api.github.com/repos/miguel9903/Curso-CSS-Ed-2018",headers = headers).json()'''

'headers = {}\nif github_token:\n    headers[\'Authorization\'] = f"token {github_token}"\nrequests.get("https://api.github.com/repos/miguel9903/Curso-CSS-Ed-2018",headers = headers).json()'