In [32]:
# Load the "autoreload" extension. Prior to executing code, modules are reloaded. 
# There's no need to restart jupyter notebook if you modify code in the `src` directory.
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

from src.data import make_dataset
import pandas as pd
from langdetect import detect
import string
import emoji
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import requests
import boto3
import json
import pickle
import cloudpickle

import nltk
from nltk import word_tokenize 
from nltk.stem import WordNetLemmatizer 
nltk.download('punkt')
nltk.download('wordnet')

pd.set_option('display.max_rows',500)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


[nltk_data] Downloading package punkt to /Users/dlite/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dlite/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
########################################################
### Import Dataset
########################################################

In [9]:
##TODO: how should files be referenced properly?
raw_github_data = pd.read_csv('../data/raw/2020-04-06.tsv', sep='\t', header=0)


In [10]:
########################################################
### Clean Dataset
########################################################

In [11]:
# Filtering down to repos that are likely needing contributors based on past behavior
raw_github_data_filtered = raw_github_data[(raw_github_data['has_merged_prs'] == True) &
    (raw_github_data['has_readme'] == True) &
    (pd.isna(raw_github_data['repo_description']) == False) &
    (pd.isna(raw_github_data['primary_language_name']) == False) &
    (raw_github_data['count_distinct_contributors'] >=2)
]

In [12]:
# Detect language with error handling
def detect_with_error_handle(x):
    try:
        return detect(x)
    except:
        return 'Error'
    
# Check for only latin characters
def has_only_latin_letters(text):
    char_set = string.printable + 'â€”'
    return all((True if x in char_set else False for x in text))

# Remove punctuation
def remove_punctuation(text):
    punctuation_list = string.punctuation + 'â€”'
    return text.translate(str.maketrans('', '', punctuation_list))

In [13]:
## Full set of text processing

# check language, limit to english, and limit repo's with latin characters. Emojis are converted in the process
raw_github_data_filtered['language'] = raw_github_data_filtered['repo_description'].apply(lambda x: 'None' if pd.isna(x) else detect_with_error_handle(str(x)))
raw_github_data_filtered = raw_github_data_filtered[raw_github_data_filtered['language'] == 'en'].copy()
raw_github_data_filtered['is_latin_only_characters'] = raw_github_data_filtered['repo_description'].apply(lambda x: has_only_latin_letters(emoji.demojize(x)))
raw_github_data_filtered = raw_github_data_filtered[raw_github_data_filtered['is_latin_only_characters'] == True].copy()

# clean up repo description, topic, and language, combine into one big bag o' words
raw_github_data_filtered['repo_description_cleaned'] = raw_github_data_filtered['repo_description'].apply(lambda x: remove_punctuation(x))
raw_github_data_filtered['topics'] = raw_github_data_filtered.apply(lambda x: remove_punctuation(str(x['topics']).replace(',','').replace('nan','')), axis=1)
raw_github_data_filtered['topics'].fillna('', inplace=True)
raw_github_data_filtered['description_plus_topics'] = raw_github_data_filtered['repo_description_cleaned']+' '+raw_github_data_filtered['topics']+' '+raw_github_data_filtered['primary_language_name']
raw_github_data_filtered.reset_index(drop=True, inplace=True)

# create repo-lookup object for later use
repo_lookup = raw_github_data_filtered.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [14]:
########################################################
### Tokenize
########################################################

In [15]:
# Create class to be used by tokenizer to lemmatize... which change matches words to their roots
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [16]:
# Create a list of stop words that should be removed before tokenizing
stopwords = list(ENGLISH_STOP_WORDS) + ['covid19','coronavirus','virus','corona','covid','pandemic','sarscov2','outbreak','19','disease','2019','2019ncov','cord19','repository','repo','2020','20','covid2019','covidvirus', 'cases','case']

# Create vectorizor of n-grams using stop words and lemmatizer
word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word',stop_words=stopwords, tokenizer=LemmaTokenizer())

# Fit vectorizer on existing list of repos and create sparse matrix
sparse_vector_matrix = word_vectorizer.fit_transform(raw_github_data_filtered['description_plus_topics'])

  'stop_words.' % sorted(inconsistent))


In [17]:
########################################################
### Export Objects
########################################################

In [33]:
with open('../models/word_vectorizer.pickle', 'wb') as f:
    cloudpickle.dump(word_vectorizer, f)
    
with open('../models/sparse_vector_matrix.pickle', 'wb') as f:
    pickle.dump(sparse_vector_matrix, f)

with open('../models/repo_lookup.pickle', 'wb') as f:
    pickle.dump(repo_lookup, f)

In [19]:
########################################################
### Test invoking the model
########################################################

In [37]:
from src.models.model_wrapper import ModelWrapper
m = ModelWrapper()
m.predict(["python dashboard"])

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,github_repo_url,repo_description,topics,owner_repo_name,owner_name,owner_type,organization_bio,repo_created_day,primary_language_name,license_name,...,count_commits,count_commit_comments,count_created_issues,count_pull_requests_created,count_pull_requests_reviews,count_comments_on_issues_and_pull_requests,language,is_latin_only_characters,repo_description_cleaned,description_plus_topics
40,https://github.com/RamiKrispin/coronavirus,the coronavirus dataset,,RamiKrispin/coronavirus,RamiKrispin,User,,2020-02-11,R,other,...,152,1,37,3,1,153,en,True,the coronavirus dataset,the coronavirus dataset R
1134,https://github.com/CBDRH/covidrecon,r tools for monitoring effectiveness of covid-...,,CBDRH/covidrecon,CBDRH,Organization,the centre for big data research in health is ...,2020-03-20,R,gpl-3.0,...,48,0,12,1,0,5,en,True,r tools for monitoring effectiveness of covid1...,r tools for monitoring effectiveness of covid1...
179,https://github.com/nevrome/covid19germany,"r package - load, visualise and analyse daily ...",coronavirus dataretrieval covid19 germany r,nevrome/covid19germany,nevrome,User,,2020-03-21,R,other,...,162,0,7,18,4,42,en,True,r package load visualise and analyse daily up...,r package load visualise and analyse daily up...
80,https://github.com/JohnCoene/coronavirus,ðŸ¦ novel coronavirus (covid-19) tracker,covid19 rstats ncov ncov2019 r 2019ncov corona...,JohnCoene/coronavirus,JohnCoene,User,,2020-02-02,R,other,...,256,0,16,3,0,59,en,True,ðŸ¦ novel coronavirus covid19 tracker,ðŸ¦ novel coronavirus covid19 tracker covid19 rs...
1313,https://github.com/aangelopoulos/cfr-covid-19,implementation of https://arxiv.org/abs/2003.0...,,aangelopoulos/cfr-covid-19,aangelopoulos,User,,2020-03-25,R,,...,8,0,0,1,0,0,en,True,implementation of httpsarxivorgabs200308592,implementation of httpsarxivorgabs200308592 R
459,https://github.com/RamiKrispin/covid19Italy,italy covid19 data,,RamiKrispin/covid19Italy,RamiKrispin,User,,2020-03-17,R,other,...,117,0,3,0,0,1,en,True,italy covid19 data,italy covid19 data R
254,https://github.com/Lrakotoson/Covid-19,ðŸ¦ dashboard to follow in real time the covid-1...,covid19 dashboard coronavirus r dashboardappli...,Lrakotoson/Covid-19,Lrakotoson,User,,2020-02-25,R,mit,...,88,0,8,5,2,30,en,True,ðŸ¦ dashboard to follow in real time the covid19...,ðŸ¦ dashboard to follow in real time the covid19...
119,https://github.com/swsoyee/2019-ncov-japan,ðŸ¦ interactive dashboard of covid-19 cases in j...,2019ncov covid19 shinyapps interactivevisualiz...,swsoyee/2019-ncov-japan,swsoyee,User,,2020-01-30,R,mit,...,1490,3,22,37,5,46,en,True,ðŸ¦ interactive dashboard of covid19 cases in japan,ðŸ¦ interactive dashboard of covid19 cases in ja...
225,https://github.com/cdcepi/COVID-19-ILI-forecas...,covid-19 ili forecasting for the u.s.,,cdcepi/COVID-19-ILI-forecasting,cdcepi,Organization,,2020-03-11,R,,...,46,0,2,6,0,3,en,True,covid19 ili forecasting for the us,covid19 ili forecasting for the us R
232,https://github.com/hamilton-institute/covid19i...,a visualisation tool of covid-19 for ireland,,hamilton-institute/covid19ireland,hamilton-institute,Organization,the hamilton institute (mu) github organization,2020-03-21,R,,...,230,0,0,1,0,0,en,True,a visualisation tool of covid19 for ireland,a visualisation tool of covid19 for ireland R
