# Acquire the Data

In [1]:
import pandas as pd
from bs4 import BeautifulSoup


import time
import os
import json
import requests
import unicodedata
import re

import acquire

from env import github_token, github_username

In [2]:
dw = pd.read_csv('dw.csv')
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...


In [3]:
dw.shape

(285, 4)

# Prepare the Data

In [4]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

## First Look at the Data
- Are there any nulls?
    - Decided on how to take care of them

In [5]:
dw.isnull().sum()

Unnamed: 0           0
repo                 0
language            78
readme_contents    141
dtype: int64

In [6]:
# drop all the nulls
dw.dropna(inplace=True)

In [7]:
dw.isnull().sum()

Unnamed: 0         0
repo               0
language           0
readme_contents    0
dtype: int64

In [8]:
def drop_nulls(df):
    '''Takes null values from dataframe
    drops all the nulls'''
    # drop all the nulls
    dw.dropna(inplace=True)
    # return df
    return df

In [9]:
dw.shape

(115, 4)

In [10]:
dw.language.value_counts()

JavaScript          34
HTML                17
Java                15
Python              13
C#                   7
CSS                  7
Jupyter Notebook     4
Ruby                 3
TSQL                 2
TypeScript           2
C                    2
Kotlin               1
Swift                1
Arduino              1
Dart                 1
R                    1
PHP                  1
Go                   1
Vue                  1
Lua                  1
Name: language, dtype: int64

## Make all text lower case

In [11]:
def basic_clean(string):
    '''Takes in string
    makes everything lowercase
    removes incosistent text
    only keeps anything a-z, 0-9, ' and white space'''
    # make everything lowercase
    string = string.lower()
    # removes incosistencies in the text
    string = unicodedata.normalize('NFKD', string)\
    .encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    # set what to keep
    string = re.sub(r"[^a-z0-9'\s]", '', string)
    # return new cleaned string
    return string

In [12]:
dw['cleaned_content'] = dw.readme_contents.apply(basic_clean)
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents,cleaned_content
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...,tardis\n\ntardis is a spigot paper plugin th...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...,\ndoctor who web app reactive core architectu...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...,notes about open source doctor who project\n\n...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...,\ndoctor who reactive core reactive core arch...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...,\ndoctor who mobile app reactive core archite...


In [13]:
def tokenize(string):
    '''Takes in the string provided by basic_clean funciton
    creates a tokenizer
    uses the tokenizerr on the cleaned string'''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    string = tokenizer.tokenize(string, return_str = True)
    # return tokenized string
    return string

In [14]:
dw['tokenized_content'] = dw.cleaned_content.apply(tokenize)
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents,cleaned_content,tokenized_content
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...,tardis\n\ntardis is a spigot paper plugin th...,tardis\n\ntardis is a spigot paper plugin that...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...,\ndoctor who web app reactive core architectu...,doctor who web app reactive core architecture ...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...,notes about open source doctor who project\n\n...,notes about open source doctor who project\n\n...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...,\ndoctor who reactive core reactive core arch...,doctor who reactive core reactive core archite...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...,\ndoctor who mobile app reactive core archite...,doctor who mobile app reactive core architectu...


In [15]:
def stem(string):
    '''In string from the basic_clean and tokenize fucntion
    creaters the porter stemmer
    applies the porter stemmer to every word in the string provided
    joing the list of words back into a string'''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    # Apply the stemmer to each word in our string
    stems = [ps.stem(word) for word in string.split()]
    # Join the list of words into the string
    string_stemmed = ' '.join(stems)
    # return string_stemmed
    return string_stemmed

In [16]:
dw['stemmed_content'] = dw.tokenized_content.apply(stem)
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents,cleaned_content,tokenized_content,stemmed_content
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...,tardis\n\ntardis is a spigot paper plugin th...,tardis\n\ntardis is a spigot paper plugin that...,tardi tardi is a spigot paper plugin that allo...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...,\ndoctor who web app reactive core architectu...,doctor who web app reactive core architecture ...,doctor who web app reactiv core architectur de...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...,notes about open source doctor who project\n\n...,notes about open source doctor who project\n\n...,note about open sourc doctor who project thi s...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...,\ndoctor who reactive core reactive core arch...,doctor who reactive core reactive core archite...,doctor who reactiv core reactiv core architect...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...,\ndoctor who mobile app reactive core archite...,doctor who mobile app reactive core architectu...,doctor who mobil app reactiv core architectur ...


In [17]:
def lemmatize(string):
    '''Takes in string from basic_clean and tokenize funcitons
    creates a lematizer
    uses the lematizer on each word in the string
    merges the list of words back into string format
    and returns the now lematized string'''
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    # Use the lemmatizer on each word using split
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    # Join the list into a string
    string_lemmatized = ' '.join(lemmas)
    # return lemmatized string
    return string_lemmatized

In [18]:
dw['lemma_content'] = dw.tokenized_content.apply(lemmatize)
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents,cleaned_content,tokenized_content,stemmed_content,lemma_content
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...,tardis\n\ntardis is a spigot paper plugin th...,tardis\n\ntardis is a spigot paper plugin that...,tardi tardi is a spigot paper plugin that allo...,tardis tardis is a spigot paper plugin that al...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...,\ndoctor who web app reactive core architectu...,doctor who web app reactive core architecture ...,doctor who web app reactiv core architectur de...,doctor who web app reactive core architecture ...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...,notes about open source doctor who project\n\n...,notes about open source doctor who project\n\n...,note about open sourc doctor who project thi s...,note about open source doctor who project this...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...,\ndoctor who reactive core reactive core arch...,doctor who reactive core reactive core archite...,doctor who reactiv core reactiv core architect...,doctor who reactive core reactive core archite...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...,\ndoctor who mobile app reactive core archite...,doctor who mobile app reactive core architectu...,doctor who mobil app reactiv core architectur ...,doctor who mobile app reactive core architectu...


In [19]:
def remove_stopwords(string, exclude_words=[], extra_words=[]):
    '''takes in string from basic clean and tokenize fucntions
    takes in a list of words to exclude from the stopword list
    take sin a list of words to include in the stopword list
    makes the list of stopwords
    removes words listed from stopword list
    add words listed to stopword list
    remove words from stopword list from the string
    join words back to string format
    return new string'''
    # set stopword list 
    stopword_list = stopwords.words('english')
    # remove exclude_words list from stopword list
    stopword_list = set(stopword_list) - set(exclude_words)
    # add extra_wrods list to stopword list
    stopword_list = stopword_list.union(set(extra_words))
    # remove stopword list words from string
    string = string.split()
    # set filtered words value
    filtered_words = [word for word in string if word not in stopword_list]
    # join words back into string format 
    string = ' '.join(filtered_words)
    # return new string
    return string

In [20]:
document = dw.tokenized_content.iloc[0]
remove_stopwords(document, extra_words=['doctor', 'who', 'tardis', 'rose', 'tennent', 'matt', 'david', 'dalek', "'", 
             'forkshttpsimgshieldsiogithubforksjhabarsinghdocmedsvgstylesociallabelforkhttpsgithubcomjhabarsinghdocmednetwork',
             'hulu', 'timelord', 'weeping', 'angels', 'use'])

'spigot paper plugin allows create lets time travel teleport random locations adds whovian twist typical sethome home commands player create bigger inside time travel random location time travel saved destinations grow rooms take companions time travel collect artron energy power much plugin documentation found httpeccentricdevotiongithubiotardissitemaphtmlhttpeccentricdevotiongithubiotardissitemaphtml jenkins ci builds found httptardisjenkinsduckdnsorg8080jobtardishttptardisjenkinsduckdnsorg8080jobtardis'

In [21]:
extra_words=['doctor', 'who', 'tardis', 'rose', 'tennent', 'matt', 'david', 'dalek', "'", 
             'forkshttpsimgshieldsiogithubforksjhabarsinghdocmedsvgstylesociallabelforkhttpsgithubcomjhabarsinghdocmednetwork',
             'hulu', 'timelord', 'weeping', 'angels', 'use']
dw['no_stopwords_stem'] = dw.tokenized_content.apply(remove_stopwords, extra_words=extra_words)
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents,cleaned_content,tokenized_content,stemmed_content,lemma_content,no_stopwords_stem
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...,tardis\n\ntardis is a spigot paper plugin th...,tardis\n\ntardis is a spigot paper plugin that...,tardi tardi is a spigot paper plugin that allo...,tardis tardis is a spigot paper plugin that al...,spigot paper plugin allows create lets time tr...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...,\ndoctor who web app reactive core architectu...,doctor who web app reactive core architecture ...,doctor who web app reactiv core architectur de...,doctor who web app reactive core architecture ...,web app reactive core architecture demo applic...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...,notes about open source doctor who project\n\n...,notes about open source doctor who project\n\n...,note about open sourc doctor who project thi s...,note about open source doctor who project this...,notes open source project site created open so...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...,\ndoctor who reactive core reactive core arch...,doctor who reactive core reactive core archite...,doctor who reactiv core reactiv core architect...,doctor who reactive core reactive core archite...,reactive core reactive core architecture demo ...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...,\ndoctor who mobile app reactive core archite...,doctor who mobile app reactive core architectu...,doctor who mobil app reactiv core architectur ...,doctor who mobile app reactive core architectu...,mobile app reactive core architecture demo app...


In [22]:
extra_words=['doctor', 'who', 'tardis', 'rose', 'tennent', 'matt', 'david', 'dalek', "'", 
             'forkshttpsimgshieldsiogithubforksjhabarsinghdocmedsvgstylesociallabelforkhttpsgithubcomjhabarsinghdocmednetwork',
             'hulu', 'timelord', 'weeping', 'angels', 'use']
dw['no_stopwords_lemma'] = dw.tokenized_content.apply(remove_stopwords, extra_words=extra_words)
dw.head()

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents,cleaned_content,tokenized_content,stemmed_content,lemma_content,no_stopwords_stem,no_stopwords_lemma
0,0,eccentricdevotion/TARDIS,Java,# TARDIS\n\nTARDIS is a Spigot / Paper plugin ...,tardis\n\ntardis is a spigot paper plugin th...,tardis\n\ntardis is a spigot paper plugin that...,tardi tardi is a spigot paper plugin that allo...,tardis tardis is a spigot paper plugin that al...,spigot paper plugin allows create lets time tr...,spigot paper plugin allows create lets time tr...
1,1,kuralabs/reactive-core-doctor-who-web,JavaScript,==============================================...,\ndoctor who web app reactive core architectu...,doctor who web app reactive core architecture ...,doctor who web app reactiv core architectur de...,doctor who web app reactive core architecture ...,web app reactive core architecture demo applic...,web app reactive core architecture demo applic...
2,2,fwallacephd/doctor-who,CSS,Notes about open source Doctor Who Project:\n\...,notes about open source doctor who project\n\n...,notes about open source doctor who project\n\n...,note about open sourc doctor who project thi s...,note about open source doctor who project this...,notes open source project site created open so...,notes open source project site created open so...
3,3,kuralabs/reactive-core-doctor-who-core,JavaScript,==============================================...,\ndoctor who reactive core reactive core arch...,doctor who reactive core reactive core archite...,doctor who reactiv core reactiv core architect...,doctor who reactive core reactive core archite...,reactive core reactive core architecture demo ...,reactive core reactive core architecture demo ...
4,4,kuralabs/reactive-core-doctor-who-mobile,JavaScript,==============================================...,\ndoctor who mobile app reactive core archite...,doctor who mobile app reactive core architectu...,doctor who mobil app reactiv core architectur ...,doctor who mobile app reactive core architectu...,mobile app reactive core architecture demo app...,mobile app reactive core architecture demo app...


In [23]:
dw['all_clean_stem'] = dw['no_stopwords_stem'] 

In [24]:
dw['all_clean_lemma'] = dw['no_stopwords_lemma'] 

In [25]:
dw.to_csv('clean_dw.csv')