In [1]:

from env import github_token, github_username
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from os import path
from requests import get
from sklearn.model_selection import train_test_split
from typing import Dict, List, Optional, Union, cast
from wordcloud import WordCloud


import json
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment
import numpy as np
import os
import pandas as pd

import re
import requests
import seaborn as sns
import time
import unicodedata

In [2]:
# Load the scraped repository data from a CSV file; the relative path is
# resolved against the notebook's current working directory.
large_scraped_data = pd.read_csv('large_scraped_data.csv')

In [3]:
# Discard every row that contains at least one missing value (in any column).
large_scraped_data = large_scraped_data.dropna()

In [4]:
# Remove the 'Unnamed: 0' column (presumably the index that was saved when the
# CSV was written -- confirm against the scraping step).
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op instead of raising KeyError.
large_scraped_data = large_scraped_data.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Show the frame as the cell output to check the result of the cleanup above.
large_scraped_data

Unnamed: 0,repo,language,readme_contents
0,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...
1,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...
2,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...
3,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...
4,minecraft-dev/MinecraftDev,Kotlin,"<p align=""center""><a href=""https://minecraftde..."
...,...,...,...
879,Crossroads-Development/Crossroads,Java,# Crossroads\n\nA steampunk technology mod for...
880,GeyserMC/PacketLib,Java,# PacketLib\nPacketLib is a library for packet...
881,Nic4Las/Minecraft-Enderite-Mod,Java,# Enderite Mod\n\n[![](http://cf.way2muchnoise...
882,espertus/blockly-lua,JavaScript,Blockly Lua allows you to program ComputerCraf...


In [6]:
def basic_clean(string):
    '''
    Normalise raw text for later tokenisation.

    The input string is lowercased, its unicode characters are decomposed
    (NFKD) and anything that cannot be represented in ASCII is dropped, and
    finally every character that is not a lowercase letter, a digit,
    whitespace or a single quote is removed.

    Returns the cleaned string.
    '''
    lowered = string.lower()

    # NFKD splits accented characters into base letter + combining mark, so
    # the ASCII round-trip below keeps the base letter and drops the mark.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')

    return re.sub(r"[^a-z0-9'\s]", '', ascii_text)


In [7]:
def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    A fresh tokenizer is created on every call. return_str=True asks the
    tokenizer to hand back its result as a string rather than a list of
    tokens, so the return value can be fed straight into the next
    string-based cleaning step.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)


In [8]:
def stem(string):
    '''
    Stem every whitespace-separated word of a string.

    Each word is passed through NLTK's Porter stemmer and the resulting
    stems are returned joined by single spaces.
    '''
    porter = nltk.porter.PorterStemmer()
    return ' '.join(porter.stem(token) for token in string.split())


In [9]:
def lemmatize(string):
    '''
    Lemmatize every whitespace-separated word of a string.

    Each word is passed through NLTK's WordNetLemmatizer and the resulting
    lemmas are returned joined by single spaces.
    '''
    lemma_of = nltk.stem.WordNetLemmatizer().lemmatize
    return ' '.join(map(lemma_of, string.split()))

In [10]:
def remove_stopwords(string, extra_words=None, exclude_words=None, verbose=True):
    '''
    Remove stopwords from a string.

    The string is split on whitespace and every word found in the stopword
    set is dropped; the remaining words are returned joined by single spaces.
    The stopword set is NLTK's English stopword list, minus exclude_words,
    plus extra_words.

    Parameters
    ----------
    string : str
        Text to filter. Matching is exact, so the text should already be
        lowercased/cleaned the same way as the stopwords.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords. None (the default) adds
        nothing.
    exclude_words : iterable of str, optional
        Words to keep even though they are in the English stopword list.
        None (the default) excludes nothing.
    verbose : bool, default True
        When True, print how many words were removed followed by a '---'
        separator. Pass False when applying this function to many rows to
        avoid flooding the notebook output.

    Returns
    -------
    str
        The input text without stopwords.
    '''
    # None defaults replace the shared mutable [] defaults and also make an
    # explicit None from a caller behave like "no words given".
    extra = set() if extra_words is None else set(extra_words)
    keep = set() if exclude_words is None else set(exclude_words)

    # Exclusions are applied before additions, so a word listed in both
    # extra_words and exclude_words is still removed.
    stopword_set = (set(stopwords.words('english')) - keep) | extra

    words = string.split()
    filtered_words = [w for w in words if w not in stopword_set]

    if verbose:
        print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
        print('---')

    return ' '.join(filtered_words)


In [11]:
# Additional stopwords handed to prep_readme_data: the domain term
# 'minecraft' followed by an alphabetical list of words that look like
# programming-language keywords.
extra_words = [
    'minecraft',
    'abstract', 'and', 'arguments', 'assert',
    'break', 'byte',
    'case', 'char', 'class', 'const', 'continue',
    'default', 'double',
    'else', 'enum', 'extends',
    'false', 'final', 'finally', 'float', 'for',
    'goto',
    'if', 'implements', 'import', 'in', 'instanceof', 'int', 'interface',
    'long',
    'native', 'new', 'null',
    'package', 'pass', 'private', 'protected', 'public',
    'raise', 'return',
    'short', 'static', 'super', 'switch', 'synchronized',
    'this', 'throw', 'throws', 'transient', 'true', 'try',
    'void', 'volatile',
    'while', 'with',
    'yield',
]

In [12]:
# Echo the list as the cell output to double-check its contents.
extra_words

['minecraft',
 'abstract',
 'and',
 'arguments',
 'assert',
 'break',
 'byte',
 'case',
 'char',
 'class',
 'const',
 'continue',
 'default',
 'double',
 'else',
 'enum',
 'extends',
 'false',
 'final',
 'finally',
 'float',
 'for',
 'goto',
 'if',
 'implements',
 'import',
 'in',
 'instanceof',
 'int',
 'interface',
 'long',
 'native',
 'new',
 'null',
 'package',
 'pass',
 'private',
 'protected',
 'public',
 'raise',
 'return',
 'short',
 'static',
 'super',
 'switch',
 'synchronized',
 'this',
 'throw',
 'throws',
 'transient',
 'true',
 'try',
 'void',
 'volatile',
 'while',
 'with',
 'yield']

In [13]:
def prep_readme_data(df, column, extra_words=None, exclude_words=None):
    '''
    Add cleaned and lemmatized versions of a text column to a DataFrame.

    Rows containing a missing value in any column are dropped first. The
    text in `column` is then passed row by row through basic_clean,
    tokenize and remove_stopwords to build the 'clean' column, and 'clean'
    is passed through lemmatize to build the 'lemmatized' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    column : str
        Name of the text column to process.
    extra_words : iterable of str, optional
        Additional stopwords forwarded to remove_stopwords. None (the
        default) means no extra words.
    exclude_words : iterable of str, optional
        Stopwords to keep, forwarded to remove_stopwords. None (the
        default) means no exclusions.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of the non-null rows of `df` plus
        the 'clean' and 'lemmatized' columns.
    '''
    # Normalise the optional word lists here so remove_stopwords always
    # receives real lists, and no mutable default is shared between calls.
    extra = [] if extra_words is None else list(extra_words)
    exclude = [] if exclude_words is None else list(exclude_words)

    complete_rows = df.dropna()

    clean_text = (
        complete_rows[column]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords, extra_words=extra, exclude_words=exclude)
    )

    # assign() returns a new frame instead of writing columns into the
    # result of dropna(), which avoids pandas' chained-assignment warning.
    return complete_rows.assign(
        clean=clean_text,
        lemmatized=clean_text.apply(lemmatize),
    )

In [14]:
# Build the prepared frame: clean + lemmatize the README text, treating the
# words in extra_words as additional stopwords and excluding none.
df = prep_readme_data(
    large_scraped_data,
    'readme_contents',
    extra_words=extra_words,
    exclude_words=[],
)

Removed 144 stopwords
---
Removed 4297 stopwords
---
Removed 201 stopwords
---
Removed 484 stopwords
---
Removed 146 stopwords
---
Removed 73 stopwords
---
Removed 8 stopwords
---
Removed 1313 stopwords
---
Removed 193 stopwords
---
Removed 33 stopwords
---
Removed 281 stopwords
---
Removed 214 stopwords
---
Removed 237 stopwords
---
Removed 239 stopwords
---
Removed 173 stopwords
---
Removed 218 stopwords
---
Removed 88 stopwords
---
Removed 65 stopwords
---
Removed 188 stopwords
---
Removed 148 stopwords
---
Removed 9 stopwords
---
Removed 129 stopwords
---
Removed 8 stopwords
---
Removed 186 stopwords
---
Removed 605 stopwords
---
Removed 2073 stopwords
---
Removed 74 stopwords
---
Removed 491 stopwords
---
Removed 583 stopwords
---
Removed 198 stopwords
---
Removed 628 stopwords
---
Removed 129 stopwords
---
Removed 69 stopwords
---
Removed 116 stopwords
---
Removed 237 stopwords
---
Removed 1108 stopwords
---
Removed 126 stopwords
---
Removed 207 stopwords
---
Removed 44 stopwords

In [15]:
# Preview the first rows to check the new 'clean' and 'lemmatized' columns.
df.head()

Unnamed: 0,repo,language,readme_contents,clean,lemmatized
0,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...,simple minecraftinspired demo written python p...,simple minecraftinspired demo written python p...
1,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...,docker pullshttpsimgshieldsiodockerpullsitzgmi...,docker pullshttpsimgshieldsiodockerpullsitzgmi...
2,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...,overviewer build status andrew brown contribut...,overviewer build status andrew brown contribut...
3,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...,minecraftbyexample 1164 purpose minecraftbyexa...,minecraftbyexample 1164 purpose minecraftbyexa...
4,minecraft-dev/MinecraftDev,Kotlin,"<p align=""center""><a href=""https://minecraftde...",p aligncentera hrefhttpsminecraftdevorgimg src...,p aligncentera hrefhttpsminecraftdevorgimg src...


In [16]:
# (rows, columns) of the prepared frame.
df.shape

(884, 5)