In [1]:
from env import github_token, github_username
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from os import path
from requests import get
from sklearn.model_selection import train_test_split
from typing import Dict, List, Optional, Union, cast
from wordcloud import WordCloud


import json
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment
import numpy as np
import os
import pandas as pd

import re
import requests
import seaborn as sns
import time
import unicodedata

In [2]:
large_scraped_data = pd.read_csv('large_scraped_data.csv')

In [3]:
large_scraped_data = large_scraped_data.dropna()

In [4]:
large_scraped_data = large_scraped_data.drop(columns='Unnamed: 0')

In [5]:
large_scraped_data

Unnamed: 0,repo,language,readme_contents
0,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...
1,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...
2,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...
3,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...
4,minecraft-dev/MinecraftDev,Kotlin,"<p align=""center""><a href=""https://minecraftde..."
...,...,...,...
879,Crossroads-Development/Crossroads,Java,# Crossroads\n\nA steampunk technology mod for...
880,GeyserMC/PacketLib,Java,# PacketLib\nPacketLib is a library for packet...
881,Nic4Las/Minecraft-Enderite-Mod,Java,# Enderite Mod\n\n[![](http://cf.way2muchnoise...
882,espertus/blockly-lua,JavaScript,Blockly Lua allows you to program ComputerCraf...


In [6]:
def basic_clean(string):
    '''
    This function accepts a string as an input
    then lowercases everything, normalizes unicode
    characters, and replaces anything that is
    not a letter, number, whitespace, 
    or a single quote.
    '''
    cleaned = string.lower()
    cleaned = unicodedata.normalize('NFKD', cleaned).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    cleaned = re.sub(r"[^a-z\s]", ' ', cleaned)
    
    return cleaned


In [7]:
def tokenize(string):
    '''
    This function takes in a string as an input
    then tokenizes all words in the string.
    '''
    tokenizer = nltk.tokenize.ToktokTokenizer()
    return tokenizer.tokenize(string, return_str=True)


In [8]:
def stem(string):
    '''
    This function takes in a string as an input
    then stems all words in the string.
    '''
    ps = nltk.porter.PorterStemmer()
    stems = [ps.stem(word) for word in string.split()]
    string_stemmed = ' '.join(stems)
    return string_stemmed


In [9]:
def lemmatize(string):
    '''
    This function takes in a string as an input
    then lemmatizes all words in the string.
    '''
    wnl = nltk.stem.WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    string_lemmatized = ' '.join(lemmas)

    return string_lemmatized

In [10]:
def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string as an input
    then removes stopwords. The function has two
    additional parameters that define additional
    stopwords to remove in extra_words as a list,
    and defines stopwords to exclude from removal
    in exlude_words as a list. extra_words and
    exclude_words are empty lists by default.
    '''
    stopword_list = stopwords.words('english')
    stopword_list = set(stopword_list) - set(exclude_words)
    stopword_list = stopword_list.union(set(extra_words))
    
    words = string.split()
    
    filtered_words = [w for w in words if w not in stopword_list]

    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')

    string_without_stopwords = ' '.join(filtered_words)

    return string_without_stopwords


In [11]:
def map_other_languages(df):
    '''
    This function takes in a df with 'languages' column
    containing the coding language of the repo. Any language
    that is not Python, Java, or JavaScript will be marked
    as 'Other'
    '''
    top_languages = ['Python', 'Java', 'JavaScript']
    df.loc[~df['language'].isin(top_languages), 'language'] = 'Other'
    
    return df

In [12]:
extra_words = ['minecraft','abstract','and','arguments','assert','break','byte','case','char','class',
               'const','continue','default','double','else','enum','extends','false','final','finally','float','for',
               'goto','if','implements','import','in','instanceof','int','interface','long','native','new','null',
               'package','pass','private','protected','public','raise','return','short','static','super','switch',
               'synchronized','this','throw','throws','transient','true','try','void','volatile','while','with',
               'yield', 'http', 'com', 'github', 'www', 'server', 'version', 'mod', 'file']

In [13]:
extra_words

['http ',
 'minecraft',
 'abstract',
 'and',
 'arguments',
 'assert',
 'break',
 'byte',
 'case',
 'char',
 'class',
 'const',
 'continue',
 'default',
 'double',
 'else',
 'enum',
 'extends',
 'false',
 'final',
 'finally',
 'float',
 'for',
 'goto',
 'if',
 'implements',
 'import',
 'in',
 'instanceof',
 'int',
 'interface',
 'long',
 'native',
 'new',
 'null',
 'package',
 'pass',
 'private',
 'protected',
 'public',
 'raise',
 'return',
 'short',
 'static',
 'super',
 'switch',
 'synchronized',
 'this',
 'throw',
 'throws',
 'transient',
 'true',
 'try',
 'void',
 'volatile',
 'while',
 'with',
 'yield',
 'http',
 'com',
 'github',
 'www',
 'server',
 'version',
 'mod',
 'file']

In [14]:
def prep_readme_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name for a text column with 
    the option to pass lists for extra_words and exclude_words and
    returns a df with the repo name, original readme text, and cleaned-tokenized- 
    lemmatized readme text with stopwords removed.
    '''
    df = df.dropna()
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords,
                                  extra_words=extra_words,
                                  exclude_words=exclude_words)
    
    
    df['lemmatized'] = df['clean'].apply(lemmatize)
    
    df = map_other_languages(df)
    
    return df

In [15]:
df=prep_readme_data(large_scraped_data, 'readme_contents', extra_words= extra_words, exclude_words=[])

Removed 167 stopwords
---
Removed 5194 stopwords
---
Removed 222 stopwords
---
Removed 639 stopwords
---
Removed 218 stopwords
---
Removed 85 stopwords
---
Removed 16 stopwords
---
Removed 1574 stopwords
---
Removed 266 stopwords
---
Removed 39 stopwords
---
Removed 359 stopwords
---
Removed 300 stopwords
---
Removed 293 stopwords
---
Removed 261 stopwords
---
Removed 216 stopwords
---
Removed 250 stopwords
---
Removed 96 stopwords
---
Removed 83 stopwords
---
Removed 328 stopwords
---
Removed 167 stopwords
---
Removed 11 stopwords
---
Removed 150 stopwords
---
Removed 18 stopwords
---
Removed 240 stopwords
---
Removed 736 stopwords
---
Removed 2286 stopwords
---
Removed 116 stopwords
---
Removed 554 stopwords
---
Removed 747 stopwords
---
Removed 227 stopwords
---
Removed 691 stopwords
---
Removed 142 stopwords
---
Removed 71 stopwords
---
Removed 172 stopwords
---
Removed 272 stopwords
---
Removed 1202 stopwords
---
Removed 143 stopwords
---
Removed 213 stopwords
---
Removed 89 stopw

In [16]:
df.head()

Unnamed: 0,repo,language,readme_contents,clean,lemmatized
0,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...,simple inspired demo written python pyglet you...,simple inspired demo written python pyglet you...
1,itzg/docker-minecraft-server,Other,[![Docker Pulls](https://img.shields.io/docker...,docker pulls https img shields io docker pulls...,docker pull http img shield io docker pull itz...
2,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...,overviewer build status andrew brown contribut...,overviewer build status andrew brown contribut...
3,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...,minecraftbyexample purpose minecraftbyexample ...,minecraftbyexample purpose minecraftbyexample ...
4,minecraft-dev/MinecraftDev,Other,"<p align=""center""><a href=""https://minecraftde...",p align center href https minecraftdev org img...,p align center href http minecraftdev org img ...


In [17]:
df.shape

(884, 5)

In [18]:
df.language.value_counts()

Java          402
Other         320
JavaScript     86
Python         76
Name: language, dtype: int64

In [19]:
def split_minecraft_data(df):
    '''
    This function performs split on minecraft repo data, stratified on language.
    Returns train, validate, and test dfs.
    '''
    train_validate, test = train_test_split(df, test_size=.2, 
                                        random_state=123, 
                                        stratify=df.language)
    train, validate = train_test_split(train_validate, test_size=.3, 
                                   random_state=123, 
                                   stratify=train_validate.language)
    return train, validate, test



In [20]:
train, validate, test = split_minecraft_data(df)

In [21]:
train

Unnamed: 0,repo,language,readme_contents,clean,lemmatized
40,danba340/minecraft-freecodecamp,JavaScript,# Welcome to React three fiber Minecraft\n\nTh...,welcome react three fiber project trying mimic...,welcome react three fiber project trying mimic...
730,BentoBoxWorld/AOneBlock,Java,"# AOneBlock\nA OneBlock Minecraft plugin, writ...",aoneblock oneblock plugin written tastybento c...,aoneblock oneblock plugin written tastybento c...
243,PGMDev/PGM,Java,PGM ![deploy](https://github.com/Electroid/PGM...,pgm deploy https electroid pgm workflows deplo...,pgm deploy http electroid pgm workflow deploy ...
796,SandboxPowered/Sandbox,Java,"<p align=""center"">\n <img src=""https://raw....",p align center img src https raw githubusercon...,p align center img src http raw githubusercont...
232,mraof/Minestuck,Java,Minestuck\n=========\n\n* Discord Server: http...,minestuck discord https discordapp invite tekw...,minestuck discord http discordapp invite tekwv...
...,...,...,...,...,...
128,danba340/minecraft-react,JavaScript,# Welcome to React three fiber Minecraft\n\nTh...,welcome react three fiber project trying mimic...,welcome react three fiber project trying mimic...
51,DonBruce64/MinecraftTransportSimulator,Java,[![image](https://user-images.githubuserconten...,image https user images githubusercontent db b...,image http user image githubusercontent db bee...
314,narumii/MinecraftCrasher,Java,> Java version: AdoptJDK 11 openJ9\n---\n\n###...,java adoptjdk openj crasher handshake crasher ...,java adoptjdk openj crasher handshake crasher ...
563,deoxxa/jsmc,JavaScript,"jsmc\n====\n\nHaha, who would write a Minecraf...",jsmc haha would write javascript anyway overvi...,jsmc haha would write javascript anyway overvi...


In [22]:
lemma_readmes = ' '.join(train['lemmatized'])

In [23]:
lemma_readmes



In [24]:
lemma_freq = pd.Series(lemma_readmes.split()).value_counts()

In [25]:
lemma_freq.head(25)

http       4684
li          988
build       900
&#9;        843
img         842
use         807
io          732
run         706
command     699
project     672
license     643
discord     642
png         625
org         586
block       584
world       578
player      574
issue       569
install     566
code        565
x           528
using       521
src         499
support     495
java        488
dtype: int64

In [26]:
dfpy = train[train.language == 'Python']
py_readmes = ' '.join(dfpy['lemmatized'])

In [27]:
dfjv = train[train.language == 'Java']
java_readmes = ' '.join(dfjv['lemmatized'])

In [28]:
dfjs = train[train.language == 'JavaScript']
js_readmes = ' '.join(dfjs['lemmatized'])

In [29]:
py_words = pd.Series(py_readmes.split()).value_counts()

In [30]:
py_words

http         327
python       183
install      156
py           124
image        112
            ... 
medium         1
pm             1
straight       1
modifying      1
phosphor       1
Length: 2789, dtype: int64

In [31]:
java_words = pd.Series(java_readmes.split()).value_counts()

In [32]:
java_words

http                2042
build                467
discord              380
license              363
project              350
                    ... 
ophl                   1
specially              1
cloudflare             1
redirects              1
misunderstanding       1
Length: 8916, dtype: int64

In [33]:
javascript_words = pd.Series(js_readmes.split()).value_counts()

In [34]:
javascript_words

http          423
&#9;          334
command       165
j             124
backup        106
             ... 
ihre            1
einbauen        1
testserver      1
expansion       1
adn             1
Length: 3794, dtype: int64

In [40]:
all_minecraft_bigrams = (pd.Series(nltk.ngrams(lemma_readmes.split(), 2)).value_counts())

In [41]:
all_minecraft_bigrams

(http, img)                   481
(shield, io)                  469
(img, shield)                 466
(&#9;, &#9;)                  431
(li, li)                      254
                             ... 
(added, user)                   1
(without, setupminecraft)       1
(start, set)                    1
(others, misunderstanding)      1
Length: 103317, dtype: int64

In [42]:
py_bigrams = (pd.Series(nltk.ngrams(py_readmes.split(), 2)).value_counts())

In [43]:
py_bigrams

(fabulously, optimized)    61
(pip, install)             41
(chunk, info)              33
(img, src)                 31
(br, br)                   31
                           ..
(status, gfy)               1
(gfy, n)                    1
(n, sl)                     1
(sl, svg)                   1
(kwak, quarry)              1
Length: 10744, dtype: int64

In [44]:
jv_bigrams = (pd.Series(nltk.ngrams(java_readmes.split(), 2)).value_counts())

In [45]:
jv_bigrams

(http, img)                   227
(shield, io)                  224
(img, shield)                 221
(http, curseforge)            112
(http, discord)               104
                             ... 
(fun, useful)                   1
(creating, fun)                 1
(vehicle, creating)             1
(mod, mrcrayfishs)              1
(others, misunderstanding)      1
Length: 41009, dtype: int64

In [46]:
js_bigrams = (pd.Series(nltk.ngrams(js_readmes.split(), 2)).value_counts())

In [47]:
js_bigrams

(&#9;, &#9;)                225
(http, img)                  49
(img, shield)                43
(shield, io)                 43
(wzh, minecraftweb)          35
                           ... 
(und, video)                  1
(stevertus, de)               1
(de, english)                 1
(english, documentation)      1
(mailto, deoxxa)              1
Length: 13720, dtype: int64

In [50]:
py_trigrams = (pd.Series(nltk.ngrams(py_readmes.split(), 3)).value_counts())

In [51]:
py_trigrams

(http, fabulously, optimized)             24
(http, img, shield)                       22
(img, shield, io)                         22
(span, style, font)                       21
(style, font, weight)                     15
                                          ..
(available, used, description)             1
(used, description, production)            1
(description, production, instruction)     1
(production, instruction, getting)         1
(kwak, quarry, protocol)                   1
Length: 12508, dtype: int64

In [53]:
jv_trigrams = (pd.Series(nltk.ngrams(java_readmes.split(), 3)).value_counts())

In [54]:
jv_trigrams

(img, shield, io)                    221
(http, img, shield)                  220
(http, discord, gg)                   96
(curseforge, mc, mod)                 85
(http, curseforge, mc)                83
                                    ... 
(many, bedrock, edition)               1
(edition, feature, toggleable)         1
(feature, toggleable, fly)             1
(toggleable, fly, drifting)            1
(avoid, others, misunderstanding)      1
Length: 50245, dtype: int64

In [55]:
js_trigrams = (pd.Series(nltk.ngrams(js_readmes.split(), 3)).value_counts())

In [56]:
js_trigrams

(&#9;, &#9;, &#9;)              185
(http, img, shield)              43
(img, shield, io)                43
(http, wzh, minecraftweb)        29
(wzh, minecraftweb, release)     23
                               ... 
(badge, dynamic, json)            1
(dynamic, json, label)            1
(json, label, discord)            1
(label, discord, color)           1
(mailto, deoxxa, fknsrs)          1
Length: 16117, dtype: int64