In [1]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
import wrangle

credentials loaded successfully
Wrangle module loaded successfully.


In [3]:
df_raw = pd.read_json('repos.json')
df_raw.head()

Unnamed: 0,language,repo,content
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...


In [4]:
df_raw.shape

(599, 3)

In [5]:
df_raw.drop_duplicates(subset = 'repo', ignore_index = True, inplace = True)

In [6]:
df_raw.shape

(581, 3)

In [7]:
language_labels = pd.concat([df_raw.language.value_counts(), # get total counts of ham vs spam
                    df_raw.language.value_counts(normalize=True)], axis=1) # getting the prop of ham vs. spam

language_labels.columns = ['n', 'percent']
language_labels

Unnamed: 0,n,percent
JavaScript,300,0.516351
Python,281,0.483649


In [8]:
import prepare

Prepare module loaded.


In [37]:
add_stopwords = ['file', 'import', 'use', 'return', 'also', 'code', 'using', 'see', 'install', 'default']

In [10]:
df = prepare.prep_repo_data(df_raw, 'content', extra_words = add_stopwords)

In [11]:
df.head()

Unnamed: 0,language,repo,content,stemmed,lemmatized,clean
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...,d3 datadriven document d3 or d3j is a javascri...,d3 datadriven document d3 or d3js is a javascr...,d3 datadriven document d3 d3js javascript libr...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...,axio promis base http client for the browser a...,axios promise based http client for the browse...,axios promise based http client browser nodejs...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...,creat react app creat react app with no build ...,create react app create react apps with no bui...,create react app create react apps build confi...


In [12]:
# create variables to hold words that appear in each type of repo as a single string
js_words = ' '.join(df[df.language=='JavaScript'].clean)
py_words = ' '.join(df[df.language=='Python'].clean)
all_words = ' '.join(df.clean)

js_words = re.sub(r'\s.\s', '', js_words)
py_words = re.sub(r'\s.\s', '', py_words)
all_words = re.sub(r'\s.\s', '', all_words)

# how frequently each word appears
js_freq = pd.Series(js_words.split()).value_counts()
py_freq = pd.Series(py_words.split()).value_counts()
all_words_freq = pd.Series(all_words.split()).value_counts()

# Now creating the df that has all the word counts:

word_counts = (pd.concat([py_freq, js_freq], axis=1, sort=True)
               .set_axis(['python', 'js'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

word_counts['all'] = word_counts['python'] + word_counts['js']
word_counts.head()

Unnamed: 0,python,js,all
&#9;,222,811,1033
&#9;&#9;,29,0,29
&#9;&#9;&#9;&#9;,28,1,29
&#9;&#9;&#9;&#9;&#9;&#9;&#9;&#9;2,0,1,1
&#9;&#9;&#9;&#9;&#9;&#9;consolelogxlsxutilssheet_to_csvwsfsrs,0,1,1


In [13]:
df = wrangle.prep_repo_data(df, 'content', extra_words = ['use', 'item', 'file', 'return'])
df.shape

(581, 5)

In [14]:
# add a column that is a list of each word for each repo --> PREPARE.PY
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.clean] 

# column name will be words, and the column will contain lists of the words in each doc
df = pd.concat([df, pd.DataFrame({'words': words})], axis=1)
df.head()

Unnamed: 0,language,content,stemmed,lemmatized,clean,words
0,JavaScript,React · \nReact is a JavaScript library for...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...,"[react, react, javascript, library, building, ..."
1,JavaScript,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...,d3 datadriven document d3 or d3j is a javascri...,d3 datadriven document d3 or d3js is a javascr...,d3 datadriven document d3 d3js javascript libr...,"[d3, datadriven, document, d3, d3js, javascrip..."
2,JavaScript,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...,"[supporting, vuejs, vuejs, mitlicensed, open, ..."
3,JavaScript,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...,axio promis base http client for the browser a...,axios promise based http client for the browse...,axios promise based http client browser nodejs...,"[axios, promise, based, http, client, browser,..."
4,JavaScript,Create React App \n\nCreate React apps with n...,creat react app creat react app with no build ...,create react app create react apps with no bui...,create react app create react apps build confi...,"[create, react, app, create, react, apps, buil..."


In [15]:
df.isnull().sum()

language      0
content       0
stemmed       0
lemmatized    0
clean         0
words         0
dtype: int64

### Splitting the data

In [16]:
# X_train, y_train, X_validate, y_validate, X_test, y_test, train_explore, df_explore = wrangle.train_validate_test(df, 'language')

In [17]:
# X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape

In [18]:
from sklearn.model_selection import train_test_split

train_validate, test = train_test_split(df[['language', 'clean']], 
                                        stratify=df.language, 
                                        test_size=.2)

train, validate = train_test_split(train_validate, 
                                   stratify=train_validate.language, 
                                   test_size=.25)

In [19]:
print(train.language.value_counts())
print(validate.language.value_counts())
print(test.language.value_counts())
train.head()

JavaScript    180
Python        168
Name: language, dtype: int64
JavaScript    60
Python        56
Name: language, dtype: int64
JavaScript    60
Python        57
Name: language, dtype: int64


Unnamed: 0,language,clean
555,Python,table contentsdata bin download 20162 github 7...
434,JavaScript,mustachejs logicless mustache template javascr...
67,JavaScript,fullpagejs english espaol franais p &#9; avail...
109,JavaScript,uppy uppy sleek modular javascript uploader in...
31,JavaScript,gatsby v2 fast every way matter gatsby free op...


## Modeling

#### Establishing a Baseline

In [21]:
train.language.value_counts()

JavaScript    180
Python        168
Name: language, dtype: int64

In [22]:
# Taking the most common result (JavaScript) and making that the baseline prediction.

baseline = round(len(train[train.language == 'JavaScript']) / len(train), 2)

In [23]:
#This is the % accuracy our model would be at if guessing a Readme was JavaScript every single time:

print(f"The baseline model's accuracy is: {baseline:.0%}")


The baseline model's accuracy is: 52%


### Bag of Words X_features

In [24]:
from sklearn.feature_extraction import text 

stop_words_eng = text.ENGLISH_STOP_WORDS
type(stop_words_eng)

frozenset

In [41]:
add_stopwords = ['file', 'import', 'use', 'return', 'also', 'code', 'using', 'see', 'install', 'default', '10', '100', '1000', '11', '12', '15', '20', '27', '30', '35', '36', '40', 'able', 'access', 'account', 'action', 'active', 'actually', 'add', 'avoid', 'awesome']

In [42]:
my_stop_words = text.ENGLISH_STOP_WORDS.union(add_stopwords)

In [92]:
# Create CountVectorizer, which create bag-of-words model.
# stop_words : Specify language to remove stopwords. 
# min_df: ignore terms that have a document frequency strictly 
# lower than the given threshold. This value is also called cut-off in the literature. 
# If float, the parameter represents a proportion of documents, integer absolute counts. 
# ngram_range: the lower and upper boundary of the range of n-values for 
# different word n-grams or char n-grams to be extracted. 

vectorizer = CountVectorizer(stop_words=my_stop_words, 
                             min_df=10, 
                             ngram_range=(1,2), 
                             binary=True)

# Learn vocabulary in sentences. 
vectorizer.fit(train.clean)

# Get dictionary. 
vectorizer.get_feature_names()

['01',
 '05',
 '101',
 '120',
 '123',
 '13',
 '14',
 '16',
 '17',
 '18',
 '19',
 '200',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '21',
 '22',
 '23',
 '24',
 '25',
 '256',
 '26',
 '28',
 '2x',
 '300',
 '31',
 '32',
 '33',
 '34',
 '37',
 '38',
 '3d',
 '3x',
 '41',
 '42',
 '43',
 '45',
 '50',
 '500',
 '5000',
 '60',
 '64',
 '64bit',
 '80',
 '90',
 'ab',
 'ability',
 'absolute',
 'abstract',
 'abstraction',
 'accept',
 'accepts',
 'accessed',
 'accessible',
 'accessing',
 'according',
 'accuracy',
 'achieve',
 'achieved',
 'acknowledgement',
 'act',
 'activate',
 'actively',
 'activity',
 'actual',
 'ad',
 'adapter',
 'added',
 'adding',
 'addition',
 'additional',
 'additionally',
 'addons',
 'address',
 'adjust',
 'admin',
 'advanced',
 'advantage',
 'affect',
 'age',
 'agent',
 'agree',
 'ai',
 'aim',
 'ajax',
 'aka',
 'alert',
 'alexander',
 'algorithm',
 'alias',
 'allow',
 'allowed',
 'allowing',
 'allows',
 'alternative',
 'alternatively',
 'amazing',

In [93]:
# Transform each sentences in vector space

X_train_bow = vectorizer.transform(train.clean)
X_validate_bow = vectorizer.transform(validate.clean)
X_test_bow = vectorizer.transform(test.clean)

In [94]:
# Transform each sentences in vector space.
bow = vectorizer.transform(train.clean)

# this is just to see the array of 0's and 1's
X_train_bow_array = X_train_bow.toarray()
X_validate_bow_array = X_validate_bow.toarray()
X_test_bow_array = X_test_bow.toarray()
X_train_bow_array[0]

array([0, 0, 0, ..., 0, 0, 0])

In [95]:
# Creating the Y datasets:

y_train = train['language']
y_validate = validate['language']
y_test = test['language']

In [96]:
# Create the LogisticRegressioin object and fit on train
lm = LogisticRegression().fit(X_train_bow, y_train)

# Predict the labels for the train
y_pred = lm.predict(X_train_bow)

In [97]:
# Compute the confusion matrix
confusion_matrix(train.language, y_pred)

array([[180,   0],
       [  0, 168]])

In [98]:
# Compute the performance metrics
print(classification_report(train.language, y_pred))

              precision    recall  f1-score   support

  JavaScript       1.00      1.00      1.00       180
      Python       1.00      1.00      1.00       168

    accuracy                           1.00       348
   macro avg       1.00      1.00      1.00       348
weighted avg       1.00      1.00      1.00       348



In [128]:
# Validate Bag of Words

y_pred_bow = lm.predict(X_validate_bow)

# Compute the performance metrics for bow features. 
print(classification_report(validate.language, y_pred_bow))

              precision    recall  f1-score   support

  JavaScript       1.00      0.90      0.95        60
      Python       0.90      1.00      0.95        56

    accuracy                           0.95       116
   macro avg       0.95      0.95      0.95       116
weighted avg       0.95      0.95      0.95       116



In [129]:
# Testing BoWs

y_pred_bow = lm.predict(X_test_bow)

# Compute the performance metrics for bow features. 
print(classification_report(test.language, y_pred_bow))

              precision    recall  f1-score   support

  JavaScript       0.92      0.90      0.91        60
      Python       0.90      0.91      0.90        57

    accuracy                           0.91       117
   macro avg       0.91      0.91      0.91       117
weighted avg       0.91      0.91      0.91       117



### TD-IDF

In [113]:
# Transform the clean text into sparse matrix

tfidf = TfidfVectorizer(stop_words= my_stop_words, min_df=8, 
                        ngram_range=(1,2), 
                        binary=True)

# Fit on cleaned text in train
tfidf = tfidf.fit(train.clean)

# Get vocabularies.
tfidf.vocabulary_

{'table': 2700,
 'bin': 296,
 'download': 873,
 'github': 1203,
 '200': 16,
 'pdf': 1988,
 'jpg': 1481,
 '13': 8,
 'raw': 2200,
 'pull': 2158,
 'request': 2315,
 'yaml': 3025,
 'data': 698,
 'person': 2007,
 'company': 504,
 'family': 1062,
 'linux': 1595,
 'window': 2995,
 'issue': 1464,
 'python': 2168,
 'dot': 871,
 'python3': 2177,
 '26': 32,
 '34': 43,
 'wiki': 2989,
 'pypi': 2165,
 'star': 2602,
 'fork': 1136,
 'pull request': 2159,
 'linux window': 1598,
 'python python': 2174,
 'template': 2719,
 'javascript': 1473,
 'logic': 1625,
 'implementation': 1352,
 'syntax': 2697,
 'used': 2866,
 'html': 1318,
 'config': 547,
 'source': 2557,
 'work': 3002,
 'tag': 2702,
 'value': 2894,
 'provided': 2144,
 'hash': 1266,
 'object': 1873,
 'statement': 2611,
 'loop': 1636,
 'instead': 1420,
 'replaced': 2293,
 'series': 2468,
 'overview': 1943,
 'render': 2287,
 'includes': 1369,
 'web': 2956,
 'browser': 337,
 'serverside': 2472,
 'environment': 965,
 'nodejs': 1838,
 'view': 2919,
 'sh

In [114]:
# Transform the train and validate

X_train_tfidf = tfidf.transform(train.clean)
X_validate_tfidf = tfidf.transform(validate.clean)
X_test_tfidf = tfidf.transform(test.clean)

# Take a peek at 
X_train_tfidf

<348x3042 sparse matrix of type '<class 'numpy.float64'>'
	with 74643 stored elements in Compressed Sparse Row format>

In [115]:

pd.DataFrame(X_train_tfidf.todense(), columns=tfidf.get_feature_names()).head()

Unnamed: 0,01,05,06,10000,101,1024,120,123,13,14,...,youd like,youll,youll need,youre,youre looking,youtube,youve,zero,zhang,zip
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19865,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.056341,0.046304,0.057871,0.0,0.0,0.0,0.0,0.062086,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.050808,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.052383,0.043052,0.053806,0.03739,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.050307,0.0,0.0,0.070481,0.077668,0.0,0.0


In [116]:
X_train_tfidfarray = X_train_tfidf.toarray()
X_validate_tfidfarray = X_validate_tfidf.toarray()
X_test_tfidfarray = X_test_tfidf.toarray()

In [117]:
y_train = train['language']
y_validate = validate['language']
y_test = test['language']

In [118]:
# Fit the LogisticRegression on TFIDF features
lm_tfidf = LogisticRegression().fit(X_train_tfidf, y_train)

# Predic the labels
y_pred = lm_tfidf.predict(X_train_tfidf)

In [119]:
# Compute the confusion matrix
confusion_matrix(train.language, y_pred)

array([[179,   1],
       [  3, 165]])

In [120]:
# Compute the performance metrics
print(classification_report(train.language, y_pred))

              precision    recall  f1-score   support

  JavaScript       0.98      0.99      0.99       180
      Python       0.99      0.98      0.99       168

    accuracy                           0.99       348
   macro avg       0.99      0.99      0.99       348
weighted avg       0.99      0.99      0.99       348



### Validate

In [121]:
y_pred_tfidf = lm_tfidf.predict(X_validate_tfidf)

In [123]:
# Compute the performance metrics for tfidf features.
print(classification_report(validate.language, y_pred_tfidf))

              precision    recall  f1-score   support

  JavaScript       0.95      0.93      0.94        60
      Python       0.93      0.95      0.94        56

    accuracy                           0.94       116
   macro avg       0.94      0.94      0.94       116
weighted avg       0.94      0.94      0.94       116



### Test

In [130]:
y_pred_tfidf = lm_tfidf.predict(X_test_tfidf)

In [126]:
# Compute the performance metrics for tfidf features.
print(classification_report(test.language, y_pred_tfidf))

              precision    recall  f1-score   support

  JavaScript       0.94      0.98      0.96        60
      Python       0.98      0.93      0.95        57

    accuracy                           0.96       117
   macro avg       0.96      0.96      0.96       117
weighted avg       0.96      0.96      0.96       117

