In [1]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
import wrangle

credentials loaded successfully
Wrangle module loaded successfully.


In [3]:
df_raw = pd.read_json('repos.json')
df_raw.head()

Unnamed: 0,language,repo,content
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...


In [4]:
df_raw.shape

(599, 3)

In [5]:
df_raw.drop_duplicates(subset = 'repo', ignore_index = True, inplace = True)

In [6]:
df_raw.shape

(581, 3)

In [7]:
language_labels = pd.concat([df_raw.language.value_counts(), # get total counts of ham vs spam
                    df_raw.language.value_counts(normalize=True)], axis=1) # getting the prop of ham vs. spam

language_labels.columns = ['n', 'percent']
language_labels

Unnamed: 0,n,percent
JavaScript,300,0.516351
Python,281,0.483649


In [8]:
import prepare

Prepare module loaded.


In [9]:
add_stopwords = ['file', 'import', 'use', 'return', 'also', 'code', 'using', 'see', 'install', 'default']

In [10]:
df = prepare.prep_repo_data(df_raw, 'content', extra_words = add_stopwords)

In [11]:
df.head()

Unnamed: 0,language,repo,content,stemmed,lemmatized,clean
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...,d3 datadriven document d3 or d3j is a javascri...,d3 datadriven document d3 or d3js is a javascr...,d3 datadriven document d3 d3js javascript libr...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...,axio promis base http client for the browser a...,axios promise based http client for the browse...,axios promise based http client browser nodejs...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...,creat react app creat react app with no build ...,create react app create react apps with no bui...,create react app create react apps build confi...


In [12]:
# create variables to hold words that appear in each type of repo as a single string
js_words = ' '.join(df[df.language=='JavaScript'].clean)
py_words = ' '.join(df[df.language=='Python'].clean)
all_words = ' '.join(df.clean)

js_words = re.sub(r'\s.\s', '', js_words)
py_words = re.sub(r'\s.\s', '', py_words)
all_words = re.sub(r'\s.\s', '', all_words)

# how frequently each word appears
js_freq = pd.Series(js_words.split()).value_counts()
py_freq = pd.Series(py_words.split()).value_counts()
all_words_freq = pd.Series(all_words.split()).value_counts()

# Now creating the df that has all the word counts:

word_counts = (pd.concat([py_freq, js_freq], axis=1, sort=True)
               .set_axis(['python', 'js'], axis=1, inplace=False)
               .fillna(0)
               .apply(lambda s: s.astype(int))
              )

word_counts['all'] = word_counts['python'] + word_counts['js']
word_counts.head()

Unnamed: 0,python,js,all
&#9;,222,811,1033
&#9;&#9;,29,0,29
&#9;&#9;&#9;&#9;,28,1,29
&#9;&#9;&#9;&#9;&#9;&#9;&#9;&#9;2,0,1,1
&#9;&#9;&#9;&#9;&#9;&#9;consolelogxlsxutilssheet_to_csvwsfsrs,0,1,1


In [13]:
df = wrangle.prep_repo_data(df, 'content', extra_words = ['use', 'item', 'file', 'return'])
df.shape

(581, 5)

In [14]:
# add a column that is a list of each word for each repo --> PREPARE.PY
words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.clean] 

# column name will be words, and the column will contain lists of the words in each doc
df = pd.concat([df, pd.DataFrame({'words': words})], axis=1)
df.head()

Unnamed: 0,language,content,stemmed,lemmatized,clean,words
0,JavaScript,React · \nReact is a JavaScript library for...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...,"[react, react, javascript, library, building, ..."
1,JavaScript,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...,d3 datadriven document d3 or d3j is a javascri...,d3 datadriven document d3 or d3js is a javascr...,d3 datadriven document d3 d3js javascript libr...,"[d3, datadriven, document, d3, d3js, javascrip..."
2,JavaScript,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...,"[supporting, vuejs, vuejs, mitlicensed, open, ..."
3,JavaScript,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...,axio promis base http client for the browser a...,axios promise based http client for the browse...,axios promise based http client browser nodejs...,"[axios, promise, based, http, client, browser,..."
4,JavaScript,Create React App \n\nCreate React apps with n...,creat react app creat react app with no build ...,create react app create react apps with no bui...,create react app create react apps build confi...,"[create, react, app, create, react, apps, buil..."


In [15]:
df.isnull().sum()

language      0
content       0
stemmed       0
lemmatized    0
clean         0
words         0
dtype: int64

### Splitting the data

In [None]:
# X_train, y_train, X_validate, y_validate, X_test, y_test, train_explore, df_explore = wrangle.train_validate_test(df, 'language')

In [None]:
# X_train.shape, y_train.shape, X_validate.shape, y_validate.shape, X_test.shape, y_test.shape

In [16]:
from sklearn.model_selection import train_test_split

train_validate, test = train_test_split(df[['language', 'clean']], 
                                        stratify=df.language, 
                                        test_size=.2)

train, validate = train_test_split(train_validate, 
                                   stratify=train_validate.language, 
                                   test_size=.25)

In [17]:
print(train.language.value_counts())
print(validate.language.value_counts())
print(test.language.value_counts())
train.head()

JavaScript    180
Python        168
Name: language, dtype: int64
JavaScript    60
Python        56
Name: language, dtype: int64
JavaScript    60
Python        57
Name: language, dtype: int64


Unnamed: 0,language,clean
558,Python,english doc window 10x64 python 364 opencv 341...
563,Python,tweepy twitter python installation easiest way...
393,JavaScript,w nodejs websocket library w simple blazing fa...
123,JavaScript,nodemon nodemon tool help develop nodejs based...
151,Python,flask flask lightweight wsgi web application f...


## Modeling

#### Establishing a Baseline

In [19]:
train.language.value_counts()

JavaScript    180
Python        168
Name: language, dtype: int64

In [23]:
baseline_count = pd.concat([train.language.value_counts(), train.language.value_counts(normalize = True)], axis = 1)
baseline_count.columns = ['count', 'percent']
baseline_count

Unnamed: 0,count,percent
JavaScript,180,0.517241
Python,168,0.482759


In [24]:
# Taking the most common result (JavaScript) and making that the baseline prediction.
#This is the % accuracy our model would be at if guessing a Readme was JavaScript every single time:
baseline = round(len(train[train.language == 'JavaScript']) / len(train), 2)

print(f"The baseline model's accuracy is: {baseline:.0%}")


The baseline model's accuracy is: 52%


It appears that JavaScript is the most often occuring result of the two languages represented, thus we will take as our baseline assuming that all README's are in JavaScript, which would mean our baseline model is accurately approximately 52% of the time.

### Logistic Regression using Bag of Words to identify features

Doing some last minute cleanup and adding to the stopwords list for Bag of Words and TF-IDF feature selection:

In [25]:
from sklearn.feature_extraction import text 

stop_words_eng = text.ENGLISH_STOP_WORDS
type(stop_words_eng)

frozenset

In [26]:
add_stopwords = ['file', 'import', 'use', 'return', 'also', 'code', 'using', 'see', 'install', 'default', '10', '100', '1000', '11', '12', '15', '20', '27', '30', '35', '36', '40', 'able', 'access', 'account', 'action', 'active', 'actually', 'add', 'avoid', 'awesome']

In [27]:
my_stop_words = text.ENGLISH_STOP_WORDS.union(add_stopwords)

In [28]:
# Create CountVectorizer, which create bag-of-words model.
# stop_words : Specify language to remove stopwords. 
# min_df: ignore terms that have a document frequency strictly 
# lower than the given threshold. This value is also called cut-off in the literature. 
# If float, the parameter represents a proportion of documents, integer absolute counts. 
# ngram_range: the lower and upper boundary of the range of n-values for 
# different word n-grams or char n-grams to be extracted. 

vectorizer_bow = CountVectorizer(stop_words=my_stop_words, 
                             min_df=10, 
                             ngram_range=(1,2), 
                             binary=True)

# Learn vocabulary in sentences. 
vectorizer_bow.fit(train.clean)

# Get dictionary. 
vectorizer_bow.get_feature_names()

['01',
 '02',
 '05',
 '101',
 '123',
 '13',
 '14',
 '16',
 '17',
 '18',
 '19',
 '200',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '21',
 '22',
 '23',
 '24',
 '25',
 '256',
 '26',
 '28',
 '2d',
 '2x',
 '300',
 '3000',
 '31',
 '32',
 '33',
 '34',
 '37',
 '37 38',
 '38',
 '3x',
 '400',
 '42',
 '43',
 '50',
 '500',
 '55',
 '56',
 '60',
 '64',
 '80',
 '90',
 'ab',
 'ability',
 'absolute',
 'abstract',
 'abstraction',
 'accept',
 'accepts',
 'accessed',
 'accessible',
 'accessing',
 'according',
 'accuracy',
 'achieve',
 'acknowledgement',
 'act',
 'activate',
 'actively',
 'activity',
 'actual',
 'ad',
 'adapter',
 'added',
 'adding',
 'adding new',
 'addition',
 'additional',
 'additionally',
 'addons',
 'address',
 'adjust',
 'admin',
 'administrator',
 'advanced',
 'advantage',
 'affect',
 'age',
 'agree',
 'ai',
 'aim',
 'ajax',
 'aka',
 'alexander',
 'algorithm',
 'alias',
 'allow',
 'allowed',
 'allowing',
 'allows',
 'alternative',
 'alternatively',
 'am

In [29]:
# Transform each sentences in vector space

X_train_bow = vectorizer_bow.transform(train.clean)
X_validate_bow = vectorizer_bow.transform(validate.clean)
X_test_bow = vectorizer_bow.transform(test.clean)

In [30]:
# Transform each sentences in vector space.
bow = vectorizer_bow.transform(train.clean)

# this is just to see the array of 0's and 1's
X_train_bow_array = X_train_bow.toarray()
X_validate_bow_array = X_validate_bow.toarray()
X_test_bow_array = X_test_bow.toarray()
X_train_bow_array[0]

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
# Creating the Y datasets:

y_train = train['language']
y_validate = validate['language']
y_test = test['language']

In [32]:
# Create the LogisticRegressioin object and fit on train
lm = LogisticRegression().fit(X_train_bow, y_train)

# Predict the labels for the train
y_pred = lm.predict(X_train_bow)

In [33]:
# Compute the confusion matrix
confusion_matrix(train.language, y_pred)

array([[180,   0],
       [  0, 168]])

In [34]:
# Compute the performance metrics
print(classification_report(train.language, y_pred))

              precision    recall  f1-score   support

  JavaScript       1.00      1.00      1.00       180
      Python       1.00      1.00      1.00       168

    accuracy                           1.00       348
   macro avg       1.00      1.00      1.00       348
weighted avg       1.00      1.00      1.00       348



### BoW Validate

In [None]:
# Validate Bag of Words

y_pred_bow = lm.predict(X_validate_bow)

# Compute the performance metrics for bow features. 
print(classification_report(validate.language, y_pred_bow))

In [None]:
# Testing BoWs

y_pred_bow = lm.predict(X_test_bow)

# Compute the performance metrics for bow features. 
print(classification_report(test.language, y_pred_bow))

### Logistic Regression using TF-IDF to identify features

In [None]:
# Transform the clean text into sparse matrix

tfidf = TfidfVectorizer(stop_words= my_stop_words, min_df=15, max_df = .10,
                        ngram_range=(1,2), 
                        binary=True)

# Fit on cleaned text in train
tfidf = tfidf.fit(train.clean)

# Get vocabularies.
tfidf.vocabulary_

In [None]:
# Creating and transforming the X features models.

X_train_tfidf = tfidf.transform(train.clean)
X_validate_tfidf = tfidf.transform(validate.clean)
X_test_tfidf = tfidf.transform(test.clean)
X_train_tfidf

##### TF-IDF Matrix

The following dataframe shows all the words being used as features in the model, and their weight within that modeling. In other words, the model will look at the following words which have been vectorized, and using them as a way to weight their tendancy towards either being more likely to show up in a Python related README, or a JavaScript related README.

In [None]:
pd.DataFrame(X_train_tfidf.todense(), columns=tfidf.get_feature_names()).head()

In [None]:
X_train_tfidfarray = X_train_tfidf.toarray()
X_validate_tfidfarray = X_validate_tfidf.toarray()
X_test_tfidfarray = X_test_tfidf.toarray()

In [None]:
y_train = train['language']
y_validate = validate['language']
y_test = test['language']

In [None]:
# Fitting the model:
lm_tfidf = LogisticRegression().fit(X_train_tfidf, y_train)

# Using the model to predict the languages:
y_pred = lm_tfidf.predict(X_train_tfidf)

In [None]:
# The confusion matrix shows which results were True positive, True Negative, False Positive, and False Negative predictions
confusion_matrix(train.language, y_pred)

In [None]:
# The classification report shows the 
print(classification_report(train.language, y_pred))

### Validate

In [None]:
y_pred_tfidf = lm_tfidf.predict(X_validate_tfidf)

In [None]:
# Compute the performance metrics for tfidf features.
print(classification_report(validate.language, y_pred_tfidf))

### Test

In [None]:
y_pred_tfidf = lm_tfidf.predict(X_test_tfidf)

In [None]:
# Compute the performance metrics for tfidf features.
print(classification_report(test.language, y_pred_tfidf))

## Decision Tree using TF-IDF Features

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
tfidf_tree = TfidfVectorizer(stop_words=my_stop_words, min_df=5, 
                                     ngram_range=(1,2), 
                                     binary=True)

tfidf_sparse_matrix = tfidf_tree.fit_transform(train.clean)
X_tfidf = tfidf_sparse_matrix
tree = DecisionTreeClassifier(max_depth=3, random_state=123)
tree_fit = tree.fit(X_tfidf, train.language)
train['predicted_tfidf_tree'] = tree_fit.predict(X_tfidf)

In [None]:
train.head()

In [None]:

# print('Decision Tree TF-IDF Accuracy:',accuracy_score(train.language, train.predicted_tfidf).round(2)*100,'%','\n')
print(classification_report(train.language, train.predicted_tfidf_tree))
pd.crosstab(train.language, train.predicted_tfidf_tree)

#### Takeaway: 

- Decision Tree model using TD-IDF did not yield a higher accuracy result than the Logistic Regression model using Bag of Words or TD_IDF, thus we did not validate or test on this model. In future we would like to experiment with further adjustments to the hyperparameters to see if this model would yield better results.

## Validate

## Test

# Conclusions


### Model Evaluation

- 