In [1]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from sklearn.linear_model import LogisticRegression
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report



In [2]:
import acquire
import prepare
import wrangle
import github_acquire

credentials loaded successfully
Wrangle functions loaded successfully.
git acquire module loaded successsfully


In [5]:
df = pd.read_json('repos.json')
df.head()

Unnamed: 0,language,repo,content
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...


In [6]:
len(df.language[df_raw.language.isnull()])

0

In [None]:
# what's the proportion of each language in our data
labels = pd.concat([train.language.value_counts(), # get total counts of each language
                    train.language.value_counts(normalize=True)], axis=1) # getting the prop of each language
labels.columns = ['n', 'percent']
labels

In [7]:
df = pd.read_json('repos_clean.json')
df.head()

Unnamed: 0,language,repo,content,stemmed,lemmatized,clean
0,JavaScript,facebook/react,React · \nReact is a JavaScript library for...,react react is a javascript librari for build ...,react react is a javascript library for buildi...,react react javascript library building user i...
1,JavaScript,d3/d3,D3: Data-Driven Documents\n\nD3 (or D3.js) is ...,d3 datadriven document d3 or d3j is a javascri...,d3 datadriven document d3 or d3js is a javascr...,d3 datadriven document d3 d3js javascript libr...
2,JavaScript,vuejs/vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,support vuej vuej is an mitlicens open sourc p...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...
3,JavaScript,axios/axios,axios\n\n\n\n\n\n\n\n\nPromise based HTTP clie...,axio promis base http client for the browser a...,axios promise based http client for the browse...,axios promise based http client browser nodejs...
4,JavaScript,facebook/create-react-app,Create React App \n\nCreate React apps with n...,creat react app creat react app with no build ...,create react app create react apps with no bui...,create react app create react apps build confi...


In [8]:
df.sample(25)

Unnamed: 0,language,repo,content,stemmed,lemmatized,clean
204,Python,pypa/pipenv,Pipenv: Python Development Workflow for Humans...,pipenv python develop workflow for human depen...,pipenv python development workflow for human d...,pipenv python development workflow human depen...
49,JavaScript,babel/babel,\n\n\n\n\n\n The compiler for writing next ge...,the compil for write next gener javascript sup...,the compiler for writing next generation javas...,compiler writing next generation javascript su...
253,Python,openai/gpt-2,"Status: Archive (code is provided as-is, no up...",statu archiv code is provid asi no updat expec...,status archive code is provided asis no update...,status archive code provided asis update expec...
510,Python,darknessomi/musicbox,NetEase-MusicBox\n感谢为 MusicBox 的开发付出过努力的每一个人！\...,neteasemusicbox musicbox python demo 320kbp 22...,neteasemusicbox musicbox python demo 320kbps 2...,neteasemusicbox musicbox python demo 320kbps 2...
258,Python,spotify/luigi,"\n\n\n\n\n\n\n\nLuigi is a Python (3.6, 3.7 te...",luigi is a python 36 37 test packag that help ...,luigi is a python 36 37 tested package that he...,luigi python 36 37 tested package help build c...
67,JavaScript,alvarotrigo/fullPage.js,fullPage.js\n\n\n\nEnglish |\n Español |\n F...,fullpagej english espaol franai p avail for vu...,fullpagejs english espaol franais p available ...,fullpagejs english espaol franais p &#9; avail...
151,Python,pallets/flask,Flask\nFlask is a lightweight WSGI web applica...,flask flask is a lightweight wsgi web applic f...,flask flask is a lightweight wsgi web applicat...,flask flask lightweight wsgi web application f...
89,JavaScript,NervJS/taro,Taro\n\n\n\n\n\n\n\n👽 Taro['tɑ:roʊ]，泰罗·奥特曼，宇宙警...,taro tarotro taro reactvuenerv qq h5 webreact ...,taro tarotro taro reactvuenerv qq h5 webreact ...,taro tarotro taro reactvuenerv qq h5 webreact ...
346,JavaScript,necolas/react-native-web,React Native for Web\n \nCompatibility: React...,react nativ for web compat react nativ 063 rea...,react native for web compatibility react nativ...,react native web compatibility react native 06...
175,Python,apache/incubator-superset,"\nSuperset\n\n\n\n\n\n\n\n\n\n\nA modern, ente...",superset a modern enterprisereadi busi intelli...,superset a modern enterpriseready business int...,superset modern enterpriseready business intel...


In [None]:
# # add a column that is a list of each word for each repo --> PREPARE.PY
# words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.clean] 

# # column name will be words, and the column will contain lists of the words in each doc
# df = pd.concat([df, pd.DataFrame({'words': words})], axis=1)
# df.head()

### Splitting the data

In [9]:
# split the data using train_test_split, doing it twice so that we have 
from sklearn.model_selection import train_test_split

train_validate, test = train_test_split(df[['language', 'lemmatized', 'clean']], 
                                        stratify=df.language, 
                                        test_size=.2, random_state = 123)

train, validate = train_test_split(train_validate, 
                                   stratify=train_validate.language, 
                                   test_size=.25, random_state = 123)

In [10]:
train.shape, validate.shape, test.shape

((348, 3), (116, 3), (117, 3))

In [11]:
train_labels = pd.concat([train.language.value_counts(), # get total counts of ham vs spam
                    train.language.value_counts(normalize=True)], axis=1) # getting the prop of ham vs. spam

train_labels.columns = ['n', 'percent']
train_labels

Unnamed: 0,n,percent
JavaScript,180,0.517241
Python,168,0.482759


#### Bag of Words Setup

In [12]:
# Create CountVectorizer, which create bag-of-words model.
# stop_words : Specify language to remove stopwords. 
# min_df: ignore terms that have a document frequency strictly 
# lower than the given threshold. This value is also called cut-off in the literature. 
# If float, the parameter represents a proportion of documents, integer absolute counts. 
# ngram_range: the lower and upper boundary of the range of n-values for 
# different word n-grams or char n-grams to be extracted. 

vectorizer = CountVectorizer(stop_words='english', 
                             min_df=20, 
                             ngram_range=(1,2), 
                             binary=True)

# Learn vocabulary in sentences. 
vectorizer.fit(train.clean)

# Get dictionary. 
vectorizer.get_feature_names()

['10',
 '100',
 '1000',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '200',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '22',
 '23',
 '26',
 '27',
 '2x',
 '30',
 '31',
 '32',
 '34',
 '35',
 '36',
 '37',
 '40',
 '50',
 '500',
 'ability',
 'able',
 'accept',
 'accepts',
 'access',
 'accessible',
 'according',
 'account',
 'action',
 'active',
 'activity',
 'actual',
 'actually',
 'add',
 'added',
 'adding',
 'addition',
 'additional',
 'address',
 'advanced',
 'ai',
 'aim',
 'algorithm',
 'alias',
 'allow',
 'allowing',
 'allows',
 'alternative',
 'alternatively',
 'amazon',
 'analysis',
 'andor',
 'android',
 'angular',
 'animation',
 'answer',
 'apache',
 'api',
 'api reference',
 'apis',
 'app',
 'application',
 'applied',
 'apply',
 'approach',
 'appropriate',
 'apps',
 'arbitrary',
 'architecture',
 'archive',
 'area',
 'arent',
 'argument',
 'array',
 'article',
 'ask',
 'aspect',
 'asset',
 'associated',
 'async',
 'asynchronous',
 'attempt',
 

In [13]:
# Transform each sentences in vector space.
bow = vectorizer.transform(train.clean)

# this is just to see the array of 0's and 1's
bow_array = bow.toarray()
bow_array[0]

array([0, 0, 0, ..., 1, 0, 0])

#### TD-IDF Setup

In [14]:
tfidf = TfidfVectorizer(stop_words='english', min_df=20, 
                             ngram_range=(1,2), 
                             binary=True)

tfidf_sparse_matrix = tfidf.fit_transform(train.clean)
tfidf_sparse_matrix

<348x1192 sparse matrix of type '<class 'numpy.float64'>'
	with 57815 stored elements in Compressed Sparse Row format>

In [15]:
pd.DataFrame(tfidf_sparse_matrix.todense(), columns=tfidf.get_feature_names()).head()

Unnamed: 0,10,100,1000,11,12,13,14,15,16,17,...,youd,youd like,youll,youll need,youre,youre using,youtube,youve,zero,zip
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12303,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.098509,0.0,0.0
2,0.078485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.080619,0.0,0.0,0.0,0.0,0.0
3,0.0,0.056976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071346
4,0.0,0.0,0.0,0.110354,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.097865,0.0,0.0,0.0,0.134363,0.0,0.0,0.0


In [16]:
# Get vocabularies.
tfidf.vocabulary_

{'welcome': 1160,
 'way': 1155,
 'build': 144,
 'share': 972,
 'data': 275,
 'apps': 81,
 'let': 614,
 'turn': 1103,
 'script': 944,
 'web': 1156,
 'minute': 700,
 'python': 861,
 'opensource': 751,
 'free': 460,
 'youve': 1189,
 'created': 264,
 'app': 75,
 'use': 1122,
 'sharing': 974,
 'platform': 802,
 'deploy': 298,
 'manage': 672,
 'world': 1172,
 'installation': 559,
 'pip': 796,
 'install': 556,
 'hello': 507,
 'installed': 561,
 'virtual': 1144,
 'environment': 377,
 'window': 1165,
 'mac': 657,
 'linux': 633,
 'little': 636,
 'example': 388,
 'make': 669,
 'easy': 353,
 'interactive': 571,
 'import': 538,
 'value': 1134,
 'simple': 981,
 'focused': 443,
 'api': 72,
 'rich': 927,
 'powerful': 815,
 'tool': 1085,
 'demo': 294,
 'project': 841,
 'entire': 373,
 'dataset': 277,
 'run': 933,
 'realtime': 884,
 'using': 1128,
 'object': 740,
 'detection': 308,
 'net': 720,
 'complete': 210,
 'implemented': 537,
 'le': 607,
 'line': 628,
 'fact': 416,
 'contains': 237,
 '23': 21,
 '

#### Creating the X_train and y_train variables for modeling:

In [17]:
# Assigning the target:
y = train['language']

# Assigning the Features:
X_bow = bow

X_tfidf = tfidf_sparse_matrix

In [18]:
# Predicting based on BoW:

from sklearn.linear_model import LogisticRegression

lm = LogisticRegression().fit(X_bow, y)

train['predicted'] = lm.predict(X_bow)
train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,language,lemmatized,clean,predicted
277,Python,welcome to streamlit the fastest way to build ...,welcome streamlit fastest way build share data...,Python
97,JavaScript,translation espaol deutsch portugus trke add f...,translation espaol deutsch portugus trke add f...,JavaScript
519,Python,plotlypy latest release user forum pypi downlo...,plotlypy latest release user forum pypi downlo...,Python
203,Python,datascienceipythonnotebooks index deeplearning...,datascienceipythonnotebooks index deeplearning...,Python
582,Python,mlcourseai open machine learning course mlcour...,mlcourseai open machine learning course mlcour...,Python


In [19]:
confusion_matrix(train.language, train.predicted)
pd.crosstab(train.language, train.predicted)

predicted,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1
JavaScript,180,0
Python,0,168


In [20]:
print(classification_report(train.language, train.predicted))

              precision    recall  f1-score   support

  JavaScript       1.00      1.00      1.00       180
      Python       1.00      1.00      1.00       168

    accuracy                           1.00       348
   macro avg       1.00      1.00      1.00       348
weighted avg       1.00      1.00      1.00       348



In [21]:
lm_tfidf = LogisticRegression().fit(X_tfidf, y)
train['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
print(classification_report(train.language, train.pred_tfidf))

              precision    recall  f1-score   support

  JavaScript       0.99      0.99      0.99       180
      Python       0.99      0.99      0.99       168

    accuracy                           0.99       348
   macro avg       0.99      0.99      0.99       348
weighted avg       0.99      0.99      0.99       348

