# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import requests
from requests import get
from os import path
from bs4 import BeautifulSoup
import os
import re

import json
import unicodedata

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

from typing import Dict, List, Optional, Union, cast
from env import github_token, github_username
from wordcloud import WordCloud

import acquire as ac
import prepare

# Acquire

#### For this project, you will have to build a dataset yourself. Decide on a list of GitHub repositories to scrape, and write the Python code necessary to extract the text of the README file for each page, and the primary language of the repository.

#### Which repositories you use are up to you, but you should include at least 100 repositories in your data set.

In [4]:
# Search-results URL template (query "OpenCV", sorted by stars, descending).
# `{num}` stands for the value of the `p` query parameter and must be filled
# in with str.format before the URL is requested.
url = 'https://github.com/search?o=desc&p={num}&q=OpenCV&s=stars&type=Repositories'
# Request the first results page. Passing the raw template would send the
# literal text "{num}" as the page value instead of a number.
response = requests.get(url.format(num=1))

In [5]:
# Gather the repository list via the local `acquire` module (imported as
# `ac`); the output below shows the entries as 'owner/name' strings.
urls = ac.get_all_urls() 

In [6]:
urls

['opencv/opencv',
 'openframeworks/openFrameworks',
 'Ewenwan/MVision',
 'oarriaga/face_classification',
 'CMU-Perceptual-Computing-Lab/openpose',
 'PySimpleGUI/PySimpleGUI',
 'vipstone/faceai',
 'opencv/opencv_contrib',
 'Hironsan/BossSensor',
 'spmallick/learnopencv',
 'jrosebr1/imutils',
 'hamuchiwa/AutoRCCar',
 'peterbraden/node-opencv',
 'justadudewhohacks/opencv4nodejs',
 'bytedeco/javacv',
 'kelaberetiv/TagUI',
 'nuno-faria/tiler',
 'esimov/pigo',
 'bijection/sistine',
 'hybridgroup/gocv',
 'CodecWang/OpenCV-Python-Tutorial',
 'makelove/OpenCV-Python-Tutorial',
 'Roujack/mathAI',
 'amusi/AI-Job-Notes',
 'MasteringOpenCV/code',
 'HuTianQi/SmartOpenCV',
 'mapillary/OpenSfM',
 'anandpawara/Real_Time_Image_Animation',
 'shimat/opencvsharp',
 'soruly/trace.moe',
 'changwookjun/StudyBook',
 'opentrack/opentrack',
 'oreillymedia/Learning-OpenCV-3_examples',
 'skvark/opencv-python',
 'tebelorg/RPA-Python',
 'andrewssobral/bgslibrary',
 'kongqw/OpenCVForAndroid',
 'ivanseidel/Is-Now-Ille

In [11]:
# Load the records stored in data.json into a DataFrame.
df = pd.read_json('data.json')

In [12]:
# 150 repos in total; `language` is missing for 6 of them (144 non-null),
# while `repo` and `readme_contents` are fully populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   repo             150 non-null    object
 1   language         144 non-null    object
 2   readme_contents  150 non-null    object
dtypes: object(3)
memory usage: 3.6+ KB


In [13]:
df

Unnamed: 0,repo,language,readme_contents
0,Ewenwan/MVision,C++,# MVision　Machine Vision 机器视觉\n[AI算法工程师手册 数学基础...
1,opencv/opencv_contrib,C++,## Repository for OpenCV's extra modules\n\nTh...
2,PySimpleGUI/PySimpleGUI,Python,"\n<p align=""center"">\n <img src=""https://raw...."
3,opencv/opencv,C++,## OpenCV: Open Source Computer Vision Library...
4,CMU-Perceptual-Computing-Lab/openpose,C++,"<div align=""center"">\n <img src="".github/Lo..."
...,...,...,...
145,jerry1900/faceRecognition,Python,# faceRecognition\n利用OpenCV、CNN进行人脸识别\n万壑，4978...
146,lixiaoshaxing/MultiMediaLearn,C,# MultiMediaLearn\n多媒体学习：图片处理，音视频处理，相机使用，OpenG...
147,WL-Amigo/waifu2x-converter-cpp,C++,# waifu2x (converter only version)\n\nThis is ...
148,atulapra/Emotion-detection,Python,# Emotion detection using deep learning\n\n## ...


# Prepare

In [16]:
# Per-language frequency table: absolute count (`n`) next to relative share
# (`percent`, stored as a 0-1 proportion). value_counts() skips missing
# languages, so the shares are relative to the repos that have a language.
language_count = pd.concat(
    [
        df['language'].value_counts(),
        df['language'].value_counts(normalize=True),
    ],
    axis=1,
    keys=['n', 'percent'],
)
language_count

Unnamed: 0,n,percent
Python,61,0.423611
C++,47,0.326389
Java,7,0.048611
JavaScript,6,0.041667
Jupyter Notebook,5,0.034722
Go,3,0.020833
Shell,2,0.013889
C#,2,0.013889
Objective-C,2,0.013889
Perl,1,0.006944


In [19]:
# Text-preparation pipeline: each step reads the column produced by the
# previous one and stores its result in a new column, so every intermediate
# stage stays available for inspection.

# Step 1: basic cleaning of the raw README text.
df['text_cleaned'] = df['readme_contents'].apply(prepare.basic_clean)
# Step 2: tokenize the cleaned text.
df['text_tokenized'] = df['text_cleaned'].apply(prepare.tokenize)
# Step 3: lemmatize the tokenized text.
df['text_lemmatized'] = df['text_tokenized'].apply(prepare.lemmatize)
# Step 4: drop stopwords from the lemmatized text.
df['text_filtered'] = df['text_lemmatized'].apply(prepare.remove_stopwords)

# Preview the first rows with all derived columns.
df.head()

Unnamed: 0,repo,language,readme_contents,text_cleaned,text_tokenized,text_lemmatized,text_filtered
0,Ewenwan/MVision,C++,# MVision　Machine Vision 机器视觉\n[AI算法工程师手册 数学基础...,mvision machine vision \nai httpwwwhuaxia...,mvision machine vision \nai httpwwwhuaxiaozhua...,mvision machine vision ai httpwwwhuaxiaozhuanc...,mvision machine vision ai httpwwwhuaxiaozhuanc...
1,opencv/opencv_contrib,C++,## Repository for OpenCV's extra modules\n\nTh...,repository for opencvs extra modules\n\nthis ...,repository for opencvs extra modules\n\nthis r...,repository for opencvs extra module this repos...,repository opencvs extra module repository int...
2,PySimpleGUI/PySimpleGUI,Python,"\n<p align=""center"">\n <img src=""https://raw....",\np aligncenter\n img srchttpsrawgithubuserco...,p aligncenter\n img srchttpsrawgithubuserconte...,p aligncenter img srchttpsrawgithubusercontent...,p aligncenter img srchttpsrawgithubusercontent...
3,opencv/opencv,C++,## OpenCV: Open Source Computer Vision Library...,opencv open source computer vision library\n\...,opencv open source computer vision library\n\n...,opencv open source computer vision library res...,opencv open source computer vision library res...
4,CMU-Perceptual-Computing-Lab/openpose,C++,"<div align=""center"">\n <img src="".github/Lo...",div aligncenter\n img srcgithublogo_main_bl...,div aligncenter\n img srcgithublogo_main_black...,div aligncenter img srcgithublogo_main_blackpn...,div aligncenter img srcgithublogo_main_blackpn...


# Explore

#### Explore the data that you have scraped. Here are some ideas for exploration:

- What are the most common words in READMEs?   


- What does the distribution of IDFs look like for the most common words?  


- Does the length of the README vary by programming language?    


- Do different programming languages use a different number of unique words?

# Model

- Transform your documents into a form that can be used in a machine learning model. You should use the programming language of the repository as the label to predict.  


- Try fitting several different models and using several different representations of the text (e.g. a simple bag of words, then also the TF-IDF values for each).   


- Build a function that takes in the text of a README file and tries to predict the programming language.  