# BERT on Repo Description

1. Construct a sentence corpus for each software type using labeled/manually validated repo descriptions
2. Calculate embeddingfor each corpus
3. Compare all repo description with each software type corpus using cosine-similarity score
    - It took about 1.5 hrs to run the embedding on repo data

Author: Cierra and Crystal

In [None]:
%reset

In [27]:
#pgadmin
import os
import psycopg2 as pg


#bert
from sentence_transformers import SentenceTransformer, util
import torch

import pandas as pd

import re

import nltk
nltk.download("punkt")

from nltk import tokenize

import scipy

import datetime

[nltk_data] Downloading package punkt to /home/dab3dj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Embedding Model

In [28]:
#embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2') #quicker model
embedder = SentenceTransformer('paraphrase-mpnet-base-v2') #most accurate, long run time

# Data

## I. Unlabelled Repo Data

In [29]:
repo_data = pd.read_csv("/home/zz3hs/git/dspg21oss/data/dspg21oss/clean_github_repos_157k.csv")

In [30]:
# get a list of repo descriptions
repo_description = repo_data["description"].tolist()

## II. Labelled Repo Data -- Software Type Corpus

In [31]:
python_data = pd.read_excel('~/git/dspg21oss/data/dspg21oss/labelled_repo/oss_software_labelled_python_sz.xlsx') #import csv
c_data = pd.read_excel('~/git/dspg21oss/data/dspg21oss/labelled_repo/oss_software_labelled_c_sz.xlsx')
java_data = pd.read_excel('~/git/dspg21oss/data/dspg21oss/labelled_repo/oss_software_labelled_java_sz.xlsx')

In [36]:
java_data.tail(10)

Unnamed: 0,slug,description,language,topics,commits,forks,stars,watchers,java_label,prog_java
336,pengrad/java-telegram-bot-api,telegram bot api for java,Java,telegram telegram-bot telegram-api bot telegra...,388.0,205.0,901.0,56.0,1.0,1.0
337,oldratlee/useful-scripts,üêå useful scripts for making developer's eve...,Shell,shell script option-parser java useful-scripts...,385.0,2354.0,5282.0,394.0,,1.0
338,qunarcorp/bistoury,bistouryÊòØÂéªÂì™ÂÑøÁΩëÁöÑjavaÂ∫îÁî®Áîü‰∫ßÈóÆÈ...,Java,java debug monitor agent jvm trouble-shooting ...,385.0,616.0,2978.0,137.0,,1.0
339,pauljamescleary/scala-pet-store,an implementation of the java pet store using ...,Scala,scala tagless-final reference circe http4s res...,376.0,187.0,850.0,39.0,,1.0
340,egzosn/pay-java-parent,Á¨¨‰∏âÊñπÊîØ‰ªòÂØπÊé•ÂÖ®ËÉΩÊîØ‰ªòjavaÂºÄÂèëÂ∑•...,Java,pay alipay wxpay youdian fuiou paypal payoneer,373.0,763.0,1935.0,127.0,,3.0
341,vipshop/vjtools,"the vip.com's java coding standard, libraries ...",Java,java java-library,372.0,1429.0,7096.0,504.0,,1.0
342,srs/gradle-node-plugin,gradle plugin for integrating nodejs in your b...,Groovy,,372.0,206.0,857.0,34.0,,1.0
343,pablojim/highcharts-ng,angularjs directive for highcharts,JavaScript,highcharts highcharts-ng angularjs charts java...,364.0,480.0,1742.0,94.0,0.0,1.0
344,thinkgem/jeesite,jeesite ÊòØ‰∏Ä‰∏™‰ºÅ‰∏ö‰ø°ÊÅØÂåñÂºÄÂèëÂü∫Á°ÄÂπ...,JavaScript,,357.0,5856.0,7670.0,1170.0,,2.0
345,,,,,,,,,50.0,


In [26]:
# software type
type_name =  "python_label"
# filter 500 validated repos that are labelled 1 (numeric)
corpus_type_i = python_data[python_data[type_name] ==1][["slug",type_name]]

# perform a left merge to get cleaned repo description
corpus_type_i = corpus_type_i.merge(repo_data, on='slug', how='left')
# get just the description
corpus_type_i = corpus_type_i["description"].tolist()

# Embedding 

In [15]:
# embedding for the corpus
corpus_type_i_embeddings = embedder.encode(corpus_type_i, show_progress_bar=True) # embeddings


Batches:   0%|          | 0/16 [00:00<?, ?it/s]

In [96]:

queries = repo_description

# pre-specified number of sentences
num_sentences = 10 #find 10 most similar sentences from the corpus

# init a result list for scores
result = []


t1 = datetime.datetime.now()
print("Start:", t1)

for query in queries: #compare each sentence in the abstract to the software type corpus
    #Compute embeddings
    query_embedding = embedder.encode(query, show_progress_bar=False, convert_to_tensor=True) 

    # We use cosine-similarity and torch.topk to find the highest k scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_type_i_embeddings)[0]

    top_results = torch.topk(cos_scores, k=num_sentences)   #get the top k scores
    result.append(top_results.values.tolist()) #unlist the top result list
   
    #print 10 most similar entences from the corpus and their corresponding scores
    #print("\n\n======================\n\n")
    #print("Query:", query)
    #print("Results:", top_results)
    #print("\nTop k=10 most similar sentences in corpus:")
    #for score, idx in zip(top_results[0], top_results[1]):
    #    print(corpus_type_i_clean[idx], "(Score: {:.4f})".format(score))

t2 =  datetime.datetime.now()
print("Finished", len(result), "descriptions at", t2)
print("It took", t2-t1, "to run.")

Start: 2021-07-17 15:15:06.160619
Finished 157538 descriptions at 2021-07-17 16:43:47.442722
It took 1:28:41.282103 to run.


In [97]:
#TODO: THIS WAY, similarity_score IS SAVED AS A STRING, NEED TO FIGURE OUT HOW TO SAVE AS A LIST
#save the similarity score as a variable of the original repo data
repo_data["similarity_score"] = result

In [98]:
#save csv
#repo_data.to_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/repo_data_python_score.csv', index = False)   


# Similarity Score Analysis

In [18]:
from scipy import stats
from scipy.stats import skew
import statistics #calculate mean and others

In [16]:
#read in data
repo_data = pd.read_csv(r'/home/zz3hs/git/dspg21oss/data/dspg21oss/repo_data_python_score.csv')   


In [17]:
repo_data

Unnamed: 0,slug,description,readme,language,topics,commits,forks,stars,watchers,similarity_score
0,vuejs/vue,"🖖 Vue.js is a progressive, incrementally-adopt...","<p align=""center""><a href=""https://vuejs.org"" ...",JavaScript,"['vue', 'javascript', 'frontend', 'framework']",3070.0,29611.0,185611.0,6250.0,"[0.42707720398902893, 0.41736093163490295, 0.3..."
1,facebook/react,"A declarative, efficient, and flexible JavaScr...",# [React](https://reactjs.org/) · [![GitHub li...,JavaScript,"['javascript', 'react', 'frontend', 'declarati...",12695.0,34352.0,171327.0,6718.0,"[0.5678911805152893, 0.5504274368286133, 0.549..."
2,tensorflow/tensorflow,An Open Source Machine Learning Framework for ...,"<div align=""center"">\n<img src=""https://www.te...",C++,"['tensorflow', 'machine-learning', 'python', '...",75671.0,84937.0,156754.0,8092.0,"[0.6914295554161072, 0.6374238133430481, 0.602..."
3,twbs/bootstrap,"The most popular HTML, CSS, and JavaScript fra...","<p align=""center"">\n<a href=""https://getbootst...",JavaScript,"['css', 'bootstrap', 'javascript', 'html', 'sc...",19228.0,73981.0,151778.0,7079.0,"[0.4793936014175415, 0.46719563007354736, 0.46..."
4,ohmyzsh/ohmyzsh,🙃 A delightful community-driven (with 1700+ c...,"<p align=""center""><img alt=""Oh My Zsh"" src=""ht...",Shell,"['shell', 'zsh-configuration', 'theme', 'termi...",5447.0,22232.0,129314.0,2678.0,"[0.4871608316898346, 0.44735753536224365, 0.44..."
...,...,...,...,...,...,...,...,...,...,...
157533,VeryLittleGravitas/CDTADPQ,Very Little Gravitas implementation of Prototy...,"# CA Alerts, made with Very Little Gravitas fo...",CSS,"['prototype', 'messaging', 'emergency', 'govte...",414.0,1.0,0.0,2.0,"[0.34419283270835876, 0.3314574360847473, 0.32..."
157534,dajinchu/kde-connect-android,For Google Code-In,,Java,[],414.0,0.0,0.0,1.0,"[0.525587260723114, 0.47061866521835327, 0.459..."
157535,LibrinnoTeam/LibraryHelpBot,Library Management System. ITP2 project,# Library Help Bot\r\n\r\n## Purpose of the ap...,Python,"['telegram-bot', 'mariadb', 'python', 'flask']",415.0,2.0,0.0,2.0,"[0.5185543894767761, 0.517078697681427, 0.4827..."
157536,Twissi/Animator,Animator for hacklace. See hacklace.org for fu...,Animator\n========\n\nAnimator for hacklace. S...,Java,[],415.0,0.0,0.0,0.0,"[0.41843315958976746, 0.4162086844444275, 0.40..."


In [29]:
#score is in a string, convert to a list, also make sure the numbers are float
score_ls = repo_data["similarity_score"]

score_ls_float = []
for sentence_score in score_ls:
    sentence_score = str(sentence_score)[1:-1]
    sentence_score = sentence_score.split(",")
    item_float= []
    for item in sentence_score:
        item_float.append(float(item))
    score_ls_float.append(item_float)

    
repo_data["similarity_score_float"] = score_ls_float

In [30]:
#check scores are in a list
repo_data["similarity_score_float"][0][0]

0.42707720398902893

In [31]:
# get score statistics
score_ls = repo_data["similarity_score_float"]

mean_score= []
range_score = []
max_score = []
median_score = []
skewness_score = []
for sentence_score in score_ls:
    mean_score.append(statistics.mean(sentence_score))
    range_score.append(max(sentence_score)- min(sentence_score))
    max_score.append(max(sentence_score))
    median_score.append(statistics.median(sentence_score))
    skewness_score.append(stats.skew(sentence_score))
repo_data["mean_score"]=mean_score
repo_data["range_score"]=range_score
repo_data["max_score"]=max_score
repo_data["median_score"]=median_score
repo_data["skewness_score"]=skewness_score

In [32]:
repo_data

Unnamed: 0,slug,description,readme,language,topics,commits,forks,stars,watchers,similarity_score,similarity_score_float,mean_score,range_score,max_score,median_score,skewness_score
0,vuejs/vue,"🖖 Vue.js is a progressive, incrementally-adopt...","<p align=""center""><a href=""https://vuejs.org"" ...",JavaScript,"['vue', 'javascript', 'frontend', 'framework']",3070.0,29611.0,185611.0,6250.0,"[0.42707720398902893, 0.41736093163490295, 0.3...","[0.42707720398902893, 0.41736093163490295, 0.3...",0.386763,0.065932,0.427077,0.381985,0.976499
1,facebook/react,"A declarative, efficient, and flexible JavaScr...",# [React](https://reactjs.org/) · [![GitHub li...,JavaScript,"['javascript', 'react', 'frontend', 'declarati...",12695.0,34352.0,171327.0,6718.0,"[0.5678911805152893, 0.5504274368286133, 0.549...","[0.5678911805152893, 0.5504274368286133, 0.549...",0.516635,0.074020,0.567891,0.503811,0.849749
2,tensorflow/tensorflow,An Open Source Machine Learning Framework for ...,"<div align=""center"">\n<img src=""https://www.te...",C++,"['tensorflow', 'machine-learning', 'python', '...",75671.0,84937.0,156754.0,8092.0,"[0.6914295554161072, 0.6374238133430481, 0.602...","[0.6914295554161072, 0.6374238133430481, 0.602...",0.603414,0.110041,0.691430,0.587026,1.810697
3,twbs/bootstrap,"The most popular HTML, CSS, and JavaScript fra...","<p align=""center"">\n<a href=""https://getbootst...",JavaScript,"['css', 'bootstrap', 'javascript', 'html', 'sc...",19228.0,73981.0,151778.0,7079.0,"[0.4793936014175415, 0.46719563007354736, 0.46...","[0.4793936014175415, 0.46719563007354736, 0.46...",0.424735,0.096729,0.479394,0.413202,0.336363
4,ohmyzsh/ohmyzsh,🙃 A delightful community-driven (with 1700+ c...,"<p align=""center""><img alt=""Oh My Zsh"" src=""ht...",Shell,"['shell', 'zsh-configuration', 'theme', 'termi...",5447.0,22232.0,129314.0,2678.0,"[0.4871608316898346, 0.44735753536224365, 0.44...","[0.4871608316898346, 0.44735753536224365, 0.44...",0.432683,0.075190,0.487161,0.425684,1.349496
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157533,VeryLittleGravitas/CDTADPQ,Very Little Gravitas implementation of Prototy...,"# CA Alerts, made with Very Little Gravitas fo...",CSS,"['prototype', 'messaging', 'emergency', 'govte...",414.0,1.0,0.0,2.0,"[0.34419283270835876, 0.3314574360847473, 0.32...","[0.34419283270835876, 0.3314574360847473, 0.32...",0.317462,0.045936,0.344193,0.318741,0.451481
157534,dajinchu/kde-connect-android,For Google Code-In,,Java,[],414.0,0.0,0.0,1.0,"[0.525587260723114, 0.47061866521835327, 0.459...","[0.525587260723114, 0.47061866521835327, 0.459...",0.366771,0.236830,0.525587,0.312547,0.700486
157535,LibrinnoTeam/LibraryHelpBot,Library Management System. ITP2 project,# Library Help Bot\r\n\r\n## Purpose of the ap...,Python,"['telegram-bot', 'mariadb', 'python', 'flask']",415.0,2.0,0.0,2.0,"[0.5185543894767761, 0.517078697681427, 0.4827...","[0.5185543894767761, 0.517078697681427, 0.4827...",0.471379,0.077404,0.518554,0.467471,0.701968
157536,Twissi/Animator,Animator for hacklace. See hacklace.org for fu...,Animator\n========\n\nAnimator for hacklace. S...,Java,[],415.0,0.0,0.0,0.0,"[0.41843315958976746, 0.4162086844444275, 0.40...","[0.41843315958976746, 0.4162086844444275, 0.40...",0.400396,0.032988,0.418433,0.397456,0.503637
