<a href="https://colab.research.google.com/github/CodePayCloud/model/blob/main/codepay_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CodePay Base Model


## Base Model Inputs

*   User Inputs
*   User Upload
*   Cloud Storage



In [None]:
import requests
import json
from datetime import datetime, timedelta
from google.colab import userdata

# GitHub settings
repo = 'npm/cli'
token = userdata.get('GH_TOKEN')  # Personal access token read from Colab's user secrets

# Cutoff date: only pull requests created within the last 365 days are kept
one_year_ago = datetime.now() - timedelta(days=365)
since_date = one_year_ago.strftime('%Y-%m-%d')

# The "list pull requests" endpoint does not document a `since` filter (that
# parameter belongs to the issues endpoint), so passing it does not restrict
# the results to the past year. Instead, request PRs newest-first and stop
# paging as soon as one older than the cutoff shows up.
url = (
    f'https://api.github.com/repos/{repo}/pulls'
    '?state=all&sort=created&direction=desc&per_page=100'
)

pull_requests_data = []
reached_cutoff = False

while not reached_cutoff:
    # The timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, headers={'Authorization': f'token {token}'}, timeout=30)
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        break

    pull_requests = response.json()
    for pr in pull_requests:
        # `created_at` is an ISO-8601 timestamp; its first 10 characters are the
        # YYYY-MM-DD date, which orders correctly as a plain string.
        if pr['created_at'][:10] < since_date:
            reached_cutoff = True
            break

        pr_data = {
            'number': pr['number'],
            'title': pr['title'],
            'body': pr['body'],
            'state': pr['state']
        }
        pull_requests_data.append(pr_data)

    # Follow the pagination link exposed by `response.links` until the last page
    if 'next' in response.links:
        url = response.links['next']['url']
    else:
        break

# Saving data to a JSON file
with open('npm_cli_pull_requests.json', 'w') as file:
    json.dump(pull_requests_data, file)

print("Data stored in npm_cli_pull_requests.json")

Data stored in npm_cli_pull_requests.json


## Base Model Stack

*   Python
*   TF/Keras



## **Step 1:** Installs

In [None]:
import os
import subprocess
import pkg_resources

# Function to check if a package is installed
def is_package_installed(package_name):
    """Report whether a distribution is installed in the current environment.

    Args:
        package_name: Distribution name as it would be passed to ``pip install``
            (for example ``'spacy'``).

    Returns:
        True if metadata for the distribution can be found, False otherwise.
    """
    # `pkg_resources` is deprecated by setuptools; the standard library's
    # `importlib.metadata` answers the same question without that dependency.
    # Imported locally so this function does not rely on another cell's imports.
    from importlib import metadata

    try:
        metadata.distribution(package_name)
    except metadata.PackageNotFoundError:
        return False
    return True

# Check if SpaCy is installed, and install if not
if not is_package_installed('spacy'):
    subprocess.run(['pip', 'install', 'spacy'])
    subprocess.run(['python', '-m', 'spacy', 'download', 'en_core_web_sm'])

!pip install -r requirements.txt

## **Step 2:** Import Libraries & Label PRs

> PRs labeled by BERT



In [None]:
from transformers import pipeline
import numpy as np

# Now you can create the classifier
# NOTE(review): `bert-base-uncased` has no trained classification head (the
# library warns that `classifier.weight`/`classifier.bias` are newly
# initialized), so the labels produced below are not meaningful until the model
# is fine-tuned or replaced with a checkpoint trained for this task.
classifier = pipeline("text-classification", model="bert-base-uncased")

# Load data from JSON file
with open('npm_cli_pull_requests.json', 'r') as file:
    pr_data = json.load(file)

# Initialize lists to store texts, predicted labels, and weights
texts = []
predicted_labels = []
weights = []

# Create prompts and classify for the first 10 rows
for pr in pr_data[:10]:
    body = pr['body']

    # Treat an empty description like a missing one, matching the length
    # computation below, and include the title the prompt text promises.
    if body:
        prompt = (
            "Given the following title and description, would you classify this PR as a feature or bug?\n"
            f"Title: {pr['title']}\nDescription: {body}"
        )
    else:
        prompt = f"Given the following title, would you classify this PR as a feature or bug?\nTitle: {pr['title']}"

    # Use the classifier to predict the label for the prompt. Truncation keeps
    # long PR descriptions within the model's maximum sequence length instead
    # of raising an error.
    prediction = classifier(prompt, truncation=True)

    # Extract the predicted label from the classifier's output
    predicted_label = prediction[0]['label']

    # Calculate the description length-based weight and its logarithm.
    # The weight is at least 0.5, so the logarithm is always defined.
    description_length = len(body) if body else 0
    weight = 0.5 + (description_length / 1000)  # Adjust the scaling factor as needed
    log_weight = np.log(weight)

    # Append the text, predicted label, and log_weight to the respective lists
    texts.append(prompt)
    predicted_labels.append(predicted_label)
    weights.append(log_weight)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Step 2a:** Inspect BERT Label Classifications

In [None]:
import pandas as pd
# Collect the prompts, raw model labels and log-weights into one table, then
# translate the model's generic label ids into readable class names.
df = pd.DataFrame(
    {
        'Text': texts,
        'Classification': predicted_labels,
        'PR_Importance_Weights': weights,
    }
).replace({'Classification': {'LABEL_0': 'bug', 'LABEL_1': 'feature'}})

# Persist the table, then display it as the cell's output
df.to_csv('classification_results.csv', index=False)
df

## **Step 2b:** K-Means on Texts with TensorFlow

In [None]:
import tensorflow_hub as hub
from sklearn.cluster import KMeans

# Pre-trained sentence encoder pulled from TensorFlow Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Embed every prompt and materialize the result as a NumPy array
embeddings = np.array(embed(texts))

# Fit a two-cluster K-Means model; the fixed seed makes the run repeatable
kmeans = KMeans(n_clusters=2, random_state=42).fit(embeddings)

# Attach each prompt's cluster id to the results table and rewrite the CSV
cluster_labels = kmeans.labels_
df['Cluster'] = cluster_labels
df.to_csv('classification_results.csv', index=False)
df



Unnamed: 0,Text,Classification,Weights,Cluster
0,"Given the following title and description, wou...",bug,-0.328504,1
1,"Given the following title and description, wou...",bug,-0.142716,1
2,"Given the following title and description, wou...",bug,-0.328504,1
3,"Given the following title and description, wou...",bug,-0.328504,1
4,"Given the following title and description, wou...",bug,-0.659712,0
5,"Given the following title and description, wou...",bug,-0.328504,1
6,"Given the following title and description, wou...",bug,-0.555126,0
7,"Given the following title, would you classify ...",bug,-0.693147,0
8,"Given the following title and description, wou...",bug,-0.671386,0
9,"Given the following title and description, wou...",bug,-0.642454,0


## **Step A for CNN:** Set Up TPU

In [None]:
# TensorFlow is referenced below but not imported by any earlier visible cell,
# so import it here to keep the notebook runnable from a fresh kernel.
import tensorflow as tf

# Colab exposes the TPU endpoint through this environment variable on TPU
# runtimes that provide it; it is absent on CPU/GPU runtimes.
tpu_address = os.environ.get('COLAB_TPU_ADDR')

if tpu_address:
    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + tpu_address)
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    # `tf.distribute.TPUStrategy` is the non-deprecated replacement for
    # `tf.distribute.experimental.TPUStrategy`.
    strategy = tf.distribute.TPUStrategy(resolver)
else:
    # Without a TPU, fall back to TensorFlow's default strategy so the cells
    # that use `strategy.scope()` still run.
    print("COLAB_TPU_ADDR is not set; falling back to the default distribution strategy.")
    strategy = tf.distribute.get_strategy()



## **Step B for CNN:** Build CNN with Keras

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense

def build_model():
    """Assemble the 1-D convolutional text classifier.

    The network embeds token ids (vocabulary of 10000, sequences of length 200)
    into 128-dimensional vectors, applies two convolution stages, and ends in a
    two-way softmax.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    layers = [
        Embedding(10000, 128, input_length=200),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(128, 5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(2, activation='softmax'),  # 2 for 'bug' and 'feature'
    ]
    return Sequential(layers)


## **Step C for CNN:** Compile CNN

In [None]:
# Build and compile inside the distribution strategy's scope
with strategy.scope():
    model = build_model()
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

## **Step D for CNN:** Train Model

In [None]:
# NOTE(review): X_train, y_train, X_test and y_test are not defined in any cell
# visible in this notebook — confirm where they come from before relying on
# Restart & Run All.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=128,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## **Step E for CNN:** Test Model

In [None]:
# Loss and accuracy on the held-out split (accuracy is the metric set at compile time)
evaluation = model.evaluate(X_test, y_test)
test_loss, test_accuracy = evaluation

# Softmax outputs per sample, reduced to the index of the highest-scoring class
predictions = model.predict(X_test)
predicted_labels = np.argmax(predictions, axis=1)

print(f"Test accuracy: {test_accuracy * 100:.2f}%, Test loss: {test_loss}")

Test accuracy: 92.91%, Test loss: 0.35124388337135315


## **Step F for CNN:** Analysis & Visualization

In [None]:
from sklearn.metrics import classification_report
# Reduce the ground truth to class indices (presumably y_test is one-hot
# encoded — confirm), then print per-class precision, recall and F1.
true_labels = y_test.argmax(axis=1)
report = classification_report(true_labels, predicted_labels)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.96       407
           1       0.44      0.13      0.21        30

    accuracy                           0.93       437
   macro avg       0.69      0.56      0.58       437
weighted avg       0.91      0.93      0.91       437

