1. Data Preprocessing

Load and Clean Data:

In [26]:
import pandas as pd
import re
import string
import emoji
from datetime import datetime
import networkx as nx

# Load data
data = pd.read_csv('issue_data_sna.csv')

# Parse dates and calculate issue lifetime in hours
data['Open Date'] = pd.to_datetime(data['Open Date'])
data['Closed Date'] = pd.to_datetime(data['Closed Date'])
data['Issue Lifetime (hours)'] = (data['Closed Date'] - data['Open Date']).dt.total_seconds() / 3600

# Bin issue lifetimes
bins = [0, 48, 336, float('inf')]  # <2 days, 2 days < x < 2 weeks, >2 weeks
labels = ['< 2 days', '2 days < x < 2 weeks', '> 2 weeks']
data['Issue Lifetime'] = pd.cut(data['Issue Lifetime (hours)'], bins=bins, labels=labels, right=False)

# Clean text function
def clean_text(text):
    if not isinstance(text, str):
        return ''
    
    text = text.replace('"', '')  # Remove double quotes
    text = re.sub(r'DevTools.*?\(automated\)', '', text)  # Remove specific text
    text = text.lower()  # Lowercase
    text = emoji.demojize(text)  # Remove emojis
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(f"[{re.escape(string.punctuation)}]", '', text)  # Remove punctuation
    text = text.replace("#", "").replace("\n", "").replace("\r", "")  # Remove unwanted characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    words = text.split()
    words = [word for word in words if len(word) <= 20]  # Remove long words
    return ' '.join(words)

# Clean the text columns
data['Issue Title'] = data['Issue Title'].apply(clean_text)
data['Issue Body'] = data['Issue Body'].apply(clean_text)
data['Comment Bodies'] = data['Comment Bodies'].apply(lambda x: clean_text(str(x)))
data['Comment Authors'] = data['Comment Authors'].apply(lambda x: str(x))

# Merge text columns for processing
data['Issue Text'] = data['Issue Title'] + ' ' + data['Issue Body'] + ' ' + data['Comment Bodies']

2. Feature Engineering
Compute Social Network Metrics:

In [27]:

# Function to get social network metrics
def get_social_network_metrics(comment_body_list, comment_author_list):
    if len(comment_author_list) <= 1:
        return 0, 0, 0, 0, 0, 0, 1 if len(comment_author_list) == 1 else 0

    G = nx.DiGraph()
    for i in range(len(comment_author_list) - 1):
        G.add_edge(comment_author_list[i + 1], comment_author_list[i])

    if G.number_of_edges() == 0:
        return 0, 0, 0, 0, 0, 0, len(set(comment_author_list))

    wordiness = sum(len(s) for s in comment_body_list) // len(comment_body_list) if comment_body_list else 0
    centrality = nx.degree_centrality(G)
    average_degree_centrality = sum(centrality.values()) / len(centrality) if centrality else 0
    closeness = nx.closeness_centrality(G)
    average_closeness = sum(closeness.values()) / len(closeness) if closeness else 0
    betweenness = nx.betweenness_centrality(G)
    average_betweenness = sum(betweenness.values()) / len(betweenness) if betweenness else 0
    density = nx.density(G)
    edges = nx.number_of_edges(G)
    num_discussants = len(set(comment_author_list))
    
    return wordiness, average_degree_centrality, average_closeness, average_betweenness, density, edges, num_discussants

# Apply social network metrics to each row
data[['wordiness', 'average_degree_centrality', 'average_closeness', 
      'average_betweenness', 'density', 'edges', 'num_discussants']] = data.apply(
    lambda row: get_social_network_metrics(row['Comment Bodies'].split('|SEPARATOR|'), row['Comment Authors'].split('|SEPARATOR|')), axis=1, result_type='expand'
)


3. Model Training  
Prepare Data and Fine-Tune GPT Model:

In [31]:
import openai
import json

# Read the OpenAI API key from a file
with open('openAiKey.txt', 'r') as file:
    api_key = file.read().strip()

# Initialize the OpenAI client
client = openai.OpenAI(api_key=api_key)

# Function to prepare data for GPT fine-tuning
def prepare_data_for_gpt(df):
    training_data = []
    for _, row in df.iterrows():
        prompt = (
            f"Predict the time to solve this GitHub issue based on its content:\n\n"
            f"Title: {row['Issue Title']}\n"
            f"Body: {row['Issue Body']}\n"
            f"Labels: {row['Labels']}\n"
            f"Number of Comments: {row['Number of Comments']}\n"
            f"Comment Bodies: {row['Comment Bodies']}\n"
            f"Social Metrics: wordiness={row['wordiness']}, average_degree_centrality={row['average_degree_centrality']}, "
            f"average_closeness={row['average_closeness']}, average_betweenness={row['average_betweenness']}, "
            f"density={row['density']}, edges={row['edges']}, num_discussants={row['num_discussants']}\n"
            f"\nResponse:"
        )
        completion = row['Issue Lifetime']
        training_data.append({'prompt': prompt, 'completion': f" {completion}"})
    return training_data

# Prepare the data
training_data = prepare_data_for_gpt(data)

# Save as JSONL file
with open('training_data.jsonl', 'w') as f:
    for entry in training_data:
        json.dump(entry, f)
        f.write('\n')

# Upload the training file
upload_response = client.files.create(
    file=open('training_data.jsonl', 'rb'),
    purpose='fine-tune'
)

training_file_id = upload_response.id
print(f"Training File ID: {training_file_id}")

# Create a fine-tune job
fine_tune_response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo"
)

# Print response
print(fine_tune_response.model_dump_json(indent=2))


Training File ID: file-hu9XpjMVo5onPj18LGGIQMUp
{
  "id": "ftjob-9O0pKBiwri1NFq79I5xAs0QQ",
  "created_at": 1717747828,
  "error": {
    "code": null,
    "message": null,
    "param": null
  },
  "fine_tuned_model": null,
  "finished_at": null,
  "hyperparameters": {
    "n_epochs": "auto",
    "batch_size": "auto",
    "learning_rate_multiplier": "auto"
  },
  "model": "gpt-3.5-turbo-0125",
  "object": "fine_tuning.job",
  "organization_id": "org-iXy8zWZJPm0q9LU8ZICnDE47",
  "result_files": [],
  "status": "validating_files",
  "trained_tokens": null,
  "training_file": "file-hu9XpjMVo5onPj18LGGIQMUp",
  "validation_file": null,
  "user_provided_suffix": null,
  "seed": 1324902560,
  "estimated_finish": null,
  "integrations": []
}


4. Monitoring the Fine-Tuning Process
To monitor the fine-tuning job, we can periodically check its status until it completes. Once the job is complete, we can retrieve details about the fine-tuned model.

Monitoring and Retrieving the Fine-Tuned Model

In [32]:
import json

def validate_jsonl_file(file_path):
    with open(file_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            try:
                record = json.loads(line)
                if 'prompt' not in record or 'completion' not in record:
                    print(f"Line {line_number}: Missing 'prompt' or 'completion' field.")
                if not isinstance(record['prompt'], str) or not isinstance(record['completion'], str):
                    print(f"Line {line_number}: 'prompt' and 'completion' should be strings.")
            except json.JSONDecodeError as e:
                print(f"Line {line_number}: JSONDecodeError - {e}")

# Validate the training data file
validate_jsonl_file('training_data.jsonl')


In [33]:
import time

# Function to check the status of the fine-tuning job
def check_fine_tune_status(job_id):
    status_response = client.fine_tuning.jobs.retrieve(job_id)
    return status_response

# Fine-tuning job ID
fine_tune_job_id = fine_tune_response.id
print(f"Fine-tuning Job ID: {fine_tune_job_id}")

# Monitor the fine-tuning job status
status = check_fine_tune_status(fine_tune_job_id)
print(f"Initial Status: {status.status}")

# Poll the status until the job is complete
while status.status not in ['succeeded', 'failed']:
    time.sleep(60)  # Wait for 60 seconds before checking the status again
    status = check_fine_tune_status(fine_tune_job_id)
    print(f"Current Status: {status.status}")

# Check the final status and retrieve model details if successful
if status.status == 'succeeded':
    fine_tuned_model = status.fine_tuned_model
    print(f"Fine-tuning succeeded. Fine-tuned Model: {fine_tuned_model}")
else:
    print("Fine-tuning failed. Please check the details and logs for more information.")


Fine-tuning Job ID: ftjob-9O0pKBiwri1NFq79I5xAs0QQ
Initial Status: failed
Fine-tuning failed. Please check the details and logs for more information.


In [39]:
# Function to retrieve fine-tuning job logs
def get_fine_tune_logs(job_id):
    logs_response = client.fine_tuning.jobs.list_events(job_id)
    return logs_response

# Retrieve the logs for the failed fine-tuning job
logs = get_fine_tune_logs(fine_tune_job_id)

# Print the logs
for log in logs.data:
    print(f"{log.created_at}: {log.message}")



1717747840: The job failed due to an invalid training file. Invalid file format. Input file file-hu9XpjMVo5onPj18LGGIQMUp is in the prompt-completion format, but the specified model gpt-3.5-turbo-0125 is a chat model and requires chat-formatted data. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for details.
1717747828: Validating training file: file-hu9XpjMVo5onPj18LGGIQMUp
1717747828: Created fine-tuning job: ftjob-9O0pKBiwri1NFq79I5xAs0QQ


Chat formatted

In [40]:
import pandas as pd
import json

# Function to prepare data for chat-based fine-tuning
def prepare_chat_data(df):
    chat_data = []
    for _, row in df.iterrows():
        messages = [
            {"role": "system", "content": "You are an assistant that helps predict the time to resolve GitHub issues."},
            {"role": "user", "content": f"Title: {row['Issue Title']}\nBody: {row['Issue Body']}\nLabels: {row['Labels']}\nNumber of Comments: {row['Number of Comments']}\nComment Bodies: {row['Comment Bodies']}\nSocial Metrics: wordiness={row['wordiness']}, average_degree_centrality={row['average_degree_centrality']}, average_closeness={row['average_closeness']}, average_betweenness={row['average_betweenness']}, density={row['density']}, edges={row['edges']}, num_discussants={row['num_discussants']}\n"},
            {"role": "assistant", "content": row['Issue Lifetime']}
        ]
        chat_data.append({"messages": messages})
    return chat_data

# Prepare the chat data
chat_data = prepare_chat_data(data)

# Save as JSONL file
with open('chat_training_data.jsonl', 'w') as f:
    for entry in chat_data:
        json.dump(entry, f)
        f.write('\n')

# Print sample of the chat data
print(json.dumps(chat_data[:2], indent=2))


[
  {
    "messages": [
      {
        "role": "system",
        "content": "You are an assistant that helps predict the time to resolve GitHub issues."
      },
      {
        "role": "user",
        "content": "Title: enhancement request for macos add text that says \u201c\u2318 enter to submit lesson\nBody: is your feature request related to a problem please describesummary on macos platform it\u2019s not made clear that commandenter can be used to submit lessons despite the fact that this keyboard shortcut does work and is preferable i suggest making this clear to macbook the lessons say check your code ctrl enter on mac the ctrl key is in a slightly awkward position compared to the command key the command key is very easy and quick to hitthis might seem like a small request but i think that the change would be greatly appreciated and helpful to those who are a on macs and b appreciate keyboard shortcuts my personal story is i suffered through the lessons for awhile trying to get

Upload the Chat-Formatted Training File and Create a New Fine-Tune Job

In [41]:
import openai

# Read the OpenAI API key from a file
with open('openAiKey.txt', 'r') as file:
    api_key = file.read().strip()

# Initialize the OpenAI client
client = openai.OpenAI(api_key=api_key)

# Upload the chat-formatted training file
upload_response = client.files.create(
    file=open('chat_training_data.jsonl', 'rb'),
    purpose='fine-tune'
)

training_file_id = upload_response.id
print(f"Training File ID: {training_file_id}")

# Create a new fine-tune job
fine_tune_response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    model="gpt-3.5-turbo"
)

# Print response
print(fine_tune_response)


Training File ID: file-mgVn8vMoZkEUnxypkwArQq7b
FineTuningJob(id='ftjob-7KYfEjLJRIIzjxM0sgnaxp3g', created_at=1717748347, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-iXy8zWZJPm0q9LU8ZICnDE47', result_files=[], status='validating_files', trained_tokens=None, training_file='file-mgVn8vMoZkEUnxypkwArQq7b', validation_file=None, user_provided_suffix=None, seed=624297924, estimated_finish=None, integrations=[])


In [42]:
import time

# Function to check the status of the fine-tuning job
def check_fine_tune_status(job_id):
    status_response = client.fine_tuning.jobs.retrieve(job_id)
    return status_response

# Fine-tuning job ID
fine_tune_job_id = fine_tune_response.id
print(f"Fine-tuning Job ID: {fine_tune_job_id}")

# Monitor the fine-tuning job status
status = check_fine_tune_status(fine_tune_job_id)
print(f"Initial Status: {status.status}")

# Poll the status until the job is complete
while status.status not in ['succeeded', 'failed']:
    time.sleep(60)  # Wait for 60 seconds before checking the status again
    status = check_fine_tune_status(fine_tune_job_id)
    print(f"Current Status: {status.status}")

# Check the final status and retrieve model details if successful
if status.status == 'succeeded':
    fine_tuned_model = status.fine_tuned_model
    print(f"Fine-tuning succeeded. Fine-tuned Model: {fine_tuned_model}")
else:
    print("Fine-tuning failed. Please check the details and logs for more information.")


Fine-tuning Job ID: ftjob-7KYfEjLJRIIzjxM0sgnaxp3g
Initial Status: validating_files
Current Status: validating_files
Current Status: validating_files
Current Status: validating_files
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
Current Status: running
C

In [47]:
from sklearn.model_selection import train_test_split

# Assuming 'data' is your original DataFrame
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


In [48]:
import json

def prepare_chat_data(df):
    chat_data = []
    for _, row in df.iterrows():
        messages = [
            {"role": "system", "content": "You are an assistant that helps predict the time to resolve GitHub issues."},
            {"role": "user", "content": f"Title: {row['Issue Title']}\nBody: {row['Issue Body']}\nLabels: {row['Labels']}\nNumber of Comments: {row['Number of Comments']}\nComment Bodies: {row['Comment Bodies']}\nSocial Metrics: wordiness={row['wordiness']}, average_degree_centrality={row['average_degree_centrality']}, average_closeness={row['average_closeness']}, average_betweenness={row['average_betweenness']}, density={row['density']}, edges={row['edges']}, num_discussants={row['num_discussants']}\n"},
            {"role": "assistant", "content": row['Issue Lifetime']}
        ]
        chat_data.append({"messages": messages})
    return chat_data

# Prepare the chat data
chat_train_data = prepare_chat_data(train_data)

# Save as JSONL file
with open('chat_training_data.jsonl', 'w') as f:
    for entry in chat_train_data:
        json.dump(entry, f)
        f.write('\n')

# Print sample of the chat data
print(json.dumps(chat_train_data[:2], indent=2))


[
  {
    "messages": [
      {
        "role": "system",
        "content": "You are an assistant that helps predict the time to resolve GitHub issues."
      },
      {
        "role": "user",
        "content": "Title: challenege instructions does not match challenge code\nBody: challenge target even numbered elements using jquery has an issueuser agent is mozilla50 windows nt 100 win64 x64 rv520 gecko20100101 firefox520please describe how to reproduce this issue and include links to screenshots if possiblethe challenge instructions state note that jquery is zeroindexed meaning that counterintuitively odd selects the second element fourth element and so on try selecting all the evennumbered elements and giving them the classes of animated and shakehowever the checks sayyou should use the even function to modify these elements which means that the odd numbered elements assuming that the previous explanation is accurate would be changed not the even either the instruction text should 

In [51]:
import openai
import time

# Read the OpenAI API key from a file
with open('openAiKey.txt', 'r') as file:
    api_key = file.read().strip()

# Initialize the OpenAI client
client = openai.OpenAI(api_key=api_key)

# Fine-tuning job ID (replace with your actual fine-tuning job ID)
fine_tune_job_id = "ftjob-7KYfEjLJRIIzjxM0sgnaxp3g"

# Function to check the status of the fine-tuning job and retrieve the model ID
def check_fine_tune_status(job_id):
    status_response = client.fine_tuning.jobs.retrieve(job_id)
    return status_response

# Monitor the fine-tuning job status
status = check_fine_tune_status(fine_tune_job_id)
print(f"Initial Status: {status.status}")

# Poll the status until the job is complete
while status.status not in ['succeeded', 'failed']:
    time.sleep(60)  # Wait for 60 seconds before checking the status again
    status = check_fine_tune_status(fine_tune_job_id)
    print(f"Current Status: {status.status}")

# Check the final status and retrieve model details if successful
if status.status == 'succeeded':
    fine_tuned_model = status.fine_tuned_model
    print(f"Fine-tuning succeeded. Fine-tuned Model: {fine_tuned_model}")
else:
    print("Fine-tuning failed. Please check the details and logs for more information.")


Initial Status: succeeded
Fine-tuning succeeded. Fine-tuned Model: ft:gpt-3.5-turbo-0125:personal::9XQB3irt


In [98]:
# Export the DataFrame to a CSV file
data.to_csv('output.csv', index=False)
test_data.to_csv('test_output.csv', index=False)