In [32]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use a smaller, faster model
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to("cpu")

MAX_TOKENS = 16384

def review_pr_ds(title, description, code_diff):
    prompt = f"""
    You are a senior software engineer reviewing a pull request.

    **PR Title:** {title}
    **PR Description:** {description}
    **Code Diff:**
    ```
    {code_diff}
    ```

    Please review this PR and provide:
    - **Summary** of the changes.
    - **Potential issues** (e.g., performance, security, code style).
    - **Suggestions for improvement**.
    - **Final Recommendation**: Approve, Request Changes, or Needs Discussion.

    Provide only **one** response and do not repeat information..
    """
    token_count = len(tokenizer.encode(prompt))  
    if token_count > MAX_TOKENS:
        return "SKIPPED: Input too long"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    output = model.generate(**inputs, max_new_tokens=500)
    return tokenizer.decode(output[:, inputs["input_ids"].shape[1]:][0], skip_special_tokens=True)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
title = "Fix memory leak in data pipeline"
description = "This PR fixes a memory leak in the data pipeline by refactoring the buffer management logic."
code_diff = """- buffer = load_data()
+ with load_data() as buffer:"""

review = review_pr_ds(title, description, code_diff)
print(review)


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


```

---

**Response:**

**Summary**: This PR fixes a memory leak in the data pipeline by refactoring the buffer management logic. The original code was using a `with` statement to load data into a buffer, which was causing a memory leak. The refactored code uses a `with` statement to ensure that the buffer is properly cleaned up after use.

**Potential issues**: The potential issue with this change is that it could lead to memory leaks if the buffer is not properly cleaned up after use. This could potentially slow down the data pipeline and potentially lead to memory usage issues.

**Suggestions for improvement**: One suggestion for improvement is to use a `try/finally` block instead of a `with` statement to ensure that the buffer is properly cleaned up after use. This would prevent potential memory leaks.

**Final Recommendation**: Approve.



In [24]:
pr_title = "Title: Fixed a bug - no panic anymore when logining in without TTY"
pr_description = """Fixes #8956

Signed-off-by: Marianna mtesselh@gmail.com
"""
code_diff = """diff --git a/api/client/commands.go b/api/client/commands.go
index 6b9c4d4d8fbb5..dfc07835c9016 100644
--- a/api/client/commands.go
+++ b/api/client/commands.go
@@ -289,7 +289,10 @@ func (cli *DockerCli) CmdLogin(args ...string) error {
 	// the password or email from the config file, so prompt them
 	if username != authconfig.Username {
 		if password == "" {
-			oldState, _ := term.SaveState(cli.inFd)
+			oldState, err := term.SaveState(cli.inFd)
+			if err != nil {
+				return err
+			}
 			fmt.Fprintf(cli.out, "Password: ")
 			term.DisableEcho(cli.inFd, oldState)
 
diff --git a/integration-cli/docker_cli_login_test.go b/integration-cli/docker_cli_login_test.go
new file mode 100644
index 0000000000000..cf134e4c9b39c
--- /dev/null
+++ b/integration-cli/docker_cli_login_test.go
@@ -0,0 +1,35 @@
+package main
+
+import (
+	"bytes"
+	"io"
+	"os"
+	"os/exec"
+	"testing"
+)
+
+func TestLoginWithoutTTY(t *testing.T) {
+	cmd := exec.Command(dockerBinary, "login")
+	// setup STDOUT and STDERR so that we see any output and errors in our console
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	// create a buffer with text then a new line as a return
+	buf := bytes.NewBuffer([]byte("buffer test string \n"))
+
+	// use a pipe for stdin and manually copy the data so that
+	// the process does not get the TTY
+	in, err := cmd.StdinPipe()
+	if err != nil {
+		t.Fatal(err)
+	}
+	// copy the bytes into the commands stdin along with a new line
+	go io.Copy(in, buf)
+
+	// run the command and block until it's done
+	if err := cmd.Run(); err == nil {
+		t.Fatal("Expected non nil err when loginning in & TTY not available")
+	}
+
+	logDone("login - login without TTY")
+}
"""
pr_review = review_pr_ds(pr_title, pr_description, code_diff)
print(pr_review)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


```

Reviewer: Marianna mtesselh@gmail.com

    **Response:**

    **Summary:**
    The PR introduces a new test case that checks if the Docker CLI login command works correctly when run without a TTY. The test case uses a pipe to manually copy the data into the command's stdin, which prevents the command from getting the TTY. This change improves the test coverage and ensures that the login command works correctly when run without a TTY.

    **Potential issues:**
    - The test case could potentially fail if the command's stdin is not properly managed.
    - The test case could potentially be slow if the command's stdin is large.

    **Suggestions for improvement:**
    - Improve the test case to handle larger inputs.
    - Improve the test case to handle errors in the command's stdin.

    **Final Recommendation:**
    - Approve the PR.

    **Comment:**
    The PR is well-reviewed and does a good job of covering the test case. The test case is efficient and covers all the potentia

In [25]:
import torch
torch.cuda.empty_cache()

In [33]:
import pandas as pd
import json

# Load JSON data (replace 'your_file.json' with the actual file)
with open('/kaggle/input/public-pr-final/public_pr_final.json', 'r') as f:
    data = json.load(f)

# Convert JSON to DataFrame
df = pd.DataFrame(data)

df_sampled = df.sample(n=10, random_state=42)

# Function to get LLM review
def get_llm_review(row):
    review = review_pr_ds(row['pr_title'], row['pr_description'], row['pr_diff'])
    print(review)
    return review

df_sampled['llm_review'] = df_sampled.apply(get_llm_review, axis=1)

Token indices sequence length is longer than the specified maximum sequence length for this model (19285 > 16384). Running this sequence through the model will result in indexing errors


SKIPPED: Input too long


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


SKIPPED: Input too long


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Response:**
    **Summary:** The proposed changes allow the user to select the subnet for a single availability zone machine pool.
    **Potential issues:** The proposed changes may affect the performance of the system as it requires fetching all the private subnets for a single availability zone.
    **Suggestions for improvement:** The proposed changes could be improved by fetching the private subnets for the availability zone in a batch, which could improve the performance.
    **Final Recommendation:** Approve.



Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Response:**
    **Summary**:
    The PR introduces a new feature: an RSA accumulator. The accumulator is an interface for a cryptographic accumulator that can accumulate data and verify if some data was added to the accumulator. The new feature allows for more efficient and secure data storage and verification.

    **Potential issues**:
    - **Performance**: The current implementation of the accumulator might not be the most efficient for large data sets.
    - **Security**: The current implementation of the accumulator might not be secure enough for all types of data.
    - **Code style**: The current code style might not be as clean or readable as it could be.

    **Suggestions for improvement**:
    - **Performance**: Improve the performance of the accumulator by using more efficient algorithms or data structures.
    - **Security**: Improve the security of the accumulator by using stronger cryptographic algorithms or implementing additional checks.
    - **Code style**: I

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


```

    **Summary:**
    - The PR introduces support for Kinesis Firehose delivery stream in AWS.
    - The current implementation only supports Redshift destination.
    - The PR includes support for Elasticsearch and S3 destinations.
    - The PR includes acceptance tests for the new features.

    **Potential issues**:
    - The current implementation may not handle all edge cases.
    - The current implementation may not support all AWS services.

    **Suggestions for improvement**:
    - Implement support for all AWS services.
    - Improve the acceptance tests.

    **Final Recommendation**: Approve.

    **Code Diff:**
    ```
    diff --git a/aws/resource_aws_kinesis_firehose_delivery_stream.go b/aws/resource_aws_kinesis_firehose_delivery_stream.go
    index 2dcacb71b24f..eaef7ff3190b 100644
    --- a/aws/resource_aws_kinesis_firehose_delivery_stream.go
    +++ b/aws/resource_aws_kinesis_firehose_delivery_stream.go
    @@ -1,14 +1,17 @@
    package aws

    import (
    "byte

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Response:**
    **Summary:**
    The pull request fixes a deadlock issue that was caused during the server shutdown. The deadlock was caused by the RPC server trying to write to a channel that was already closed. The fix involves running the RPC server in a separate goroutine to avoid the deadlock.

    **Potential issues:**
    - The deadlock issue could potentially be caused by the RPC server trying to write to a channel that was already closed.
    - The server shutdown could potentially block the RPC server from writing to the channel, causing the server to hang.

    **Suggestions for improvement:**
    - Implement a mechanism to handle the case where the RPC server is already shutting down.
    - Implement a mechanism to handle the case where the RPC server is shutting down but there are still pending writes to the channel.

    **Final Recommendation:**
    - Approve.

    **Code Diff:**
    ```
    diff --git a/cli/server/server.go b/cli/server/server.go
    index bcea6b

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


SKIPPED: Input too long
```

    **Reviewer Comments:**
    - The PR is well-written and follows the best practices for Go code.
    - The `ValidateToken` method is added to the `InteractionCallback` struct.
    - The `ValidateToken` method is used to validate the verification tokens.
    - The `TestInteractionCallback_ValidateToken` test is added to validate the `ValidateToken` method.
    - The PR is ready for review.

    **Approval**: Approved.

    **Reviewer Feedback:**
    - The PR is well-written and follows the best practices for Go code.
    - The `ValidateToken` method is added to the `InteractionCallback` struct.
    - The `ValidateToken` method is used to validate the verification tokens.
    - The PR is ready for review.

    **Comment:**
    - The PR is ready for review.

    **Reviewer Assessment:**
    - The PR is well-written and follows the best practices for Go code.
    - The `ValidateToken` method is added to the `InteractionCallback` struct.
    - The `ValidateTo

In [40]:
df_sampled.to_json('pr_reviews.json', orient='records', indent=4, force_ascii=False)
# Read the file and remove escaping
with open('pr_reviews.json', 'r') as f:
    json_data = f.read().replace('\\/', '/')

# Write back the cleaned JSON
with open('pr_reviews.json', 'w') as f:
    f.write(json_data)

In [34]:
print(df_sampled.head(5))

                                              pr_title  \
521           metrics: support usage inside CSI driver   
737                                    Bump 1.26 proof   
740  SDA-7895 Enable editing subnet in machinepools...   
660                        Feat/en 577 rsa accumulator   
411  Add import support for kinesis firehose delive...   

                                        pr_description  \
521  **What type of PR is this?**\r\n/kind feature\...   
737                                               None   
740  ROSA CLI UX for\r\nhttps://issues.redhat.com/b...   
660  crypto - add values to rsa accumulator; verify...   
411  @radeksimko Right now, I've just added import ...   

                                              html_url  \
521  https://github.com/kubernetes-csi/csi-lib-util...   
737  https://github.com/openshift/openshift-apiserv...   
740  https://github.com/openshift/rosa/pull/1048#di...   
660  https://github.com/multiversx/mx-chain-go/pull...   
411  https:/

In [41]:
from sentence_transformers import SentenceTransformer, util
import pandas as pd

# Load a pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # Efficient and fast

# Function to compute similarity
def compute_similarity(row):
    llm_review = row['llm_review']
    original_comment = row['body']  
    
    if "SKIPPED: Input too long" in llm_review or not llm_review or not original_comment:
        return None  # Mark as None to drop later

    # Convert both texts to embeddings
    emb1 = model.encode(llm_review, convert_to_tensor=True)
    emb2 = model.encode(original_comment, convert_to_tensor=True)

    # Compute cosine similarity
    similarity = util.pytorch_cos_sim(emb1, emb2).item()
    
    return similarity  # Score between 0 and 1

# Apply function to DataFrame
df_sampled['similarity_score'] = df_sampled.apply(compute_similarity, axis=1)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [44]:
df_sampled.to_json('pr_reviews_with_similarity.json', orient='records', indent=4, force_ascii=False)
# Read the file and remove escaping
with open('pr_reviews_with_similarity.json', 'r') as f:
    json_data = f.read().replace('\\/', '/')

# Write back the cleaned JSON
with open('pr_reviews_with_similarity.json', 'w') as f:
    f.write(json_data)

In [42]:
print(df_sampled)

                                              pr_title  \
521           metrics: support usage inside CSI driver   
737                                    Bump 1.26 proof   
740  SDA-7895 Enable editing subnet in machinepools...   
660                        Feat/en 577 rsa accumulator   
411  Add import support for kinesis firehose delive...   
678  rpc: Fix deadlock produced during server shutdown   
626                              Rework embedded forms   
513            koord-scheduler: add reservation plugin   
859        [Update] Update Interactions Message Method   
136       feat(vote): vote module and evm vote handler   

                                        pr_description  \
521  **What type of PR is this?**\r\n/kind feature\...   
737                                               None   
740  ROSA CLI UX for\r\nhttps://issues.redhat.com/b...   
660  crypto - add values to rsa accumulator; verify...   
411  @radeksimko Right now, I've just added import ...   
678  * closes

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Use a smaller, faster model
model_name = "deepseek-ai/deepseek-coder-1.3b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True).to("cpu")

MAX_TOKENS = 16384

def compare_with_llm(llm_review, original_comment):
    if "SKIPPED: Input too long" in llm_review or not llm_review or not original_comment:
        return None  # Skip invalid rows

    prompt = f"""
    You are an AI designed to compare two PR reviews and output a similarity score. You will be given two texts, contextually analyze the similarity between the two texts 
    and return a similarity score between 0 and 1. 

    **Text 1 :** {llm_review}
    **Text 2 :** {original_comment}
    
    Please respond with a similarity score and nothing else. 
    """

    token_count = len(tokenizer.encode(prompt))  
    if token_count > MAX_TOKENS:
        return "SKIPPED: Input too long"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    output = model.generate(**inputs, max_new_tokens=200)
    return tokenizer.decode(output[:, inputs["input_ids"].shape[1]:][0], skip_special_tokens=True)

print(compare_with_llm("""**Response:**
    **Summary:** The proposed changes allow the user to select the subnet for a single availability zone machine pool.
    **Potential issues:** The proposed changes may affect the performance of the system as it requires fetching all the private subnets for a single availability zone.
    **Suggestions for improvement:** The proposed changes could be improved by fetching the private subnets for the availability zone in a batch, which could improve the performance.
    **Final Recommendation:** Approve.""", "** Does this feature is supported only for single AZ clusters?"))

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Similarity Score:** 0.7

This is a simple example of how you can use the AI to compare two texts. The AI's output is a similarity score between 0 and 1, where 0 means the texts are completely different and 1 means they are identical.

Please note that the AI model used here is a simple one and may not be able to handle more complex tasks or nuances in the language used in the texts.



In [5]:
import pandas as pd
import json

# Load the JSON file
with open("pr_reviews.json", "r") as file:
    data = json.load(file)  # Load JSON data into a dictionary

# Convert to DataFrame
df_sampled = pd.DataFrame(data)


def get_similarity(row):
    output = compare_with_llm(row['llm_review'], row['body'])
    print(output)
    return output

df_sampled['similarity_from_llm'] = df_sampled.apply(get_similarity, axis=1)

Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


None
None


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Similarity Score:** 0.7

This is a simple example of how you can use an AI model to compare two texts and output a similarity score. The AI model is trained to understand the context and semantics of the text, and it can provide a similarity score between 0 and 1.

Please note that the actual implementation of this feature depends on the specific AI model and the programming language you are using.



Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Score :** 0.8

This is a simple example of how you can use an AI model to compare two texts. The AI model is trained to understand the context and semantics of the text, and it can provide a similarity score between 0 and 1. The score is a measure of how similar the two texts are in terms of their content and structure.



Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.


```

    **Similarity Score:** 0.7

    **Response:** I'm sorry, but as an AI, I don't have the ability to respond to code reviews or provide feedback on code changes. I suggest you to review the code and provide feedback to the author.

Please note that the similarity score is a measure of how similar the two texts are in terms of content and structure. It's not a perfect measure of how similar the two texts are in terms of their actual content or functionality.



Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Similarity Score:** 0.8

    **Response:**
    **Summary:**
    The pull request fixes a deadlock issue that was caused during the server shutdown. The deadlock was caused by the RPC server trying to write to a channel that was already closed. The fix involves running the RPC server in a separate goroutine to avoid the deadlock.

    **Potential issues:**
    - The deadlock issue could potentially be caused by the RPC server trying to write to a channel that was already closed.
    - The server shutdown could potentially block the RPC server from writing to the channel, causing the server to hang.

    **Suggestions for improvement:**
    - Implement a mechanism to handle the case where the RPC server is already shutting down.
    - Implement a mechanism to handle the case where the RPC server is shutting


Setting `pad_token_id` to `eos_token_id`:32021 for open-end generation.



    **Response:** 0.75

    **Explanation:** The similarity score is a float between 0 and 1. A score of 0 means the texts are completely different, while a score of 1 means the texts are identical. In this case, the similarity score is 0.75, indicating a moderate degree of similarity between the two texts.

None

    **Similarity Score:** 0.8

    **Response:** I'm sorry, but as an AI, I don't have the ability to compare two PR reviews. I can only provide information and answer questions based on the data I was trained on.

This is a simple example of how you can use the similarity score to compare two texts. In a real-world scenario, you would need to use a more sophisticated method to compare the texts, such as using a machine learning model trained to understand semantic similarity.

None


In [6]:
df_sampled.to_json('pr_reviews_with_similarity_from_llm.json', orient='records', indent=4, force_ascii=False)
# Read the file and remove escaping
with open('pr_reviews_with_similarity_from_llm.json', 'r') as f:
    json_data = f.read().replace('\\/', '/')

# Write back the cleaned JSON
with open('pr_reviews_with_similarity_from_llm.json', 'w') as f:
    f.write(json_data)