In [1]:
import os
import re
import requests
import jsonlines
import json
import time

from collections import defaultdict
import tokenize
from io import StringIO

import ast
import astor

In [2]:
def count_tokens(code):
    """Count the non-blank lexical tokens in a Python source string.

    The source is tokenized with ``tokenize.generate_tokens``. Tokens whose
    text is empty or whitespace-only are not counted; every other token,
    comments included, counts as one.

    Args:
        code: Python source code as a string.

    Returns:
        The number of non-blank tokens seen. If tokenization fails part-way,
        the error is printed and the count reached so far is returned.
    """
    token_count = 0

    try:
        for token in tokenize.generate_tokens(StringIO(code).readline):
            # Layout-only tokens have blank text and are skipped.
            if token.string.strip():
                token_count += 1
    except (tokenize.TokenError, SyntaxError) as e:
        # Inconsistent dedents surface as IndentationError (a SyntaxError
        # subclass) rather than TokenError; treat both as a soft failure so
        # one malformed snippet does not abort a whole batch.
        print(f"Tokenize error: {e}")

    return token_count

In [3]:
# 简化规则1：合并连续的多个相同值的赋值操作

def simplify_continuous_assignments(code):
    """Merge runs of consecutive ``name = <integer>`` lines sharing one value.

    A line takes part only if, apart from indentation and trailing blanks,
    it consists of exactly one identifier, ``=`` and an unsigned integer
    literal. Consecutive such lines with identical indentation and identical
    literal text are merged into ``a=b=...=value``. A qualifying line that
    has no neighbour to merge with is still emitted in the compact
    ``name=value`` form. Every other line is passed through untouched.

    Args:
        code: Source text. Leading and trailing whitespace of the whole
            text is stripped before it is split into lines.

    Returns:
        The rewritten source, lines joined with ``"\\n"``.
    """
    # fullmatch is essential: a prefix match would truncate ``x = 10 + y``
    # to ``x=10`` and would treat ``y = 10`` as a repeat of the value ``1``.
    assignment = re.compile(r'([ \t]*)([^\W\d]\w*)[ \t]*=[ \t]*(\d+)[ \t]*')

    lines = code.strip().splitlines()
    result = []

    i = 0
    while i < len(lines):
        first = assignment.fullmatch(lines[i])
        if first is None:
            result.append(lines[i])
            i += 1
            continue

        indent, name, value = first.groups()
        var_list = [name]
        i += 1

        # Absorb following lines that assign the same literal at the same
        # indentation level (i.e. in the same block).
        while i < len(lines):
            follower = assignment.fullmatch(lines[i])
            if (follower is None or follower.group(1) != indent
                    or follower.group(3) != value):
                break
            var_list.append(follower.group(2))
            i += 1

        result.append(indent + '='.join(var_list) + '=' + value)

    return '\n'.join(result)

In [4]:
# 简化规则2：多个变量引用删除的简化

def simplify_continuous_deletes(code):
    """Merge runs of consecutive ``del`` statements into a single ``del``.

    A line takes part only if, apart from indentation and trailing blanks,
    it is ``del`` followed by a comma-separated list of plain identifiers.
    Consecutive such lines with identical indentation are merged into
    ``del a, b, ...``. Lines that delete subscripts, attributes or anything
    else that is not a bare name are passed through untouched.

    Args:
        code: Source text. Leading and trailing whitespace of the whole
            text is stripped before it is split into lines.

    Returns:
        The rewritten source, lines joined with ``"\\n"``.
    """
    # fullmatch keeps ``del a[0]`` / ``del a.b`` from being truncated to
    # ``del a``. The match is case-sensitive because ``del`` is a keyword.
    identifier = r'[^\W\d]\w*'
    del_statement = re.compile(
        r'([ \t]*)del[ \t]+(' + identifier
        + r'(?:[ \t]*,[ \t]*' + identifier + r')*)[ \t]*'
    )

    def names_of(match):
        return [name.strip() for name in match.group(2).split(',')]

    lines = code.strip().splitlines()
    result = []

    i = 0
    while i < len(lines):
        first = del_statement.fullmatch(lines[i])
        if first is None:
            result.append(lines[i])
            i += 1
            continue

        indent = first.group(1)
        var_list = names_of(first)
        i += 1

        # Absorb following del lines at the same indentation level.
        while i < len(lines):
            follower = del_statement.fullmatch(lines[i])
            if follower is None or follower.group(1) != indent:
                break
            var_list.extend(names_of(follower))
            i += 1

        result.append(indent + 'del ' + ', '.join(var_list))

    return '\n'.join(result)

In [5]:
# 简化规则3：运算操作符简化，例如c=c+a简化成c+=a

def simplify_operations(code):
    """Rewrite ``c = c <op> expr`` lines as ``c <op>= expr``.

    Each physical line is parsed on its own with ``ast``. A line is
    rewritten only when it is a single assignment to one plain name whose
    value is a binary operation with that same name as its left operand.
    Because the decision is made on the parse tree, operator precedence is
    respected: ``c = c + a * b`` becomes ``c += a * b`` while
    ``c = c * a + b`` and ``c = c - a - b`` are left alone. Indentation,
    trailing comments and line endings are preserved.

    Lines that do not parse as a standalone statement (for example a piece
    of a multi-line statement) and assignments embedded in a one-line
    compound statement such as ``if x: c = c + 1`` are left unchanged.

    Args:
        code: Source text.

    Returns:
        The source text with qualifying lines rewritten.
    """
    augmentable = {
        ast.Add: '+', ast.Sub: '-', ast.Mult: '*', ast.Div: '/',
        ast.Mod: '%', ast.BitAnd: '&', ast.BitOr: '|', ast.BitXor: '^',
        ast.LShift: '<<', ast.RShift: '>>',
        ast.Pow: '**', ast.FloorDiv: '//', ast.MatMult: '@',
    }

    def rewrite(statement):
        """Return the augmented form of one stripped line, or None."""
        try:
            parsed = ast.parse(statement).body
        except (SyntaxError, ValueError):
            return None
        if len(parsed) != 1 or not isinstance(parsed[0], ast.Assign):
            return None

        assign = parsed[0]
        value = assign.value
        if not (len(assign.targets) == 1
                and isinstance(assign.targets[0], ast.Name)
                and isinstance(value, ast.BinOp)
                and isinstance(value.left, ast.Name)
                and value.left.id == assign.targets[0].id
                and type(value.op) in augmentable):
            return None

        operand = ast.get_source_segment(statement, value.right)
        if operand is None:
            return None

        # AST column offsets count UTF-8 bytes, so slice the encoded text to
        # recover whatever follows the statement (comment, trailing ';').
        tail = statement.encode('utf-8')[assign.end_col_offset:].decode('utf-8')
        symbol = augmentable[type(value.op)]
        candidate = f'{assign.targets[0].id} {symbol}= {operand}{tail}'

        # Re-parse the result: dropping the parentheses around the operand
        # is not always legal (e.g. ``(y := 1)``), so only accept a rewrite
        # that parses back to the same right-hand expression.
        try:
            check = ast.parse(candidate).body
        except (SyntaxError, ValueError):
            return None
        if (len(check) == 1 and isinstance(check[0], ast.AugAssign)
                and ast.dump(check[0].value) == ast.dump(value.right)):
            return candidate
        return None

    pieces = []
    for line in code.splitlines(keepends=True):
        statement = line.strip()
        replacement = rewrite(statement) if '=' in statement else None
        if replacement is None:
            pieces.append(line)
            continue
        lead = line[:len(line) - len(line.lstrip())]
        trail = line[len(line.rstrip()):]
        pieces.append(lead + replacement + trail)

    return ''.join(pieces)


In [6]:
# 简化规则4：简单条件语句简化

class IfElseToConditionalExpression(ast.NodeTransformer):
    """Collapse a two-branch ``if``/``else`` into one conditional statement.

    An ``if`` whose body and ``else`` each hold exactly one statement of the
    same kind is replaced as follows:

    * ``x = a`` / ``x = b``      ->  ``x = a if test else b``
    * ``x += a`` / ``x += b``    ->  ``x += a if test else b``
    * ``x += a`` / ``x -= b``    ->  ``x = x + a if test else x - b``
      (plain-name targets only; note this rebinds ``x`` instead of updating
      it in place)
    * ``return a`` / ``return b``  ->  ``return a if test else b``
    * ``expr_a`` / ``expr_b``    ->  ``expr_a if test else expr_b``

    Assignments are merged only when both sides have a single target and the
    targets are structurally identical, including subscript indices.

    Attributes:
        matched: True once at least one ``if`` has been rewritten.
    """

    def __init__(self):
        self.matched = False  # This will track whether any transformation was made

    def visit_If(self, node):
        """Rewrite ``node`` if it fits one of the supported shapes."""
        # Children first, so an inner if/else that collapses to a single
        # statement can make the enclosing if/else eligible too.
        self.generic_visit(node)

        if len(node.body) != 1 or len(node.orelse) != 1:
            return node

        true_statement = node.body[0]
        false_statement = node.orelse[0]
        new_statement = None

        if isinstance(true_statement, ast.Assign) and isinstance(false_statement, ast.Assign):
            # Chained assignments (a = b = 1) cannot be folded without
            # losing targets, so require exactly one target on each side.
            if (len(true_statement.targets) == 1 and len(false_statement.targets) == 1 and
                    self._compare_targets(true_statement.targets[0], false_statement.targets[0])):
                new_statement = self._create_conditional_assignment(node, true_statement, false_statement)

        elif isinstance(true_statement, ast.AugAssign) and isinstance(false_statement, ast.AugAssign):
            if self._compare_targets(true_statement.target, false_statement.target):
                new_statement = self._create_conditional_aug_assign(node, true_statement, false_statement)

        elif isinstance(true_statement, ast.Return) and isinstance(false_statement, ast.Return):
            new_statement = ast.Return(
                value=ast.IfExp(
                    test=node.test,
                    body=self._return_value(true_statement),
                    orelse=self._return_value(false_statement)
                )
            )

        elif isinstance(true_statement, ast.Expr) and isinstance(false_statement, ast.Expr):
            # Expression statements, including function calls (e.g. print, append)
            new_statement = ast.Expr(
                value=ast.IfExp(
                    test=node.test,
                    body=true_statement.value,
                    orelse=false_statement.value
                )
            )

        if new_statement is None:
            return node

        self.matched = True
        return ast.copy_location(new_statement, node)

    @staticmethod
    def _return_value(statement):
        """Return the expression of a ``return``; a bare ``return`` yields ``None``."""
        if statement.value is None:
            return ast.Constant(value=None)
        return statement.value

    def _compare_targets(self, target1, target2):
        """
        Compare two assignment targets. Supports variable names, attributes, and subscripts.

        Subscripts match only when both the container and the index
        expression are identical.
        """
        if isinstance(target1, ast.Name) and isinstance(target2, ast.Name):
            # Simple variable names
            return target1.id == target2.id
        elif isinstance(target1, ast.Attribute) and isinstance(target2, ast.Attribute):
            # Object attribute (e.g., obj.attr)
            return (target1.attr == target2.attr and
                    self._compare_targets(target1.value, target2.value))
        elif isinstance(target1, ast.Subscript) and isinstance(target2, ast.Subscript):
            # Subscript assignment (e.g., a[i]): a[i] and a[j] are different targets
            return (self._compare_targets(target1.value, target2.value) and
                    ast.dump(target1.slice) == ast.dump(target2.slice))
        else:
            # Different or unsupported target kinds cannot be merged
            return False

    def _create_conditional_assignment(self, node, true_statement, false_statement):
        """
        Create a conditional assignment like: x = true_value if condition else false_value
        """
        return ast.Assign(
            targets=[true_statement.targets[0]],
            value=ast.IfExp(
                test=node.test,
                body=true_statement.value,
                orelse=false_statement.value
            )
        )

    def _create_conditional_aug_assign(self, node, true_statement, false_statement):
        """
        Merge two augmented assignments to the same target.

        Same operator: ``x += (a if condition else b)``.
        Different operators on a plain name: ``x = x + a if condition else x - b``.
        Returns None when neither form applies.
        """
        target = true_statement.target

        if type(true_statement.op) is type(false_statement.op):
            return ast.AugAssign(
                target=target,
                op=true_statement.op,
                value=ast.IfExp(
                    test=node.test,
                    body=true_statement.value,
                    orelse=false_statement.value
                )
            )

        # Expanding to "target = target <op> value" reads the target twice,
        # which is only obviously safe for a plain variable name.
        if not isinstance(target, ast.Name):
            return None

        def load_target():
            # The operand needs its own Load-context node; the Store-context
            # target node must not be reused on the right-hand side.
            return ast.Name(id=target.id, ctx=ast.Load())

        return ast.Assign(
            targets=[target],
            value=ast.IfExp(
                test=node.test,
                body=ast.BinOp(left=load_target(), op=true_statement.op, right=true_statement.value),
                orelse=ast.BinOp(left=load_target(), op=false_statement.op, right=false_statement.value)
            )
        )


In [7]:
# 简化规则5：有字典映射的条件语句简化

class IfElseDictGetSimplifier(ast.NodeTransformer):
    """Rewrite a membership-guarded lookup as a ``.get()`` call.

    The pattern recognised is::

        if key in mapping:
            target = mapping[key]
        else:
            target = default

    which becomes ``target = mapping.get(key, default)``. ``mapping`` must be
    a plain name, ``key`` a plain name or a constant, and both branches must
    assign to the same single target.

    NOTE(review): the rewrite assumes ``mapping`` provides ``.get`` (e.g. a
    dict) and that ``default`` may be evaluated unconditionally; neither can
    be verified from the syntax tree.

    Attributes:
        matched: True once at least one ``if`` has been rewritten.
    """

    def __init__(self):
        self.matched = False  # Records whether any simplification was made

    def visit_If(self, node):
        """Replace ``node`` with a ``.get()`` assignment when it fits the pattern."""
        replacement = self._as_get_assignment(node)
        if replacement is None:
            return self.generic_visit(node)

        self.matched = True
        return ast.copy_location(replacement, node)

    def _as_get_assignment(self, node):
        """Return the ``target = mapping.get(key, default)`` node, or None."""
        test = node.test
        # Exactly one "in" comparison; chained comparisons are excluded.
        if not (isinstance(test, ast.Compare) and len(test.ops) == 1 and
                isinstance(test.ops[0], ast.In)):
            return None

        key = test.left
        dictionary = test.comparators[0]
        # Check node kinds before touching .id: the operands may be calls,
        # attributes, literals, etc.
        if not isinstance(dictionary, ast.Name) or not isinstance(key, (ast.Name, ast.Constant)):
            return None

        if len(node.body) != 1 or len(node.orelse) != 1:
            return None
        hit, miss = node.body[0], node.orelse[0]
        if not (isinstance(hit, ast.Assign) and isinstance(miss, ast.Assign)):
            return None

        # Both branches must assign to the same single target, otherwise the
        # else-branch assignment would be silently redirected.
        if len(hit.targets) != 1 or len(miss.targets) != 1:
            return None
        if ast.dump(hit.targets[0]) != ast.dump(miss.targets[0]):
            return None

        # The if-branch must read mapping[key] with the very same mapping and key.
        lookup = hit.value
        if not (isinstance(lookup, ast.Subscript) and isinstance(lookup.value, ast.Name) and
                lookup.value.id == dictionary.id):
            return None
        subscript_key = lookup.slice
        if type(subscript_key).__name__ == 'Index':
            # Older parsers wrap the subscript expression in an Index node.
            subscript_key = subscript_key.value
        if ast.dump(subscript_key) != ast.dump(key):
            return None

        return ast.Assign(
            targets=[hit.targets[0]],
            value=ast.Call(
                func=ast.Attribute(
                    value=ast.Name(id=dictionary.id, ctx=ast.Load()),
                    attr='get',
                    ctx=ast.Load()
                ),
                args=[key, miss.value],
                keywords=[]
            )
        )

In [8]:
# 简化规则6：多条件语句简化

class NestedIfSimplifier(ast.NodeTransformer):
    """Collapse ``if`` / ``else: if`` / ``else`` ladders.

    When the outer body, the nested body and the nested ``else`` each hold
    one statement of the same kind, the ladder is folded into a chained
    conditional expression:

    * three assignments to the same single target
      ->  ``x = a if t1 else b if t2 else c``
    * three returns  ->  ``return a if t1 else b if t2 else c``
    * three calls of the same plain-named function with exactly one
      positional argument and no keywords
      ->  ``f(a if t1 else b if t2 else c)``

    Otherwise, if the first statement of the outer body and of the nested
    body are of the same kind, the nested ``if`` is re-attached as the sole
    ``else`` content and the node is flagged as matched.

    Attributes:
        matched: True once at least one ``if`` has been handled as above.
    """

    def __init__(self):
        self.matched = False  # Records whether any simplification was made

    def visit_If(self, node):
        """Fold or re-attach the nested ``if`` found in ``node``'s else branch."""
        # Process children first
        self.generic_visit(node)

        # Only an else branch that consists of a single nested if is of interest
        if len(node.orelse) != 1 or not isinstance(node.orelse[0], ast.If):
            return node
        nested_if = node.orelse[0]

        # Case 1: all three branches hold one statement of the same kind
        if (len(node.body) == 1 and isinstance(node.body[0], (ast.Assign, ast.Return, ast.Expr)) and
            len(nested_if.body) == 1 and isinstance(nested_if.body[0], type(node.body[0])) and
            len(nested_if.orelse) == 1 and isinstance(nested_if.orelse[0], type(node.body[0]))):

            new_node = self._fold_branches(
                node.test, nested_if.test,
                node.body[0], nested_if.body[0], nested_if.orelse[0]
            )
            if new_node is not None:
                self.matched = True
                return ast.copy_location(new_node, node)

        # Case 2: not foldable as a whole, keep an if-elif-else ladder
        elif isinstance(node.body[0], type(nested_if.body[0])):
            new_elif = ast.If(
                test=nested_if.test,
                body=nested_if.body,
                orelse=nested_if.orelse
            )
            node.orelse = [ast.copy_location(new_elif, nested_if)]
            self.matched = True
            return node

        return node

    def _fold_branches(self, outer_test, inner_test, outer_op, inner_op_true, inner_op_false):
        """Return one statement equivalent to the three branches, or None."""
        branches = (outer_op, inner_op_true, inner_op_false)

        def chain(first, second, third):
            # first if outer_test else (second if inner_test else third)
            return ast.IfExp(
                test=outer_test,
                body=first,
                orelse=ast.IfExp(test=inner_test, body=second, orelse=third)
            )

        if isinstance(outer_op, ast.Assign):
            # Every branch must assign to the same single target.
            if not all(len(stmt.targets) == 1 for stmt in branches):
                return None
            target = outer_op.targets[0]
            if not (self._compare_targets(target, inner_op_true.targets[0]) and
                    self._compare_targets(target, inner_op_false.targets[0])):
                return None
            return ast.Assign(
                targets=[target],
                value=chain(outer_op.value, inner_op_true.value, inner_op_false.value)
            )

        if isinstance(outer_op, ast.Return):
            # A bare ``return`` has value None; spell it out as a constant.
            values = [stmt.value if stmt.value is not None else ast.Constant(value=None)
                      for stmt in branches]
            return ast.Return(value=chain(*values))

        if isinstance(outer_op, ast.Expr):
            calls = [stmt.value for stmt in branches]
            # Each branch must be f(<one positional arg>) for the same plain
            # name f; anything else cannot be expressed as f(<conditional>).
            foldable = all(
                isinstance(call, ast.Call) and isinstance(call.func, ast.Name) and
                len(call.args) == 1 and not isinstance(call.args[0], ast.Starred) and
                not call.keywords
                for call in calls
            )
            if not foldable or len({call.func.id for call in calls}) != 1:
                return None
            # The first alternative is the OUTER branch's argument.
            return ast.Expr(
                value=ast.Call(
                    func=outer_op.value.func,
                    args=[chain(*(call.args[0] for call in calls))],
                    keywords=[]
                )
            )

        return None

    def _compare_targets(self, target1, target2):
        """
        Compare two assignment targets. Supports variable names, attributes, and subscripts.

        Subscripts match only when both the container and the index
        expression are identical.
        """
        if isinstance(target1, ast.Name) and isinstance(target2, ast.Name):
            return target1.id == target2.id
        elif isinstance(target1, ast.Attribute) and isinstance(target2, ast.Attribute):
            return target1.attr == target2.attr and self._compare_targets(target1.value, target2.value)
        elif isinstance(target1, ast.Subscript) and isinstance(target2, ast.Subscript):
            return (self._compare_targets(target1.value, target2.value) and
                    ast.dump(target1.slice) == ast.dump(target2.slice))
        return False


In [9]:
# 简化规则7：for循环简化

class TransformForLoop(ast.NodeTransformer):
    """Turn an empty-list initialisation plus append loop into a comprehension.

    The pattern recognised is two directly adjacent statements in the same
    statement list::

        result = []
        for item in iterable:
            result.append(expr)

    which become ``result = [expr for item in iterable]``. The loop must
    contain exactly that one ``append`` call and have no ``else`` clause, and
    neither ``expr``, the loop target nor the iterable may mention
    ``result`` or contain ``yield``/``await``/``:=``.

    Adjacency is required because moving the iteration up to a distant
    ``result = []`` could evaluate the iterable before it is defined.

    NOTE(review): unlike the loop, a comprehension does not leave the loop
    variable bound afterwards; code that reads it after the loop would
    change behaviour.

    Attributes:
        matched: True once at least one loop has been folded.
        assignment_nodes: list name -> the assignment node that now holds
            the comprehension.
        to_remove: the ``for`` nodes that were folded away.
    """

    def __init__(self):
        self.assignment_nodes = {}
        self.to_remove = set()
        self.matched = False

    def generic_visit(self, node):
        """Visit children, then fold append loops in each statement list of ``node``."""
        node = super().generic_visit(node)
        for field in ('body', 'orelse', 'finalbody'):
            statements = getattr(node, field, None)
            # Some nodes (e.g. conditional expressions) use these field
            # names for a single expression rather than a statement list.
            if isinstance(statements, list):
                setattr(node, field, self._fold_append_loops(statements))
        return node

    def _fold_append_loops(self, statements):
        """Return ``statements`` with every ``x = []`` + append loop pair merged."""
        folded = []
        for stmt in statements:
            comprehension = self._comprehension_for(folded[-1], stmt) if folded else None
            if comprehension is None:
                folded.append(stmt)
                continue

            # Reuse the existing assignment and drop the loop; the statement
            # list never becomes empty because the assignment stays.
            assignment = folded[-1]
            assignment.value = ast.copy_location(comprehension, assignment.value)
            self.assignment_nodes[assignment.targets[0].id] = assignment
            self.to_remove.add(stmt)
            self.matched = True
        return folded

    @staticmethod
    def _comprehension_for(assignment, loop):
        """Return the list comprehension equivalent to the pair, or None."""
        # First statement: <name> = []
        if not (isinstance(assignment, ast.Assign) and
                len(assignment.targets) == 1 and
                isinstance(assignment.targets[0], ast.Name) and
                isinstance(assignment.value, ast.List) and
                len(assignment.value.elts) == 0):
            return None

        # Second statement: a for loop whose whole body is one statement
        if not (isinstance(loop, ast.For) and not loop.orelse and len(loop.body) == 1):
            return None
        call = loop.body[0].value if isinstance(loop.body[0], ast.Expr) else None
        if not (isinstance(call, ast.Call) and
                isinstance(call.func, ast.Attribute) and
                call.func.attr == 'append' and
                isinstance(call.func.value, ast.Name) and
                len(call.args) == 1 and not call.keywords):
            return None

        list_name = assignment.targets[0].id
        if call.func.value.id != list_name:
            return None

        element = call.args[0]
        if isinstance(element, ast.Starred):
            return None

        # Inside a comprehension the list is not built yet, and yield/await/
        # walrus have different (or no) meaning there.
        unsupported = (ast.Yield, ast.YieldFrom, ast.Await, ast.NamedExpr)
        for part in (element, loop.target, loop.iter):
            for sub in ast.walk(part):
                if isinstance(sub, unsupported):
                    return None
                if isinstance(sub, ast.Name) and sub.id == list_name:
                    return None

        return ast.ListComp(
            elt=element,
            generators=[ast.comprehension(
                target=loop.target,
                iter=loop.iter,
                ifs=[],
                is_async=0
            )]
        )



In [10]:
# 简化规则8：return语句简化

def simplify_return_statements(code):
    """Drop redundant parentheses that wrap an entire ``return`` value.

    ``return (expr)`` is rewritten to ``return expr`` only when the closing
    parenthesis is the last thing on the line (a trailing comment is
    allowed) and removing the parentheses parses to exactly the same return
    statement. Consequently ``return (a + b) * c``, ``return ()``,
    generator expressions and multi-line parenthesised values are left
    untouched. Everything outside the rewritten ``return (...)`` text,
    including line endings and following lines, is preserved.

    Args:
        code: Source text.

    Returns:
        The source text with redundant return parentheses removed.
    """
    # The lookahead pins the ')' to the end of the line without consuming
    # the newline, so the next line can never be glued onto the return.
    pattern = re.compile(r'\breturn[ \t]*\((.*)\)(?=[ \t\r]*(?:#.*)?$)', re.MULTILINE)

    def parsed_return(expression):
        """Dump of ``return <expression>`` inside a function, or None if invalid."""
        try:
            return ast.dump(ast.parse('def _f():\n return ' + expression))
        except (SyntaxError, ValueError):
            return None

    def unwrap(match):
        inner = match.group(1).strip()
        if not inner:
            # ``return ()`` returns an empty tuple; ``return`` would not.
            return match.group(0)
        bare = parsed_return(inner)
        if bare is None or bare != parsed_return('(' + inner + ')'):
            return match.group(0)
        return 'return ' + inner

    return pattern.sub(unwrap, code)


In [11]:
#创建对象并添加到list中

def addItem_mbpp(item_list,i,simplified_code):
    """Append a record pairing a dataset entry with its simplified code.

    The record copies the 'text', 'code', 'task_id', 'test_list',
    'test_setup_code' and 'challenge_test_list' values of ``i`` and stores
    ``simplified_code`` under 'simplified_code'.

    Args:
        item_list: List that receives the new record (mutated in place).
        i: Mapping providing the keys listed above.
        simplified_code: The simplified source to store.
    """
    record = {
        'text': i['text'],
        'code': i['code'],
        'simplified_code': simplified_code,
        'task_id': i['task_id'],
        'test_list': i['test_list'],
        'test_setup_code': i['test_setup_code'],
        'challenge_test_list': i['challenge_test_list'],
    }
    item_list.append(record)

In [25]:
#读数据集里的代码构造短代码
def extract_code(file_path):
    """Apply every simplification rule to each entry of a JSON-lines dataset.

    Each entry's 'code' is run through the eight rules independently:

    * 1, 2, 3, 8 - text rewrites (merged assignments, merged ``del``,
      augmented assignment, return parentheses);
    * 4, 5, 6, 7 - AST rewrites (conditional expression, ``dict.get``,
      nested ``if``, list comprehension), rendered with ``astor``.

    A result is kept when it has fewer tokens than the original code; for
    the AST rules the transformer must also report that it actually matched,
    so that mere re-formatting by ``astor`` is not counted as a
    simplification.

    Args:
        file_path: Path of the JSON-lines file to read (UTF-8).

    Returns:
        A dict with keys 1..10, each mapping to a list of records built by
        ``addItem_mbpp``. Only keys 1..8 are ever filled.
    """
    code_dict = {rule_id: [] for rule_id in range(1, 11)}

    # Rules that work on the source text.
    text_rules = (
        (1, simplify_continuous_assignments),
        (2, simplify_continuous_deletes),
        (3, simplify_operations),
        (8, simplify_return_statements),
    )
    # Rules that work on the syntax tree.
    ast_rules = (
        (4, IfElseToConditionalExpression),
        (5, IfElseDictGetSimplifier),
        (6, NestedIfSimplifier),
        (7, TransformForLoop),
    )

    with open(file_path, "r", encoding='utf8') as f:
        for item in jsonlines.Reader(f):
            code = item['code']
            token_count = count_tokens(code)

            for rule_id, simplify in text_rules:
                simplified_code = simplify(code)
                if count_tokens(simplified_code) < token_count:
                    addItem_mbpp(code_dict[rule_id], item, simplified_code)

            for rule_id, transformer_cls in ast_rules:
                # The transformers mutate the tree in place, so every rule
                # gets a freshly parsed tree and cannot inherit the rewrites
                # of the rules before it.
                try:
                    tree = ast.parse(code)
                except SyntaxError as e:
                    print(f"Skipping AST rules, code does not parse: {e}")
                    break

                transformer = transformer_cls()
                transformed_tree = ast.fix_missing_locations(transformer.visit(tree))
                if not transformer.matched:
                    continue

                simplified_code = astor.to_source(transformed_tree)
                if count_tokens(simplified_code) < token_count:
                    addItem_mbpp(code_dict[rule_id], item, simplified_code)

    return code_dict
        


In [23]:
def main():
    """Build the simplified-code lists from the MBPP file and write one
    JSON-lines file per rule (rules 1 to 8) under ``./MBPP``."""
    source_path = './MBPP/mbpp.jsonl'
    records_by_rule = extract_code(source_path)

    for rule_id in range(1, 9):
        target_path = './MBPP/short_code_' + str(rule_id) + '.jsonl'
        with open(target_path, 'w') as out_file:
            # One JSON document per line.
            for record in records_by_rule[rule_id]:
                out_file.write(json.dumps(record) + '\n')

In [26]:
# Run the pipeline only when this module is the entry point (__name__ is
# '__main__'), not when its definitions are imported from elsewhere.
if __name__ == '__main__':
    main()