# 从xml文件提取格式化数据到csv文件
输出文件为:  
    1.后续操作源数据origin_train_data.csv  
    2.后续操作测试数据  

In [None]:
import glob
import pandas as pd
from bs4 import BeautifulSoup

In [3]:
# Directory the raw XML files are read from.
XML_PATH = "/media/HDD3/xxx/360DataCon/stage1_ori_xml/"
# Directory the generated CSV files are written to.
OUTPUT_PATH = "/media/HDD3/xxx/360DataCon/stage1_extract_data/"

In [None]:
def _attr_text(tag, name):
    """Return attribute ``name`` of ``tag`` as text, or "" when it is absent."""
    value = tag.get(name)
    return "" if value is None else str(value)


def yield_orgin_csv(pd_name, file_type):
    """
    Extract per-sample call sequences from the raw XML files into a CSV file.

    Every file matched by ``{XML_PATH}/train/{file_type}/*`` is parsed with
    BeautifulSoup (``lxml`` parser) and contributes one row. For each
    ``<action>`` element the ``api_name``, ``call_pid`` and ``ret_value``
    attributes are collected, together with the ``value`` attribute of every
    nested ``<exinfo>`` element; each sequence is stored as one
    space-separated string.

    A missing attribute contributes an empty token, so one incomplete
    ``<action>`` no longer blanks the whole sequence of its sample.

    Args:
        pd_name: DataFrame that receives the columns ``api_name``,
            ``call_pid``, ``ret_value`` and ``exinfos``; modified in place.
        file_type: Name of the sub-directory below ``train/`` to read; also
            used as the prefix of the output file name.

    Returns:
        None. The result is available through ``pd_name`` and is written to
        ``{OUTPUT_PATH}{file_type}_data.csv`` (UTF-8, without the index).
    """
    all_api_names, all_call_pids, all_ret_values, all_exinfos = [], [], [], []

    # glob returns matches in arbitrary order; sorting makes the row order
    # reproducible between runs.
    paths = sorted(glob.glob("{}/train/{}/*".format(XML_PATH, file_type)))
    for count, path in enumerate(paths, 1):
        # NOTE(review): the file is decoded with the platform default
        # encoding; confirm that this matches the encoding of the XML dumps.
        with open(path, "r") as fp:
            xml = BeautifulSoup(fp, "lxml")

        api_names, call_pids, ret_values, exinfos = [], [], [], []
        for action in xml.find_all("action"):
            # Tag.get yields None (it does not raise) for a missing
            # attribute, and a None would make the joins below fail.
            api_names.append(_attr_text(action, "api_name"))
            call_pids.append(_attr_text(action, "call_pid"))
            ret_values.append(_attr_text(action, "ret_value"))
            exinfos.extend(
                _attr_text(exinfo, "value")
                for exinfo in action.find_all("exinfo")
            )

        all_api_names.append(" ".join(api_names))
        all_call_pids.append(" ".join(call_pids))
        all_ret_values.append(" ".join(ret_values))
        all_exinfos.append(" ".join(exinfos))

        # Progress indicator: report every 300 processed files.
        if count % 300 == 0:
            print(count)

    pd_name["api_name"] = all_api_names
    pd_name["call_pid"] = all_call_pids
    pd_name["ret_value"] = all_ret_values
    pd_name["exinfos"] = all_exinfos
    pd_name.to_csv("{}{}_data.csv".format(OUTPUT_PATH, file_type), index=False, encoding="utf-8")

In [None]:
# Parse the white samples into a fresh DataFrame.
white_data = pd.DataFrame()
yield_orgin_csv(white_data, "white")

# Parse the black samples the same way.
black_data = pd.DataFrame()
yield_orgin_csv(black_data, "black")

In [None]:
# Merge the black and white sample tables into one labelled training set.
# Both CSV files are written as UTF-8 by yield_orgin_csv, so both are read
# back as UTF-8; decoding the black file as ISO-8859-1 would garble every
# non-ASCII character and persist the garbled text into the merged file.
black_data = pd.read_csv(OUTPUT_PATH + "black_data.csv", encoding="utf-8")
white_data = pd.read_csv(OUTPUT_PATH + "white_data.csv", encoding="utf-8")

# ignore_index gives the merged frame a unique 0..n-1 index instead of two
# overlapping ranges.
origin_train_data = pd.concat([black_data, white_data], ignore_index=True)

# Label column: 1 for every black row, 0 for every white row, in the same
# order the frames were concatenated.
safe_type = [1] * black_data.shape[0] + [0] * white_data.shape[0]
origin_train_data["safe_type"] = safe_type

origin_train_data.to_csv(OUTPUT_PATH + "origin_train_data.csv", encoding="utf-8", index=False)

### 提取结果形式

In [8]:
# Preview the first ten rows of the merged training set.
pd.read_csv(
    OUTPUT_PATH + "origin_train_data.csv",
    encoding="utf-8",
    nrows=10,
)

Unnamed: 0,api_name,call_pid,ret_value,exinfos,safe_type
0,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 c0000034 0 c0000034 0 0 0 0 ...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
1,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 0 c0000034 0 c0000034 0 0 0 ...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
2,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 0 0 c0000034 0 c0000034 0 0 ...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
3,AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 c0000034 0 0 c0000034 0 0 0 0 0 0 0 0 0 ...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
4,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Fak...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 0 0 c0000034 0 c0000034 0 0 ...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
5,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 c0000034 0 c000003...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
6,AnalyzeStart Fake_BeCreatedEx TryToAnalyze NtQ...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 c0000034 0 0 0 0 0 c0000034 0 0 0 0 c000...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
7,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 0 0 0 c0000034 0 c0000034 0 ...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
8,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Loa...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 c0000034 0 c0000034 0 0 c000...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
9,AnalyzeStart Fake_BeCreatedEx TryToAnalyze Unp...,0 396 396 396 396 396 396 396 396 396 396 396 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 c0000034 0 c000003...,"SyStem.exe 212 ""C:\program\1.exe"" direct C:\p...",1
