In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import ast


# 1. 加载主数据集
data = pd.read_csv("CarGroup/0.csv",header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:09:49,2,3,1,0.0,64897.0,560.0,0.2,100,...,5858,121.33633,30.77259,3.669,3.436,35,32,[1],[-15000],[-500.0]
1,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:10:19,1,3,1,45.0,64899.0,527.0,-16.0,95,...,7268,121.34016,30.77230,3.310,3.289,30,27,[1],[2265],[-54.5]
2,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:10:49,1,3,1,21.0,64899.0,524.0,-3.1,94,...,7511,121.34265,30.77181,3.294,3.267,30,27,[1],[1078],[-48.0]
3,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:11:19,1,3,1,40.0,64899.0,508.0,50.1,93,...,6823,121.34280,30.76931,3.213,3.149,30,27,[1],[2457],[75.6]
4,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:11:49,1,3,1,52.0,64900.0,514.0,48.1,93,...,8649,121.34597,30.76920,3.241,3.182,30,27,[1],[3068],[67.4]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8708986,0ff938307317815a9feb862ffadde60b,2020-12-31 23:29:41,2,1,2,0.0,39512.0,370.3,-0.9,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,
8708987,0ff938307317815a9feb862ffadde60b,2020-12-31 23:29:51,2,1,2,0.0,39512.0,370.3,-1.0,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,
8708988,0ff938307317815a9feb862ffadde60b,2020-12-31 23:30:01,2,1,2,0.0,39512.0,370.3,-1.0,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,
8708989,0ff938307317815a9feb862ffadde60b,2020-12-31 23:30:11,2,1,2,0.0,39512.0,370.3,-1.0,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,


In [4]:



# 定义列名
column_names = [
    'vin','数据采集时间', '车辆状态', '充电状态', '运行模式', '车速', '累积里程', '总电压', '总电流',
    'SOC', 'DC-DC状态', '档位', '绝缘电阻', '经度', '纬度', '电池单体电压最高值',
    '电池单体电压最低值', '最高温度值', '最低温度值', '驱动电机序号', '驱动电机转速', '驱动电机转矩'
]

# 插入列名
data.columns = column_names

# 2. 主数据预处理
# 对于非数值数据列，如车辆状态、充电状态、运行模式、DC-DC状态、次行类型等，进行编码
data['车辆状态'] = data['车辆状态'].apply(lambda x: int(x) if x != '255' else -1)  # 处理异常值
data['充电状态'] = data['充电状态'].apply(lambda x: int(x) if x != '255' else -1)
data['运行模式'] = data['运行模式'].apply(lambda x: int(x) if x != '255' else -1)
data['DC-DC状态'] = data['DC-DC状态'].apply(lambda x: int(x) if x != '255' else -1)
# data['次行类型'] = data['次行类型'].apply(lambda x: int(x) if x != 'NULL' else -1)

# 去除含有空数据的行
data = data[data['驱动电机序号'] != 'None']

# # 将主数据中包含方括号的字符串转换为数值列表
# data['驱动电机序号'] = data['驱动电机序号'].apply(lambda x: ast.literal_eval(x))
# data['驱动电机转速'] = data['驱动电机转速'].apply(lambda x: ast.literal_eval(x))
# data['驱动电机转矩'] = data['驱动电机转矩'].apply(lambda x: ast.literal_eval(x))

# # 遍历来获得所有列的唯一值
# for column in data.columns:
#     unique_values = data[column].unique()
#     print(f"Column {column} unique values: {unique_values}")

data

Unnamed: 0,vin,数据采集时间,车辆状态,充电状态,运行模式,车速,累积里程,总电压,总电流,SOC,...,绝缘电阻,经度,纬度,电池单体电压最高值,电池单体电压最低值,最高温度值,最低温度值,驱动电机序号,驱动电机转速,驱动电机转矩
0,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:09:49,2,3,1,0.0,64897.0,560.0,0.2,100,...,5858,121.33633,30.77259,3.669,3.436,35,32,[1],[-15000],[-500.0]
1,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:10:19,1,3,1,45.0,64899.0,527.0,-16.0,95,...,7268,121.34016,30.77230,3.310,3.289,30,27,[1],[2265],[-54.5]
2,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:10:49,1,3,1,21.0,64899.0,524.0,-3.1,94,...,7511,121.34265,30.77181,3.294,3.267,30,27,[1],[1078],[-48.0]
3,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:11:19,1,3,1,40.0,64899.0,508.0,50.1,93,...,6823,121.34280,30.76931,3.213,3.149,30,27,[1],[2457],[75.6]
4,000cdb8748c791b5b57f2fe4c5030f71,2020-11-01 08:11:49,1,3,1,52.0,64900.0,514.0,48.1,93,...,8649,121.34597,30.76920,3.241,3.182,30,27,[1],[3068],[67.4]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8708986,0ff938307317815a9feb862ffadde60b,2020-12-31 23:29:41,2,1,2,0.0,39512.0,370.3,-0.9,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,
8708987,0ff938307317815a9feb862ffadde60b,2020-12-31 23:29:51,2,1,2,0.0,39512.0,370.3,-1.0,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,
8708988,0ff938307317815a9feb862ffadde60b,2020-12-31 23:30:01,2,1,2,0.0,39512.0,370.3,-1.0,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,
8708989,0ff938307317815a9feb862ffadde60b,2020-12-31 23:30:11,2,1,2,0.0,39512.0,370.3,-1.0,100,...,2603,0.00000,0.00000,1.000,1.000,-40,-40,,,


In [5]:
data2 = pd.read_csv("trip_data.csv")

# 筛选出第二个数据集中 trip_kind 列值为 'D' 的数据
filtered_data2 = data2[data2['trip_kind'] == 'D']

# 初始化一个空的DataFrame，用于存储筛选后的结果
final_data = pd.DataFrame(columns=data.columns)

# 遍历第一个数据集
for index, row in data.iterrows():
    vin = row['vin']
    collection_time = row['数据采集时间']

    # 查找是否有相同 vin 值的数据在第二个数据集中
    matching_data2 = filtered_data2[filtered_data2['vin'] == vin]

    # 遍历匹配的数据
    for _, matching_row in matching_data2.iterrows():
        start_time = matching_row['start_collectiontime']
        end_time = matching_row['end_collectiontime']

        # 如果数据采集时间在 start_time 和 end_time 之间，将该行添加到结果DataFrame
        if start_time <= collection_time <= end_time:
            final_data = final_data.append(row, ignore_index=True)

# 打印筛选后的结果
print(final_data)

In [None]:
pd.to_csv(final_data,"final_data.csv")