In [1]:
import pandas as pd 
import numpy as np

import os
import re
from tqdm import trange
import pickle

# Root folder of the flow-field dataset. The machine-specific default can be
# overridden without editing the notebook by setting FLOW_DATA_ROOT.
root = os.environ.get('FLOW_DATA_ROOT', 'E:\\FlowData\\')

In [2]:
# Matches ".../WS<digits>/.../WD<digits>..." with either "\" or "/" as the
# path separator, so the same code works on Windows and POSIX paths.
_WS_WD_PATTERN = re.compile(r'.*[\\/]WS(\d+)[\\/].*[\\/]WD(\d+).*')


def generate_WS_WD(filepath):
    """Extract the WD and WS numbers embedded in a directory path.

    The path must contain a ``WS<digits>`` component followed, at least one
    level deeper, by a component starting with ``WD<digits>``.

    Parameters
    ----------
    filepath : str
        Path to parse; both ``\\`` and ``/`` separators are accepted.

    Returns
    -------
    tuple of (np.int32, np.int32)
        ``(direction, speed)`` -- note that the WD number comes first.

    Raises
    ------
    ValueError
        If the path does not contain the expected WS/WD components.
    """
    result = _WS_WD_PATTERN.search(filepath)
    if result is None:
        raise ValueError(
            "Cannot find 'WS<n>' and 'WD<n>' components in path: %r" % (filepath,)
        )
    speed = np.int32(result.group(1))
    direction = np.int32(result.group(2))
    return (direction, speed)

def readOne(root,file):
    """Load one CSV file and prepend its speed/direction as two columns.

    The speed and direction are parsed from ``root`` (the directory path)
    by ``generate_WS_WD``; the CSV is read with its first row as header.

    Parameters
    ----------
    root : str
        Directory containing the file.
    file : str
        File name inside ``root``.

    Returns
    -------
    tuple
        ``(data, direction, speed)``. ``data`` is a float32 array whose first
        two columns hold ``speed`` and ``direction`` (in that order) followed
        by the CSV columns. If the file does not exist, a message is printed
        and ``data`` is an empty 1-D float32 array.
    """
    path = os.path.join(root, file)
    direction, speed = generate_WS_WD(root)

    # Guard clause: report the missing file and hand back an empty array.
    if not os.path.exists(path):
        print(path," not exist.")
        return np.array([]).astype(np.float32), direction, speed

    samples = np.array(pd.read_csv(path, header=0)).astype(np.float32)
    # One (speed, direction) pair per CSV row.
    prefix = np.repeat([[speed, direction]], samples.shape[0], axis=0)
    combined = np.concatenate((prefix, samples), axis=1)
    # Concatenating the integer prefix with float32 samples upcasts, so cast
    # back down to float32 before returning.
    return combined.astype(np.float32), direction, speed

def read_all_files(root):
    """Collect every file found under ``<root>/<ws>/SingleSample/<wd>/``.

    Entries of ``root`` that are not directories, or that have no
    ``SingleSample`` subdirectory, are skipped instead of raising. Directory
    listings are sorted so the result order is deterministic
    (``os.listdir`` returns entries in arbitrary order).

    Parameters
    ----------
    root : str
        Top-level dataset directory.

    Returns
    -------
    list of (str, str)
        ``(directory_path, file_name)`` pairs, one per regular file.
    """
    files = []
    for ws in sorted(os.listdir(root)):
        folderpath = os.path.join(root, ws, 'SingleSample')
        # Also false when <root>/<ws> itself is not a directory.
        if not os.path.isdir(folderpath):
            continue
        for wd in sorted(os.listdir(folderpath)):
            wdpath = os.path.join(folderpath, wd)
            if not os.path.isdir(wdpath):
                continue
            for file in sorted(os.listdir(wdpath)):
                if os.path.isfile(os.path.join(wdpath, file)):
                    files.append((wdpath, file))

    return files

def readData(root, test_directions=(10,70,100,270,330)):
    """Read every data file under ``root`` and split the fields by direction.

    Each file is loaded with ``readOne``; the last column of the loaded array
    is dropped. Fields whose direction is in ``test_directions`` go to the
    test list, all others to the training list. Files that yield no data
    (e.g. a path that no longer exists) are skipped rather than crashing on
    the 2-D slice.

    Parameters
    ----------
    root : str
        Top-level dataset directory passed to ``read_all_files``.
    test_directions : iterable of int, optional
        Direction values that are held out as test data.

    Returns
    -------
    tuple of (list, list)
        ``(train, test)`` lists of 2-D float32 arrays, one per file.
    """
    train = []
    test = []
    pathnames = read_all_files(root)

    for i in trange(len(pathnames)):
        dirpath, file = pathnames[i]
        onedata, direction, _ = readOne(dirpath, file)
        # readOne returns an empty array for a missing file; slicing it with
        # [:, :-1] would raise IndexError, so skip it.
        if onedata.size == 0:
            continue
        onedata = onedata[:, :-1]
        if direction in test_directions:
            test.append(onedata)
        else:
            train.append(onedata)
    return train, test

In [3]:
# Load every field under `root`, already split into training and test lists.
train, test = readData(root)

def read_wall(data):
    """Keep, for each field, only the rows whose column 9 equals zero.

    Parameters
    ----------
    data : iterable of 2-D numpy arrays
        Each array must have at least 10 columns.
        NOTE(review): presumably column 9 is a wall indicator -- confirm
        against the CSV layout.

    Returns
    -------
    list of numpy arrays
        One filtered array per input field, in the same order.
    """
    return [field[field[:, 9] == 0] for field in data]

def select_wd(data,wd):
    """Return the fields whose first row has its column-1 value in ``wd``.

    Parameters
    ----------
    data : iterable of 2-D sequences
        Fields to filter; each must have at least one row.
    wd : container
        Accepted values for ``field[0][1]``.

    Returns
    -------
    list
        The matching fields, in their original order.
    """
    return [field for field in data if field[0][1] in wd]

def select_ws(data,ws):
    """Return the fields whose first row has its column-0 value in ``ws``.

    Parameters
    ----------
    data : iterable of 2-D sequences
        Fields to filter; each must have at least one row.
    ws : container
        Accepted values for ``field[0][0]``.

    Returns
    -------
    list
        The matching fields, in their original order.
    """
    def first_value_matches(field):
        return field[0][0] in ws

    return list(filter(first_value_matches, data))

def select_z(data,z):
    """Return the fields whose first row has its column-4 value in ``z``.

    Parameters
    ----------
    data : iterable of 2-D sequences
        Fields to filter; each must have at least one row of 5+ values.
    z : container
        Accepted values for ``field[0][4]``.

    Returns
    -------
    list
        The matching fields, in their original order.
    """
    kept = []
    for field in data:
        first_row = field[0]
        if first_row[4] not in z:
            continue
        kept.append(field)
    return kept

# Restrict the training set to fields with these six direction values,
# then report how many fields ended up in each split.
train = select_wd(train, [0, 60, 120, 180, 240, 300])

print(len(train), len(test), sep="\n")

100%|██████████| 240/240 [00:10<00:00, 21.83it/s]

90
75





In [4]:
# Stack all training fields into one table and keep a single row per
# (ws, wd, x, y, z) combination.
train_df = pd.DataFrame(
    np.concatenate(train),
    columns=['ws', 'wd', 'x','y','z','u','v','w','p'],
)
train_df = train_df.drop_duplicates(subset=['ws', 'wd', 'x','y','z']).reset_index(drop=True)
print(train_df.shape)

# Group by wind speed ("ws") and take the per-group minimum and maximum of
# every other column.
grouped_min = train_df.groupby('ws').min().reset_index()
grouped_max = train_df.groupby('ws').max().reset_index()

# Per-group range (max - min) of every column, with the "ws" key carried along.
ws_df = grouped_min[['ws']].copy()
grouped_diff = ws_df.join(grouped_max.iloc[:, 1:] - grouped_min.iloc[:, 1:])

# Attach the per-"ws" minima ("*_min") and ranges ("*_diff") to every row.
train_df_with_minmax = (
    train_df
    .merge(grouped_min, on='ws', suffixes=('', '_min'))
    .merge(grouped_diff, on='ws', suffixes=('', '_diff'))
)
print(train_df_with_minmax.shape)
train_df_with_minmax.head()

(264720, 9)
(264720, 25)


Unnamed: 0,ws,wd,x,y,z,u,v,w,p,wd_min,...,w_min,p_min,wd_diff,x_diff,y_diff,z_diff,u_diff,v_diff,w_diff,p_diff
0,1.0,0.0,4566.689941,2466.590088,50.0,-0.060196,0.082321,0.0962,-0.64706,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
1,1.0,0.0,4566.689941,2468.590088,50.0,-0.018959,0.102387,0.139222,-0.650885,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
2,1.0,0.0,4568.689941,2466.590088,50.0,-0.053497,0.058023,0.13998,-0.644847,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
3,1.0,0.0,4568.689941,2468.590088,50.0,-0.036447,0.082526,0.170594,-0.648247,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
4,1.0,0.0,4566.689941,2470.590088,50.0,-0.004608,0.122091,0.153222,-0.653046,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025


In [5]:
# Build the test table the same way as the training one: stack the fields and
# keep one row per (ws, wd, x, y, z).
test_df = (
    pd.DataFrame(
        np.concatenate(test),
        columns=['ws', 'wd', 'x','y','z','u','v','w','p'],
    )
    .drop_duplicates(subset=['ws', 'wd', 'x','y','z'])
    .reset_index(drop=True)
)

# Attach the minima and ranges computed from the TRAINING data (grouped_min /
# grouped_diff), matched on "ws".
test_df_with_minmax = (
    test_df
    .merge(grouped_min, on='ws', suffixes=('', '_min'))
    .merge(grouped_diff, on='ws', suffixes=('', '_diff'))
)
print(test_df.shape)
print(test_df_with_minmax.shape)
test_df_with_minmax.head()

(220600, 9)
(220600, 25)


Unnamed: 0,ws,wd,x,y,z,u,v,w,p,wd_min,...,w_min,p_min,wd_diff,x_diff,y_diff,z_diff,u_diff,v_diff,w_diff,p_diff
0,1.0,10.0,4566.689941,2466.590088,50.0,-0.06238,-0.007652,-0.023653,-0.599281,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
1,1.0,10.0,4566.689941,2468.590088,50.0,-0.063309,0.016334,0.017415,-0.59823,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
2,1.0,10.0,4568.689941,2466.590088,50.0,-0.065492,0.006018,-0.004079,-0.598812,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
3,1.0,10.0,4568.689941,2468.590088,50.0,-0.068083,0.026074,0.034907,-0.598336,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
4,1.0,10.0,4566.689941,2470.590088,50.0,-0.05659,0.036638,0.038525,-0.597992,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025


In [6]:
# Rows whose "u" value is exactly zero form the "wall" subsets; every other
# row goes to the "nowall" subsets.
train_u_is_zero = train_df_with_minmax['u'] == 0
train_wall = train_df_with_minmax[train_u_is_zero]
train_nowall = train_df_with_minmax[~train_u_is_zero]
print(train_wall.shape)
print(train_nowall.shape)

test_u_is_zero = test_df_with_minmax['u'] == 0
test_wall = test_df_with_minmax[test_u_is_zero]
test_nowall = test_df_with_minmax[~test_u_is_zero]
print(test_wall.shape)
print(test_nowall.shape)

# Wall rows from both splits, stacked with a fresh 0..n-1 index.
wall = pd.concat([train_wall, test_wall], ignore_index=True)
print(wall.shape)

(14640, 25)
(250080, 25)
(12200, 25)
(208400, 25)
(26840, 25)


In [7]:
# Move a random subset of (x, y, z) points out of `test_nowall` into `rebuild`:
# for every distinct z value, sample up to REBUILD_POINTS_PER_Z points and take
# all rows of `test_nowall` located at those points.
REBUILD_POINTS_PER_Z = 100
REBUILD_SEED = 42

# One shared, seeded generator makes the sample reproducible across kernel
# restarts while still drawing different points for each z value.
rebuild_rng = np.random.RandomState(REBUILD_SEED)

rebuild_parts = []
for z in test_nowall['z'].unique():
    points_at_z = test_nowall.loc[test_nowall['z'] == z, ['x', 'y', 'z']].drop_duplicates()
    # Cap the sample size so a z level with fewer points does not raise.
    sampled_xyz = points_at_z.sample(
        n=min(REBUILD_POINTS_PER_Z, len(points_at_z)),
        random_state=rebuild_rng,
    )
    rebuild_parts.append(test_nowall.merge(sampled_xyz, on=['x', 'y', 'z'], how='inner'))

# Concatenate once instead of growing a DataFrame inside the loop.
if rebuild_parts:
    rebuild = pd.concat(rebuild_parts, ignore_index=True)
else:
    rebuild = test_nowall.iloc[0:0].copy()

# Remove the sampled rows from test_nowall: every sampled row now appears
# twice in the concatenation, and keep=False drops all copies of duplicated
# rows. This relies on test_nowall itself containing no fully identical rows.
# NOTE: this rebinds test_nowall, so re-running the cell samples from the
# already-reduced table; re-run the preceding cell first.
test_nowall = pd.concat([rebuild, test_nowall]).drop_duplicates(keep=False)

print(rebuild.shape)
print(test_nowall.shape)
rebuild.head()

(7500, 25)
(200900, 25)


Unnamed: 0,ws,wd,x,y,z,u,v,w,p,wd_min,...,w_min,p_min,wd_diff,x_diff,y_diff,z_diff,u_diff,v_diff,w_diff,p_diff
0,1.0,10.0,4569.390137,2480.659912,50.0,-0.017482,0.100779,0.067234,-0.599017,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
1,1.0,10.0,4570.879883,2485.110107,50.0,-0.009022,-0.009451,0.114981,-0.599885,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
2,1.0,10.0,4585.069824,2472.050049,50.0,-0.018954,-0.026756,0.162773,-0.576292,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
3,1.0,10.0,4587.640137,2490.26001,50.0,-0.037358,0.192332,0.042865,-0.600809,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025
4,1.0,10.0,4587.959961,2489.51001,50.0,-0.03893,0.19756,0.04048,-0.599733,0.0,...,-0.938139,-0.773651,300.0,48.0,48.0,20.0,1.583555,1.439418,1.914735,0.794025


In [8]:
def _minmax_normalize(frame):
    """Convert a 25-column table to an array and min-max scale columns 1-8.

    Columns 1:9 are replaced by ``(value - min) / range``, where the minima
    are read from columns 9:17 and the ranges from columns 17:25 of the same
    row. Column 0 and columns 9:25 are left unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table laid out as 9 value columns, 8 ``*_min`` columns and
        8 ``*_diff`` columns.

    Returns
    -------
    numpy.ndarray
        A new array; ``frame`` itself is not modified.
    """
    # copy=True guarantees a writable array that does not alias the frame
    # (to_numpy may otherwise return a view, which can be read-only).
    values = frame.to_numpy(copy=True)
    lower = values[:, 9:17]
    span = values[:, 17:25].copy()
    # A zero range would give 0/0 = NaN (or inf); divide by 1 instead so a
    # constant column maps to (value - min).
    span[span == 0] = 1
    values[:, 1:9] = (values[:, 1:9] - lower) / span
    return values


# From here on these names refer to normalized numpy arrays, not DataFrames.
train_nowall = _minmax_normalize(train_nowall)
test_nowall = _minmax_normalize(test_nowall)
wall = _minmax_normalize(wall)
rebuild = _minmax_normalize(rebuild)
test_df_with_minmax = _minmax_normalize(test_df_with_minmax)

In [10]:
# Write the five arrays back-to-back into one pickle file. They must be read
# with five successive pickle.load calls, in this same order:
# train_nowall, test_nowall, wall, rebuild, test_df_with_minmax.
with open("NN_sample_data.pkl","wb") as out_file:
    for array in (train_nowall, test_nowall, wall, rebuild, test_df_with_minmax):
        pickle.dump(array, out_file)