In [2]:
import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import re

# Unified data directories
Datadir = r'C:\Users\Administrator\Documents\MnewData'
output_path = r'C:\Users\Administrator\PYMo\Data'

def list_csv_files(directory, keyword):
    """List CSV files in the specified directory that contain the keyword."""
    return [file for file in os.listdir(directory) if keyword in file and file.endswith('.csv')]

def read_and_process_files(directory, keyword, columns=None, encoding='gbk'):
    """Read and process CSV files, returning a concatenated DataFrame."""
    files = list_csv_files(directory, keyword)
    dfs = []
    for file in files:
        file_path = os.path.join(directory, file)
        try:
            df = pd.read_csv(file_path, encoding=encoding, usecols=columns, na_values=["n/a", "na", "-"]) if columns else pd.read_csv(file_path, encoding=encoding, skiprows=2, header=0, na_values=["n/a", "na", "-"])
            dfs.append(df)
        except Exception as e:
            print(f"Error reading file {file}: {e}")
    return pd.concat(dfs, ignore_index=True).drop_duplicates() if dfs else pd.DataFrame()

def read_hubei_map_files(directory, filename):
    """Read the Hubei map file."""
    file_path = os.path.join(directory, filename)
    return gpd.read_file(file_path)

def split_field(field, is_rru):
    if is_rru:
        match = re.match(r'(.+)\(gNB=(\d+),invRRU=(\d+)\)', field)
        return (match.group(2), match.group(3)) if match else (None, None)
    else:
        match = re.match(r'(.+)\(gNB=(\d+)\)', field)
        return (match.group(1), match.group(2)) if match else (None, None)

def process_rru_pl(RRU_PL):
    """Process RRU_PL DataFrame."""
    RRU_PL[['BBUID', 'RRUID']] = RRU_PL['对象'].apply(lambda x: pd.Series(split_field(x, is_rru=True)))
    RRU_PL = RRU_PL[['BBUID', 'RRUID', '开始时间', 'AAU功耗[千瓦时]']]
    RRU_PL['AAU功耗[千瓦时]'] = pd.to_numeric(RRU_PL['AAU功耗[千瓦时]'], errors='coerce').round(4).fillna(0)
    return RRU_PL

def process_bbu_pl(BBU_PL):
    """Process BBU_PL DataFrame."""
    BBU_PL[['BBU名称', 'BBUID']] = BBU_PL['对象'].apply(lambda x: pd.Series(split_field(x, is_rru=False)))
    BBU_PL['站型'] = BBU_PL['BBU名称'].apply(lambda x: '宏站' if 'D5H' in x else ('微站' if 'D5M' in x else ('室分' if 'D5S' in x else '未知')))
    BBU_PL = BBU_PL[['BBU名称', 'BBUID', '站型', '开始时间', 'BBU功耗[千瓦时]', 'gNB基站CPU平均负荷(R1056_001)[%]', 'gNB基站CPU峰值负荷(R1056_002)[%]', 'BBU功耗(R1054_001)[W]']]
    
    for col in ['BBU功耗[千瓦时]', 'BBU功耗(R1054_001)[W]']:
        BBU_PL[col] = pd.to_numeric(BBU_PL[col], errors='coerce').round(4).fillna(0)
    
    for col in ['gNB基站CPU平均负荷(R1056_001)[%]', 'gNB基站CPU峰值负荷(R1056_002)[%]']:
        BBU_PL[col] = pd.to_numeric(BBU_PL[col], errors='coerce').fillna(0).astype(int)
    
    return BBU_PL

def process_rf_ap_cp(df_ap, df_rf, df_cp, gdf):
    """Merge RF, AP, and CP data and perform spatial join."""
    df_rfap = pd.merge(df_ap, df_rf, on=['网元标识', '射频单元编号'], how='inner')
    df_rfap = df_rfap[['网元标识', '小区本地ID', '射频单元编号', 'Longitude', 'Latitude']]  # Select and rename columns

    df_rac = pd.merge(df_rfap, df_cp, on=['网元标识', '小区本地ID'], how='inner')
    df_rac = df_rac[['网元标识', '小区本地ID', 'BBU机房', '基站名称', '小区名称', '工作频段', 'Longitude', 'Latitude']]  # Select and rename columns

    geometry = [Point(xy) for xy in zip(df_rac['Longitude'], df_rac['Latitude'])]
    df_rac = gpd.GeoDataFrame(df_rac, geometry=geometry)
    # 设置 CRS
    if df_rac.crs is None:
        df_rac = df_rac.set_crs(gdf.crs)

    return gpd.sjoin(df_rac, gdf, how='inner', predicate='within')

def process_bs_pl(BBU_PL, RRU_PL):
    """Process BS_PL data."""
    antenna_count = RRU_PL.groupby(['BBUID', '开始时间'])['RRUID'].nunique().reset_index(name='天线数量')
    total_rru_power = RRU_PL.groupby(['BBUID', '开始时间'])['AAU功耗[千瓦时]'].sum().reset_index(name='RRU总功耗')

    BBU_PL = pd.merge(BBU_PL, antenna_count, on=['BBUID', '开始时间'], how='left')
    BBU_PL = pd.merge(BBU_PL, total_rru_power, on=['BBUID', '开始时间'], how='left')
    BBU_PL['频段'] = BBU_PL['BBU名称'].apply(lambda x: '700M' if '700M' in x else '2.6G')

    return BBU_PL[['BBU名称', 'BBUID', '天线数量', 'RRU总功耗', '频段']]

def save_dataframe_to_csv(df, filename):
    """Save DataFrame to CSV file."""
    df.to_csv(os.path.join(output_path, filename), index=False, encoding='utf-8-sig')
    print(f"DataFrame exported to {filename}")

def read_and_process_kpi_files(directory, keyword='DT_PowerBI指标通报计数器_', file_extension='.csv'):
    """Read and process KPI files."""
    def load_csv_files(directory, keyword, file_extension):
        files = [file for file in os.listdir(directory) if keyword in file and file.lower().endswith(file_extension)]
        dfs = []
        for file in files:
            file_path = os.path.join(directory, file)
            try:
                df = pd.read_csv(file_path, skiprows=2, header=0, encoding='cp936', na_values=["n/a", "na", "-"])
                df.columns = df.columns.str.replace(' ', '')  # 去除列名中的空格
                dfs.append(df)
            except (UnicodeDecodeError, Exception) as e:
                print(f"读取文件时发生错误：{e}")
        return dfs

    def process_dates(df, date_columns):
        for column in date_columns:
            df[column] = pd.to_datetime(df[column].astype(str).str.split().str[0], errors='coerce').dt.strftime('%Y-%m-%d')
        return df

    def split_columns(df):
        df[["小区名称", "Other"]] = df["对象"].str.split('\\(g', n=1, expand=True)
        df[["NB", "nrCellCfg"]] = df["Other"].str.split(',', n=1, expand=True)
        df["NB"] = df["NB"].str.lstrip('NB=')
        df["nrCellCfg"] = df["nrCellCfg"].str.lstrip('nrCellCfg=').str.rstrip(')')
        df.drop(columns=["对象", "Other"], inplace=True)
        return df

    def convert_column_types(df, int_columns, float_columns):
        df[int_columns] = df[int_columns].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
        df[float_columns] = df[float_columns].apply(pd.to_numeric, errors='coerce').fillna(0).round(2)
        return df

    def extract_code(column_name):
        match = re.search(r'\((.*?)\)', column_name)
        return match.group(1) if match else column_name

    dfs = load_csv_files(directory, keyword, file_extension)
    if not dfs:
        print("没有成功读取任何文件，请检查文件路径和过滤条件。")
        return pd.DataFrame()

    df_kpi = pd.concat(dfs, ignore_index=True).drop_duplicates()

    date_columns = ['开始时间', '结束时间']
    df_kpi = process_dates(df_kpi, date_columns)

    df_kpi = split_columns(df_kpi)

    df_kpi['NB'] = df_kpi['NB'].astype(str)
    df_kpi['nrCellCfg'] = df_kpi['nrCellCfg'].astype(str)

    df_kpi['ID'] = df_kpi['NB'] + '_' + df_kpi['nrCellCfg']

    new_order = df_kpi.columns[-3:].tolist() + df_kpi.columns[:-3].tolist()
    df_kpi = df_kpi[new_order]

    int_columns = df_kpi.loc[:, 'gNB请求释放的5QI为1的Flow数(R2035_003)[个]':'gNBRRC连接建立成功次数-moVideoCall(R1001_019)[次]'].columns
    float_columns = ['5QI为1的平均Flow数(K1009_001)[个]', '5QI为2的平均Flow数(K1009_002)[个]']
    df_kpi = convert_column_types(df_kpi, int_columns, float_columns)

    df_kpi = df_kpi.drop(columns=['小区名称', 'Nr小区工作频段', '小区下行系统带宽(MHz)', '逻辑小区id'])

    df_kpi['小区用户面RLCSDU上行尾包字节数(R1501_005)[0.01KByte]'] /= 100
    df_kpi['小区用户面RLCSDU下行尾包字节数(R1501_006)[0.01KByte]'] /= 100

    df_kpi.columns = df_kpi.columns.to_series().apply(extract_code)

    return df_kpi

# Read and process data
df_AP = read_and_process_files(Datadir, '天线安装规划', ['网元标识', '远端射频单元编号', '本地小区标识1'])
df_AP.columns = ['网元标识', '射频单元编号', '小区本地ID']  # Rename columns for df_AP

gdffile = '湖北省村级边界.geojson'
gdf = read_hubei_map_files(Datadir, gdffile)

df_RF = read_and_process_files(Datadir, '射频单元规划', ['网元标识', '射频单元编号', '射频单元RRU安装经度', '射频单元RRU安装纬度'])
df_RF.columns = ['网元标识', '射频单元编号', 'Longitude', 'Latitude']  # Rename columns for df_RF

df_CP = read_and_process_files(Datadir, 'NR小区', ['网元标识', '网元名称', '小区本地ID', '小区友好名', 'Nr小区工作频段'])
df_CP.rename(columns={'小区友好名': '小区名称', '网元名称': 'BBU机房'}, inplace=True)
df_CP['工作频段'] = df_CP['Nr小区工作频段'].str.split('(').str[0]
df_CP['基站名称'] = df_CP['小区名称'].str.replace(r'(-26.*|-07.*)', '', regex=True)
df_CP = df_CP[['网元标识', 'BBU机房', '基站名称', '小区本地ID', '小区名称', '工作频段']]

gdf_RAC = process_rf_ap_cp(df_AP, df_RF, df_CP, gdf)

# Read BBU and RRU power consumption
BBU_PL = read_and_process_files(Datadir, 'DT_BBU功耗_')
RRU_PL = read_and_process_files(Datadir, 'DT_RRU功耗_')

# Process RRU_PL and BBU_PL
RRU_PL = process_rru_pl(RRU_PL)
BBU_PL = process_bbu_pl(BBU_PL)

# Process BS_PL
BS_PL = process_bs_pl(BBU_PL, RRU_PL)

# Process KPI file
df_KPI = read_and_process_kpi_files(Datadir)

# Save results
save_dataframe_to_csv(gdf_RAC, 'gdf_RAC.csv')
save_dataframe_to_csv(df_KPI, 'df_KPI.csv')
save_dataframe_to_csv(BS_PL, 'BS_PL.csv')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  RRU_PL['AAU功耗[千瓦时]'] = pd.to_numeric(RRU_PL['AAU功耗[千瓦时]'], errors='coerce').round(4).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBU_PL[col] = pd.to_numeric(BBU_PL[col], errors='coerce').round(4).fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  BBU_PL[col] = pd.to_numeric(BBU_P

DataFrame exported to gdf_RAC.csv
DataFrame exported to df_KPI.csv
DataFrame exported to BS_PL.csv
