## 这个文件主要把焕新发的数据和apmcm给出的数据合并

In [2]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import os

- 使用forest fire数据集中的latitude(纬度), longitude(经度), acq_date(日期), confidence(发生火灾的置信度), bright_t31(通道31的火焰亮度温度, 单位开尔文), type(火灾类型), Country(国家)
- 由于forest fire中的数据只按国家划分而没有给出具体城市, 所以我们要根据经纬度确定各火点是否位于apmcm给的100个城市附近, 若在则保留, 不在则舍去
- 1. 首先用K最近邻算法寻找forest fire各点距离最近的城市
- 2. 若该点与距离它最近的城市之间的经纬度欧氏距离(即 sqrt(Δ经度² + Δ纬度²))不超过3度, 则认为该点属于该城市

In [3]:
# Fire detections whose confidence is below this value are discarded.
MIN_CONFIDENCE = 80
# Largest Euclidean distance, measured in degrees of latitude/longitude,
# between a fire point and its nearest city for the point to be kept.
MAX_DISTANCE_DEG = 3
# Columns taken from every forest-fire CSV (Country is added from the file name).
FIRE_COLUMNS = ['latitude', 'longitude', 'acq_date', 'confidence', 'bright_t31', 'type', 'Country']
# Keys and averaged columns of the final per-month / per-city table.
GROUP_KEYS = ['dt', 'City', 'Country', 'type']
VALUE_COLUMNS = ['AverageTemperature', 'Latitude', 'Longitude', 'bright_t31']


def parse_coordinate(text, positive_suffix):
    """Convert a coordinate string such as '23.55S' into signed degrees.

    Args:
        text: Number followed by a single hemisphere letter, e.g. '57.05N'.
        positive_suffix: The letter that denotes the positive hemisphere
            ('N' for latitudes, 'E' for longitudes).

    Returns:
        The numeric part as a float, negated unless the trailing letter
        equals ``positive_suffix``.
    """
    magnitude = float(text[:-1])
    return magnitude if text[-1] == positive_suffix else -magnitude


# The guard is true when this cell/script is executed directly, so the
# pipeline below still runs and its variables stay at module level; it only
# keeps an import of parse_coordinate from touching the data files.
if __name__ == '__main__':
    # reading apmcm file (does not depend on the year, so it is loaded once)
    apmcm = pd.read_csv('../data/2022_APMCM_C_Data.csv', encoding='gbk')
    apmcm['dt'] = pd.DatetimeIndex(apmcm['dt'])
    # Southern latitudes and western longitudes must become negative; the
    # fire files use signed decimal degrees for the same positions.
    apmcm['Latitude'] = apmcm['Latitude'].map(lambda x: parse_coordinate(x, 'N'))
    apmcm['Longitude'] = apmcm['Longitude'].map(lambda x: parse_coordinate(x, 'E'))
    # NOTE(review): 2013-09-01 is excluded as in the original analysis; the
    # reason is not recorded here - confirm before changing.
    apmcm = apmcm[apmcm['dt'] != '2013-09-01']
    apmcm = apmcm.drop(columns=['AverageTemperatureUncertainty'])
    data = apmcm[['Latitude', 'Longitude', 'City']].drop_duplicates()

    # 1-nearest-neighbour lookup from a (lat, lon) pair to the closest city.
    # Fitted on a plain array because prediction also receives a plain array.
    model = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
    model.fit(data[['Latitude', 'Longitude']].values, data['City'])

    df_all = []
    for year in range(2000, 2013 + 1):
        print(f'processing {year}...')

        # reading forest fire files
        year_dir = f'../forest fire/{year}/'
        df_fire = []
        for file_name in sorted(os.listdir(year_dir)):
            if not file_name.lower().endswith('.csv'):
                continue  # ignore stray non-data files in the folder
            df = pd.read_csv(os.path.join(year_dir, file_name))
            # The country is whatever sits between the 11-character file
            # prefix and the '.csv' suffix (presumably 'modis_<year>_').
            df['Country'] = file_name[11: -4]
            df_fire.append(df[FIRE_COLUMNS])
        if not df_fire:
            print(f'no forest fire files found for {year}, skipping')
            continue

        df_fire = pd.concat(df_fire, ignore_index=True)
        df_fire = df_fire[df_fire['confidence'] >= MIN_CONFIDENCE].copy()
        if df_fire.empty:
            print(f'no fire points with confidence >= {MIN_CONFIDENCE} in {year}, skipping')
            continue
        df_fire = df_fire.rename(columns={
            'acq_date': 'dt',
            'latitude': 'Latitude_1',
            'longitude': 'Longitude_1',
        })
        # Snap every acquisition date to the first day of its month so it can
        # be joined with the monthly temperature records.
        df_fire['dt'] = pd.DatetimeIndex(df_fire['dt']).to_period('M').to_timestamp()

        # match position
        print('under matching...')
        df_fire['City'] = model.predict(df_fire[['Latitude_1', 'Longitude_1']].values)
        print('matching success!')

        # merge two DataFrames; the inner join keeps only months and cities
        # present in both sources
        df_merge = pd.merge(left=apmcm, right=df_fire, on=['Country', 'City', 'dt'])
        distance = (
            (df_merge['Longitude'] - df_merge['Longitude_1']) ** 2
            + (df_merge['Latitude'] - df_merge['Latitude_1']) ** 2
        ) ** 0.5
        df_merge = df_merge[distance <= MAX_DISTANCE_DEG]
        df_merge = df_merge[GROUP_KEYS + VALUE_COLUMNS]

        # average per month / city / country / fire type
        df = df_merge.groupby(GROUP_KEYS)[VALUE_COLUMNS].mean().reset_index()
        df['dt'] = df['dt'].dt.strftime('%Y-%m-%d')

        df_all.append(df)

    df_all = pd.concat(df_all)
    df_all.to_csv('../data/temperature_forestfire.csv', index=False)

processing 2000...
under matching...
matching success!
processing 2001...
under matching...
matching success!
processing 2002...
under matching...
matching success!
processing 2003...
under matching...
matching success!
processing 2004...
under matching...
matching success!
processing 2005...
under matching...
matching success!
processing 2006...
under matching...
matching success!
processing 2007...
under matching...
matching success!
processing 2008...
under matching...
matching success!
processing 2009...
under matching...
matching success!
processing 2010...
under matching...
matching success!
processing 2011...
under matching...
matching success!
processing 2012...
under matching...
matching success!
processing 2013...
under matching...
matching success!
