In [34]:
import pickle
import numpy as np
import pandas as pd
from os import path, getcwd, makedirs, environ, listdir

In [35]:
from sklearn.model_selection import train_test_split

<h2> 讀取PKL檔案 </h2>

In [41]:
def read_data_from_dataset(data_dir_path: str):
    """load train data and test data from the dataset

    Args:
        data_dir_path (str): path for the dataset we want to load

    Returns:
        tuple: tuple which contains X_train, y_train, X_test and y_test in order
    """
    data_list = []
    for fname in ['X_train', 'y_train', 'X_test', 'y_test']:
        with open(f'{data_dir_path}/{fname}.pkl', 'rb') as f:
            data = pickle.load(f)
            print(f'{fname}資料筆數: {len(data)}')
            data_list.append(data)
    return tuple(data_list)

In [7]:
def split_ratio(dataset):
    X_train, y_train, X_test, y_test = dataset
    print('X_train, y_train, X_test, y_test 各別筆數:', len(X_train), len(y_train), len(X_test), len(y_test))
    print('總共資料筆數', len(X_train) + len(X_test))
    print('切分比例:', round(len(X_train)/(len(X_train)+len(X_test)),2), round(len(X_test)/(len(X_train)+len(X_test)),2))

<h3> 有哪些資料集 </h3>

In [4]:
for source in listdir('dataset/source'):
    data_dir_path = path.join('dataset', 'source', source)
    print(data_dir_path)

dataset\source\air_quality
dataset\source\Appliance Energy Prediction
dataset\source\appliances
dataset\source\beijing_pm2.5
dataset\source\debutanizer
dataset\source\electricity
dataset\source\exchange_rate
dataset\source\metro_interstate
dataset\source\solar_energy
dataset\source\sru
dataset\source\traffic


<h3 style='color:blue'> 僅讀取有pkl檔案的資料 </h3>
if not path.exists(f'{data_dir_path}/X_train.pkl'): continue

<h3> 原本既有資料集 </h3>

<h4 style='color:blue'> air_quality </h4>

In [5]:
# air_quality
air_quality = r'dataset\source\air_quality'
air_quality = read_data_from_dataset(air_quality)
X_train_AirQuality, y_train_AirQuality, X_test_AirQuality, y_test_AirQuality = air_quality

<h5 style='color:blue'> 訓練資料 </h5>

In [6]:
# 訓練資料
X_train_AirQuality = pd.DataFrame(X_train_AirQuality) # 訓練資料的特徵
y_train_AirQuality = pd.DataFrame(y_train_AirQuality).rename(columns={0: "Answer"}) # 訓練資料的答案
pd.concat([X_train_AirQuality, y_train_AirQuality], axis=1) # 訓練資料

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,Answer
0,0.956111,0.696429,0.251980,0.516156,0.217987,0.435657,0.579630,0.635966,0.539111,0.873262,0.862141,0.992715,0.803565
1,0.953280,0.666071,0.224622,0.478459,0.180465,0.476587,0.540741,0.591261,0.430408,0.872036,0.857984,0.992556,0.794084
2,0.954224,0.715179,0.207343,0.471831,0.197141,0.464794,0.581481,0.589916,0.467866,0.866312,0.879806,0.992678,0.792567
3,0.954224,0.703571,0.201584,0.475559,0.221560,0.448144,0.596296,0.599664,0.515241,0.862633,0.900589,0.992858,0.793326
4,0.951392,0.657143,0.180706,0.429163,0.197141,0.487340,0.585185,0.568067,0.481087,0.863451,0.899203,0.992869,0.783087
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7480,0.948561,0.464732,0.000000,0.341756,0.221560,0.434964,0.577778,0.351597,0.325744,0.854456,0.829581,0.991220,0.768297
7481,0.948561,0.468750,0.000000,0.343413,0.201310,0.442595,0.548148,0.351933,0.292692,0.861406,0.811915,0.991147,0.768297
7482,0.947617,0.460268,0.000000,0.332229,0.191185,0.458897,0.535185,0.344202,0.266985,0.863042,0.808452,0.991144,0.766780
7483,0.948561,0.475893,0.000000,0.352941,0.204288,0.437045,0.551852,0.355294,0.283878,0.866312,0.799446,0.991087,0.769814


<h5 style='color:blue'> 測試資料 </h5>

In [7]:
# 測試資料
X_test_AirQuality = pd.DataFrame(X_test_AirQuality) # 測試資料的特徵
y_test_AirQuality = pd.DataFrame(y_test_AirQuality).rename(columns={0: "Answer"}) # 測試資料的答案
pd.concat([X_test_AirQuality, y_test_AirQuality], axis=1) # 測試資料

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,Answer
0,0.947145,0.461161,0.0,0.324772,0.182847,0.466181,0.518519,0.343193,0.243481,0.863859,0.805334,0.991111,0.766022
1,0.949033,0.480804,0.0,0.348799,0.223943,0.429067,0.577778,0.355966,0.280940,0.862633,0.807066,0.991110,0.769056
2,0.949033,0.487500,0.0,0.359569,0.226325,0.422130,0.594444,0.360336,0.305545,0.859771,0.813994,0.991132,0.770952
3,0.949504,0.490625,0.0,0.362055,0.245384,0.416927,0.614815,0.367395,0.327213,0.852003,0.834430,0.991214,0.771331
4,0.953280,0.539286,0.0,0.439519,0.316260,0.360388,0.666667,0.395630,0.419758,0.851594,0.826117,0.991059,0.785362
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1867,0.958471,0.675893,0.0,0.538940,0.400238,0.256330,0.722222,0.529076,0.708410,0.907195,0.794250,0.992710,0.809632
1868,0.955168,0.608482,0.0,0.508285,0.329363,0.278876,0.701852,0.492101,0.539479,0.917007,0.774853,0.992488,0.801669
1869,0.955168,0.599107,0.0,0.523198,0.293627,0.278529,0.694444,0.484370,0.474477,0.927637,0.756148,0.992136,0.805461
1870,0.953752,0.537054,0.0,0.480944,0.259083,0.312869,0.659259,0.417143,0.356225,0.933361,0.739522,0.991509,0.794463


In [8]:
split_ratio(air_quality)

X_train, y_train, X_test, y_test 各別筆數: 7485 7485 1872 1872
總共資料筆數 9357
切分比例: 0.8 0.2


<h4 style='color:blue'> appliances </h4>

In [11]:
# appliances
appliances = r'dataset\source\appliances'
appliances = read_data_from_dataset(appliances)
X_train_Appliances, y_train_Appliances, X_test_Appliances, y_test_Appliances = appliances

In [19]:
# 訓練資料
X_train_Appliances = pd.DataFrame(X_train_Appliances) # 訓練資料的特徵
y_train_Appliances = pd.DataFrame(y_train_Appliances).rename(columns={0: "Answer"}) # 訓練資料的答案
train_Appliances = pd.concat([X_train_Appliances, y_train_Appliances], axis=1) # 訓練資料
train_Appliances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,Answer
0,0.428571,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449,0.046729
1,0.428571,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083,0.046729
2,0.428571,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848,0.037383
3,0.571429,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261,0.037383
4,0.571429,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611,0.046729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15783,0.000000,0.475185,0.281901,0.195542,0.560596,0.464440,0.394142,0.377477,0.423816,0.360248,...,0.511589,0.324759,0.663953,0.947368,0.071429,0.600000,0.500000,0.924246,0.924246,0.065421
15784,0.000000,0.482225,0.292725,0.202811,0.565939,0.459455,0.381368,0.372072,0.429506,0.353878,...,0.509106,0.327974,0.665891,0.942982,0.071429,0.600000,0.502262,0.653042,0.653042,0.065421
15785,0.000000,0.485744,0.300706,0.208142,0.564252,0.448654,0.358311,0.377477,0.465500,0.360248,...,0.506347,0.331190,0.667829,0.938596,0.071429,0.600000,0.504525,0.131351,0.131351,0.037383
15786,0.000000,0.485744,0.302082,0.210807,0.557503,0.448654,0.364231,0.377477,0.471191,0.366619,...,0.496551,0.334405,0.669767,0.934211,0.071429,0.600000,0.506787,0.063452,0.063452,0.056075


In [20]:
# 測試資料
X_test_Appliances = pd.DataFrame(X_test_Appliances) # 測試資料的特徵
y_test_Appliances = pd.DataFrame(y_test_Appliances).rename(columns={0: "Answer"}) # 測試資料的答案
test_Appliances = pd.concat([X_test_Appliances, y_test_Appliances], axis=1) # 測試資料
test_Appliances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,Answer
0,0.000000,0.485744,0.309238,0.220499,0.559565,0.448654,0.361427,0.377477,0.472471,0.372671,...,0.479857,0.342980,0.672868,0.903509,0.071429,0.600000,0.503771,0.595427,0.595427,0.336449
1,0.000000,0.485744,0.305568,0.225345,0.555816,0.448654,0.363530,0.383483,0.483568,0.369804,...,0.471854,0.347267,0.674419,0.888158,0.071429,0.600000,0.502262,0.543474,0.543474,0.542056
2,0.000000,0.485744,0.304559,0.229949,0.550098,0.451424,0.365633,0.386486,0.497937,0.369804,...,0.463438,0.351554,0.675969,0.872807,0.071429,0.600000,0.500754,0.606712,0.606712,0.289720
3,0.142857,0.485744,0.302908,0.249334,0.542788,0.456962,0.365633,0.393093,0.503912,0.369804,...,0.460403,0.355841,0.677519,0.857456,0.071429,0.600000,0.499246,0.655702,0.655702,0.280374
4,0.000000,0.478705,0.299330,0.263872,0.529572,0.456962,0.365633,0.396396,0.505335,0.369804,...,0.464956,0.360129,0.679070,0.842105,0.071429,0.600000,0.497738,0.351205,0.351205,0.233645
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3942,0.000000,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,...,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981,0.084112
3943,0.000000,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,...,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726,0.074766
3944,0.142857,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,...,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979,0.242991
3945,0.142857,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,...,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371,0.383178


In [10]:
split_ratio(appliances)

X_train, y_train, X_test, y_test 各別筆數: 15788 15788 3947 3947
總共資料筆數 19735
切分比例: 0.8 0.2


<h4 style='color:blue'> beijing_pm2.5 </h4>

In [102]:
# beijing_pm2.5
beijing = r'dataset\source\beijing_pm2.5'
beijing = read_data_from_dataset(beijing)
X_train_Beijing, y_train_Beijing, X_test_Beijing, y_test_Beijing = beijing

In [103]:
# 訓練資料
X_train_Beijing = pd.DataFrame(X_train_Beijing) # 訓練資料的特徵
y_train_Beijing = pd.DataFrame(y_train_Beijing).rename(columns={0: "Answer"}) # 訓練資料的答案
train_Beijing = pd.concat([X_train_Beijing, y_train_Beijing], axis=1) # 訓練資料
train_Beijing

Unnamed: 0,0,1,2,3,4,5,Answer
0,0.352941,0.245902,0.527273,0.002290,0.000000,0.0,0.129779
1,0.367647,0.245902,0.527273,0.003811,0.000000,0.0,0.148893
2,0.426471,0.229508,0.545455,0.005332,0.000000,0.0,0.159960
3,0.485294,0.229508,0.563636,0.008391,0.037037,0.0,0.182093
4,0.485294,0.229508,0.563636,0.009912,0.074074,0.0,0.138833
...,...,...,...,...,...,...,...
35035,0.308824,0.426230,0.400000,0.195540,0.000000,0.0,0.022133
35036,0.279412,0.426230,0.418182,0.203948,0.000000,0.0,0.018109
35037,0.279412,0.426230,0.418182,0.213877,0.000000,0.0,0.023139
35038,0.279412,0.409836,0.418182,0.222285,0.000000,0.0,0.020121


In [104]:
# 測試資料
X_test_Beijing = pd.DataFrame(X_test_Beijing) # 測試資料的特徵
y_test_Beijing = pd.DataFrame(y_test_Beijing).rename(columns={0: "Answer"}) # 測試資料的答案
test_Beijing = pd.concat([X_test_Beijing, y_test_Beijing], axis=1) # 測試資料
test_Beijing

Unnamed: 0,0,1,2,3,4,5,Answer
0,0.294118,0.426230,0.418182,0.244433,0.0,0.0,0.024145
1,0.294118,0.426230,0.400000,0.251303,0.0,0.0,0.053320
2,0.294118,0.409836,0.400000,0.258173,0.0,0.0,0.065392
3,0.294118,0.409836,0.400000,0.261232,0.0,0.0,0.070423
4,0.323529,0.360656,0.381818,0.000752,0.0,0.0,0.079477
...,...,...,...,...,...,...,...
8755,0.250000,0.278689,0.781818,0.395659,0.0,0.0,0.008048
8756,0.264706,0.262295,0.781818,0.405588,0.0,0.0,0.010060
8757,0.264706,0.262295,0.781818,0.413996,0.0,0.0,0.010060
8758,0.264706,0.245902,0.781818,0.420866,0.0,0.0,0.008048


In [105]:
split_ratio(beijing)

X_train, y_train, X_test, y_test 各別筆數: 35040 35040 8760 8760
總共資料筆數 43800
切分比例: 0.8 0.2


<h4 style='color:blue'> electricity </h4>

In [142]:
# beijing_pm2.5
electricity = r'dataset\source\electricity'
electricity = read_data_from_dataset(electricity)
X_train_Electricity, y_train_Electricity, X_test_Electricity, y_test_Electricity = electricity

In [143]:
# 訓練資料
X_train_Electricity = pd.DataFrame(X_train_Electricity) # 訓練資料的特徵
y_train_Electricity = pd.DataFrame(y_train_Electricity).rename(columns={0: "Answer"}) # 訓練資料的答案
train_Electricity = pd.concat([X_train_Electricity, y_train_Electricity], axis=1) # 訓練資料
train_Electricity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Answer
0,0.174265,0.371845,0.125906,0.266300,0.120671,0.303012,0.165162,0.198269,0.102060,0.080025,0.104843
1,0.183098,0.351736,0.166558,0.236991,0.105973,0.290789,0.162502,0.200275,0.136878,0.106085,0.124607
2,0.180630,0.354899,0.165400,0.252239,0.102229,0.285650,0.149972,0.193273,0.135655,0.104418,0.126440
3,0.178782,0.338995,0.160214,0.254828,0.101976,0.288926,0.147767,0.179819,0.136827,0.104199,0.123691
4,0.183910,0.324170,0.158528,0.251004,0.102608,0.285978,0.144057,0.189300,0.137235,0.103848,0.119503
...,...,...,...,...,...,...,...,...,...,...,...
21038,0.389372,0.639169,0.645363,0.661189,0.402287,0.551481,0.426851,0.540401,0.524572,0.326679,0.315052
21039,0.374589,0.652215,0.691678,0.661140,0.403299,0.541122,0.403787,0.527970,0.542669,0.321414,0.321859
21040,0.382599,0.650119,0.679672,0.664989,0.400213,0.556620,0.315659,0.564241,0.536093,0.305050,0.325000
21041,0.394092,0.648683,0.675644,0.658623,0.398366,0.571771,0.303374,0.554799,0.523807,0.297416,0.337435


In [144]:
# 測試資料
X_test_Electricity = pd.DataFrame(X_test_Electricity) # 測試資料的特徵
y_test_Electricity = pd.DataFrame(y_test_Electricity).rename(columns={0: "Answer"}) # 測試資料的答案
test_Electricity = pd.concat([X_test_Electricity, y_test_Electricity], axis=1) # 測試資料
test_Electricity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Answer
0,0.382604,0.601733,0.627039,0.640738,0.394445,0.563151,0.359513,0.552321,0.521462,0.291405,0.312565
1,0.370284,0.628219,0.554370,0.629314,0.419489,0.551788,0.409597,0.515460,0.516823,0.300399,0.285602
2,0.372131,0.629248,0.520061,0.626797,0.382302,0.561268,0.415477,0.484579,0.534258,0.342868,0.313613
3,0.358378,0.585828,0.314136,0.616584,0.334160,0.526503,0.352898,0.381550,0.389784,0.261001,0.264005
4,0.340722,0.494722,0.220877,0.601288,0.308381,0.433266,0.280519,0.205271,0.254231,0.179485,0.188351
...,...,...,...,...,...,...,...,...,...,...,...
5256,0.315890,0.626097,0.262082,0.508301,0.268815,0.422783,0.347088,0.398466,0.280485,0.212258,0.217932
5257,0.216960,0.626440,0.225433,0.333801,0.199449,0.278402,0.238415,0.189811,0.200907,0.121792,0.207853
5258,0.208748,0.627152,0.148762,0.300668,0.186496,0.253465,0.231065,0.169866,0.175061,0.116659,0.201963
5259,0.193561,0.543464,0.138492,0.282807,0.180450,0.246095,0.229105,0.187844,0.170779,0.112052,0.185733


In [145]:
split_ratio(electricity)

X_train, y_train, X_test, y_test 各別筆數: 21043 21043 5261 5261
總共資料筆數 26304
切分比例: 0.8 0.2


<h4 style='color:blue'> solar_energy </h4>

In [86]:
# solar_energy
solar_energy = r'dataset\source\solar_energy'
solar_energy = read_data_from_dataset(solar_energy)
X_train_SolarEnergy, y_train_SolarEnergy, X_test_SolarEnergy, y_test_SolarEnergy = solar_energy

In [87]:
# 訓練資料
X_train_SolarEnergy = pd.DataFrame(X_train_SolarEnergy) # 訓練資料的特徵
y_train_SolarEnergy = pd.DataFrame(y_train_SolarEnergy).rename(columns={0: "Answer"}) # 訓練資料的答案
train_SolarEnergy = pd.concat([X_train_SolarEnergy, y_train_SolarEnergy], axis=1) # 訓練資料
train_SolarEnergy

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Answer
0,0.128234,0.062616,0.259825,0.000704,0.008904,0.004248,0.000840,0.054393,0.000000,0.085567,0.099631
1,0.177728,0.174830,0.336245,0.000000,0.115753,0.003398,0.109152,0.078661,0.000000,0.077320,0.284133
2,0.241845,0.210787,0.395924,0.003521,0.173973,0.003398,0.264484,0.109623,0.000000,0.108247,0.307503
3,0.302025,0.161810,0.449054,0.016197,0.156164,0.002549,0.163728,0.117992,0.001603,0.085567,0.134686
4,0.295838,0.331060,0.462154,0.101408,0.154795,0.016992,0.179681,0.112134,0.002404,0.007216,0.127306
...,...,...,...,...,...,...,...,...,...,...,...
42007,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
42008,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
42009,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
42010,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [88]:
# 測試資料
X_test_SolarEnergy = pd.DataFrame(X_test_SolarEnergy) # 測試資料的特徵
y_test_SolarEnergy = pd.DataFrame(y_test_SolarEnergy).rename(columns={0: "Answer"}) # 測試資料的答案
test_SolarEnergy = pd.concat([X_test_SolarEnergy, y_test_SolarEnergy], axis=1) # 測試資料
test_SolarEnergy

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,Answer
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
10498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10501,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
split_ratio(solar_energy)

X_train, y_train, X_test, y_test 各別筆數: 42012 42012 10503 10503
總共資料筆數 52515
切分比例: 0.8 0.2


<h4 style='color:blue'> debutanizer </h4>

In [45]:
# debutanizer 
debutanizer = r'dataset\target\debutanizer' # source
debutanizer = read_data_from_dataset(debutanizer)
X_train_Debutanizer, y_train_Debutanizer, X_test_Debutanizer, y_test_Debutanizer = debutanizer

X_train資料筆數: 478
y_train資料筆數: 478
X_test資料筆數: 1916
y_test資料筆數: 1916


In [46]:
# 訓練資料
X_train_Debutanizer = pd.DataFrame(X_train_Debutanizer) # 訓練資料的特徵
y_train_Debutanizer = pd.DataFrame(y_train_Debutanizer).rename(columns={0: "Answer"}) # 訓練資料的答案
train_Debutanizer = pd.concat([X_train_Debutanizer, y_train_Debutanizer], axis=1) # 訓練資料
train_Debutanizer

Unnamed: 0,0,1,2,3,4,5,6,Answer
0,0.268900,0.650894,0.832742,0.583420,0.784759,0.843079,0.822079,0.180295
1,0.268483,0.650140,0.852153,0.577510,0.776487,0.838605,0.822079,0.177124
2,0.267967,0.659657,0.823618,0.571600,0.764546,0.807879,0.786246,0.173618
3,0.267451,0.668338,0.808371,0.565689,0.752605,0.799606,0.786246,0.171640
4,0.266935,0.647191,0.761948,0.559779,0.745326,0.773122,0.746142,0.166972
...,...,...,...,...,...,...,...,...
473,0.196429,0.611174,0.503364,0.437757,0.723388,0.840954,0.835447,0.250596
474,0.198702,0.636696,0.455499,0.439339,0.714695,0.803623,0.817006,0.249726
475,0.200975,0.634573,0.455835,0.440922,0.704844,0.818969,0.795798,0.246283
476,0.203248,0.625481,0.441601,0.442504,0.730797,0.817057,0.802483,0.246230


In [47]:
# 測試資料
X_test_Debutanizer = pd.DataFrame(X_test_Debutanizer) # 測試資料的特徵
y_test_Debutanizer = pd.DataFrame(y_test_Debutanizer).rename(columns={0: "Answer"}) # 測試資料的答案
test_Debutanizer = pd.concat([X_test_Debutanizer, y_test_Debutanizer], axis=1) # 測試資料
test_Debutanizer

Unnamed: 0,0,1,2,3,4,5,6,Answer
0,0.207795,0.637875,0.435865,0.445669,0.717564,0.809187,0.782943,0.245026
1,0.210069,0.640077,0.399371,0.447252,0.706889,0.793467,0.785694,0.243979
2,0.212342,0.627126,0.320429,0.448834,0.696216,0.785824,0.771342,0.242752
3,0.219852,0.647911,0.312270,0.450417,0.725619,0.808111,0.814571,0.242737
4,0.227559,0.651272,0.326562,0.451999,0.732658,0.851886,0.827040,0.243951
...,...,...,...,...,...,...,...,...
1911,0.284859,0.645894,0.688723,0.355109,0.615583,0.608848,0.500866,0.188976
1912,0.266139,0.665503,0.665640,0.354275,0.588834,0.582074,0.508999,0.178992
1913,0.247419,0.665706,0.664740,0.353441,0.561301,0.602186,0.523348,0.169742
1914,0.228699,0.672975,0.677300,0.352607,0.530730,0.636970,0.537697,0.159023


In [48]:
split_ratio(debutanizer)

X_train, y_train, X_test, y_test 各別筆數: 478 478 1916 1916
總共資料筆數 2394
切分比例: 0.2 0.8


In [49]:
# 將訓練集分割為訓練和驗證
X_train, X_valid, y_train, y_valid = \
    train_test_split(X_train_Debutanizer, y_train_Debutanizer, test_size=0.2, shuffle=False) 

In [50]:
X_train # Training

Unnamed: 0,0,1,2,3,4,5,6
0,0.268900,0.650894,0.832742,0.583420,0.784759,0.843079,0.822079
1,0.268483,0.650140,0.852153,0.577510,0.776487,0.838605,0.822079
2,0.267967,0.659657,0.823618,0.571600,0.764546,0.807879,0.786246
3,0.267451,0.668338,0.808371,0.565689,0.752605,0.799606,0.786246
4,0.266935,0.647191,0.761948,0.559779,0.745326,0.773122,0.746142
...,...,...,...,...,...,...,...
377,0.219634,0.646460,0.674337,0.445304,0.835481,0.953033,0.926452
378,0.220578,0.643331,0.732184,0.436429,0.841387,0.934879,0.945021
379,0.221523,0.638960,0.743512,0.427553,0.843713,0.899581,0.894296
380,0.222468,0.632402,0.744922,0.418677,0.839998,0.876294,0.859601


In [51]:
X_valid # Validation

Unnamed: 0,0,1,2,3,4,5,6
382,0.224357,0.638603,0.756649,0.400926,0.832569,0.875552,0.881615
383,0.226040,0.635329,0.751415,0.392050,0.828853,0.908250,0.905871
384,0.227903,0.628817,0.743882,0.383174,0.825139,0.917577,0.890895
385,0.229767,0.628109,0.708948,0.374299,0.821562,0.880028,0.882618
386,0.231631,0.633429,0.707474,0.365423,0.818218,0.897615,0.880670
...,...,...,...,...,...,...,...
473,0.196429,0.611174,0.503364,0.437757,0.723388,0.840954,0.835447
474,0.198702,0.636696,0.455499,0.439339,0.714695,0.803623,0.817006
475,0.200975,0.634573,0.455835,0.440922,0.704844,0.818969,0.795798
476,0.203248,0.625481,0.441601,0.442504,0.730797,0.817057,0.802483


<h4 style='color:blue'> sru </h4>

In [52]:
# debutanizer 
sru = r'dataset\target\sru' # source
sru = read_data_from_dataset(sru)
X_train_SRU, y_train_SRU, X_test_SRU, y_test_SRU = sru

X_train資料筆數: 2016
y_train資料筆數: 2016
X_test資料筆數: 8065
y_test資料筆數: 8065


In [53]:
# 訓練資料
X_train_SRU = pd.DataFrame(X_train_SRU) # 訓練資料的特徵
y_train_SRU = pd.DataFrame(y_train_SRU).rename(columns={0: "Answer"}) # 訓練資料的答案
train_SRU = pd.concat([X_train_SRU, y_train_SRU], axis=1) # 訓練資料
train_SRU

Unnamed: 0,0,1,2,3,4,Answer
0,0.663113,0.774276,0.470970,0.096044,0.019865,0.083362
1,0.659960,0.766021,0.473398,0.111023,0.027957,0.081265
2,0.654670,0.767519,0.474198,0.105410,0.028907,0.079168
3,0.653325,0.769922,0.475963,0.066221,0.029411,0.077070
4,0.654726,0.760449,0.478833,0.033746,0.031592,0.074973
...,...,...,...,...,...,...
2011,0.662885,0.422820,0.464484,0.725476,0.799390,0.089478
2012,0.664785,0.424840,0.476833,0.753070,0.793093,0.091333
2013,0.663860,0.429281,0.486019,0.737167,0.790185,0.095357
2014,0.662936,0.418727,0.485691,0.744798,0.797760,0.099382


In [54]:
# 測試資料
X_test_SRU = pd.DataFrame(X_test_SRU) # 測試資料的特徵
y_test_SRU = pd.DataFrame(y_test_SRU).rename(columns={0: "Answer"}) # 測試資料的答案
test_SRU = pd.concat([X_test_SRU, y_test_SRU], axis=1) # 測試資料
test_SRU

Unnamed: 0,0,1,2,3,4,Answer
0,0.656732,0.424892,0.490225,0.721178,0.806157,0.107432
1,0.649930,0.421113,0.485568,0.734921,0.794993,0.087466
2,0.649503,0.429524,0.475420,0.788375,0.872343,0.061106
3,0.656407,0.411291,0.464897,0.895316,0.941413,0.067412
4,0.662235,0.412981,0.468915,0.941795,0.928526,0.073717
...,...,...,...,...,...,...
8060,0.553956,0.605635,0.598424,0.059032,0.023336,0.094734
8061,0.552458,0.596963,0.600553,0.058193,0.027241,0.077845
8062,0.553635,0.606105,0.599118,0.075588,0.024404,0.078403
8063,0.554811,0.607081,0.594603,0.066097,0.025846,0.078961


In [55]:
split_ratio(sru)

X_train, y_train, X_test, y_test 各別筆數: 2016 2016 8065 8065
總共資料筆數 10081
切分比例: 0.2 0.8


In [59]:
X_train, X_valid, y_train, y_valid = \
    train_test_split(X_train_SRU, y_train_SRU, test_size=0.2, shuffle=False) # 將訓練集分割為訓練和驗證

In [60]:
X_train # Training

Unnamed: 0,0,1,2,3,4
0,0.663113,0.774276,0.470970,0.096044,0.019865
1,0.659960,0.766021,0.473398,0.111023,0.027957
2,0.654670,0.767519,0.474198,0.105410,0.028907
3,0.653325,0.769922,0.475963,0.066221,0.029411
4,0.654726,0.760449,0.478833,0.033746,0.031592
...,...,...,...,...,...
1607,0.418954,0.179106,0.769023,0.067990,0.408690
1608,0.437166,0.173934,0.772076,0.063965,0.414541
1609,0.452823,0.195145,0.782678,0.049995,0.410906
1610,0.447477,0.190234,0.793597,0.066550,0.416406


In [61]:
X_valid # Validation

Unnamed: 0,0,1,2,3,4
1612,0.406424,0.185497,0.803808,0.082471,0.416787
1613,0.388447,0.169511,0.811876,0.082697,0.418282
1614,0.378685,0.151296,0.792709,0.094762,0.422117
1615,0.370823,0.157704,0.755650,0.107950,0.424234
1616,0.360316,0.169302,0.738444,0.112883,0.421331
...,...,...,...,...,...
2011,0.662885,0.422820,0.464484,0.725476,0.799390
2012,0.664785,0.424840,0.476833,0.753070,0.793093
2013,0.663860,0.429281,0.486019,0.737167,0.790185
2014,0.662936,0.418727,0.485691,0.744798,0.797760


<h2 style='color:purple'> Experiment & Note </h2>

<h3> 探討Appliance Energy Prediction能源資料集用的是什麼Scaler方法。 </h3>

In [43]:
from sklearn.preprocessing import MinMaxScaler

In [23]:
df_appliances = pd.concat([train_Appliances, test_Appliances], ignore_index=True)
df_appliances

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,Answer
0,0.428571,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449,0.046729
1,0.428571,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083,0.046729
2,0.428571,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848,0.037383
3,0.571429,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261,0.037383
4,0.571429,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611,0.046729
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.000000,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,...,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981,0.084112
19731,0.000000,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,...,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726,0.074766
19732,0.142857,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,...,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979,0.242991
19733,0.142857,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,...,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371,0.383178


In [32]:
ApplianceEnergyPrediction = pd.read_csv(r'.\dataset\source\Appliance Energy Prediction\KAG_energydata_complete.csv')
# ApplianceEnergyPrediction

In [33]:
# remove time column 
ApplianceEnergyPrediction = ApplianceEnergyPrediction.iloc[:,1:]
# ApplianceEnergyPrediction

In [38]:
# scaling
ApplianceEnergyPrediction[ApplianceEnergyPrediction.columns.tolist()]=MinMaxScaler().fit_transform(ApplianceEnergyPrediction[ApplianceEnergyPrediction.columns.tolist()])
# ApplianceEnergyPrediction

In [39]:
train_ApplianceEnergyPrediction = ApplianceEnergyPrediction.drop(['Appliances'], axis=1)
train_ApplianceEnergyPrediction

Unnamed: 0,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.428571,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,0.175506,...,0.223032,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449
1,0.428571,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,0.175506,...,0.226500,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083
2,0.428571,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,0.175506,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.571429,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,0.175506,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.571429,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,0.178691,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.000000,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,0.752031,...,0.864724,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981
19731,0.000000,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,0.754897,...,0.864724,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726
19732,0.142857,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,0.754897,...,0.864724,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979
19733,0.142857,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,0.752031,...,0.864724,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371


<h3> 讀取原始資料集 </h3>

<h3> DataSet: Air Quality </h3>

In [178]:
df = pd.read_csv(r'.\preprocess\source\air_quality\AirQualityUCI.csv',sep=';')
df

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578,,
1,10/03/2004,19.00.00,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255,,
2,10/03/2004,20.00.00,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502,,
3,10/03/2004,21.00.00,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867,,
4,10/03/2004,22.00.00,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9466,,,,,,,,,,,,,,,,,
9467,,,,,,,,,,,,,,,,,
9468,,,,,,,,,,,,,,,,,
9469,,,,,,,,,,,,,,,,,


In [179]:
df = df.iloc[:-114,2:-2] # remove NaN and time column 
df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,26,1360.0,150.0,119,1046.0,166.0,1056.0,113.0,1692.0,1268.0,136,489,07578
1,2,1292.0,112.0,94,955.0,103.0,1174.0,92.0,1559.0,972.0,133,477,07255
2,22,1402.0,88.0,90,939.0,131.0,1140.0,114.0,1555.0,1074.0,119,540,07502
3,22,1376.0,80.0,92,948.0,172.0,1092.0,122.0,1584.0,1203.0,110,600,07867
4,16,1272.0,51.0,65,836.0,131.0,1205.0,116.0,1490.0,1110.0,112,596,07888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,31,1314.0,-200.0,135,1101.0,472.0,539.0,190.0,1374.0,1729.0,219,293,07568
9353,24,1163.0,-200.0,114,1027.0,353.0,604.0,179.0,1264.0,1269.0,243,237,07119
9354,24,1142.0,-200.0,124,1063.0,293.0,603.0,175.0,1241.0,1092.0,269,183,06406
9355,21,1003.0,-200.0,95,961.0,235.0,702.0,156.0,1041.0,770.0,283,135,05139


In [97]:
# convert ',' to '.' in float value
for n_column in df.columns:
    feature = df[n_column].tolist()
    if type(feature[0]) == float or ',' not in feature[0]:
        continue
    df[n_column] = list(map(lambda x:float(str(x).replace(',','.')), feature))
df

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH
0,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9352,3.1,1314.0,-200.0,13.5,1101.0,472.0,539.0,190.0,1374.0,1729.0,21.9,29.3,0.7568
9353,2.4,1163.0,-200.0,11.4,1027.0,353.0,604.0,179.0,1264.0,1269.0,24.3,23.7,0.7119
9354,2.4,1142.0,-200.0,12.4,1063.0,293.0,603.0,175.0,1241.0,1092.0,26.9,18.3,0.6406
9355,2.1,1003.0,-200.0,9.5,961.0,235.0,702.0,156.0,1041.0,770.0,28.3,13.5,0.5139


In [99]:
X = df.drop(['C6H6(GT)'], axis=1).values
X.shape

(9357, 12)

<h3> DataSet: appliances </h3>

In [139]:
df = pd.read_csv(r'.\dataset\source\Appliance Energy Prediction\KAG_energydata_complete.csv')
df

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,2016-05-27 17:20:00,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,2016-05-27 17:30:00,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,2016-05-27 17:40:00,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,2016-05-27 17:50:00,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [140]:
# remove time column 
df = df.iloc[:,1:]
df

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,60,30,19.890000,47.596667,19.200000,44.790000,19.790000,44.730000,19.000000,45.566667,...,17.033333,45.5300,6.600000,733.5,92.000000,7.000000,63.000000,5.300000,13.275433,13.275433
1,60,30,19.890000,46.693333,19.200000,44.722500,19.790000,44.790000,19.000000,45.992500,...,17.066667,45.5600,6.483333,733.6,92.000000,6.666667,59.166667,5.200000,18.606195,18.606195
2,50,30,19.890000,46.300000,19.200000,44.626667,19.790000,44.933333,18.926667,45.890000,...,17.000000,45.5000,6.366667,733.7,92.000000,6.333333,55.333333,5.100000,28.642668,28.642668
3,50,40,19.890000,46.066667,19.200000,44.590000,19.790000,45.000000,18.890000,45.723333,...,17.000000,45.4000,6.250000,733.8,92.000000,6.000000,51.500000,5.000000,45.410389,45.410389
4,60,40,19.890000,46.333333,19.200000,44.530000,19.790000,45.000000,18.890000,45.530000,...,17.000000,45.4000,6.133333,733.9,92.000000,5.666667,47.666667,4.900000,10.084097,10.084097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,100,0,25.566667,46.560000,25.890000,42.025714,27.200000,41.163333,24.700000,45.590000,...,23.200000,46.7900,22.733333,755.2,55.666667,3.333333,23.666667,13.333333,43.096812,43.096812
19731,90,0,25.500000,46.500000,25.754000,42.080000,27.133333,41.223333,24.700000,45.590000,...,23.200000,46.7900,22.600000,755.2,56.000000,3.500000,24.500000,13.300000,49.282940,49.282940
19732,270,10,25.500000,46.596667,25.628571,42.768571,27.050000,41.690000,24.700000,45.730000,...,23.200000,46.7900,22.466667,755.2,56.333333,3.666667,25.333333,13.266667,29.199117,29.199117
19733,420,10,25.500000,46.990000,25.414000,43.036000,26.890000,41.290000,24.700000,45.790000,...,23.200000,46.8175,22.333333,755.2,56.666667,3.833333,26.166667,13.233333,6.322784,6.322784


In [141]:
# scaling
df[df.columns.tolist()]=MinMaxScaler().fit_transform(df[df.columns.tolist()])
df

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,0.046729,0.428571,0.327350,0.566187,0.225345,0.684038,0.215188,0.746066,0.351351,0.764262,...,0.223032,0.677290,0.372990,0.097674,0.894737,0.500000,0.953846,0.538462,0.265449,0.265449
1,0.046729,0.428571,0.327350,0.541326,0.225345,0.682140,0.215188,0.748871,0.351351,0.782437,...,0.226500,0.678532,0.369239,0.100000,0.894737,0.476190,0.894872,0.533937,0.372083,0.372083
2,0.037383,0.428571,0.327350,0.530502,0.225345,0.679445,0.215188,0.755569,0.344745,0.778062,...,0.219563,0.676049,0.365488,0.102326,0.894737,0.452381,0.835897,0.529412,0.572848,0.572848
3,0.037383,0.571429,0.327350,0.524080,0.225345,0.678414,0.215188,0.758685,0.341441,0.770949,...,0.219563,0.671909,0.361736,0.104651,0.894737,0.428571,0.776923,0.524887,0.908261,0.908261
4,0.046729,0.571429,0.327350,0.531419,0.225345,0.676727,0.215188,0.758685,0.341441,0.762697,...,0.219563,0.671909,0.357985,0.106977,0.894737,0.404762,0.717949,0.520362,0.201611,0.201611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19730,0.084112,0.000000,0.926786,0.537657,0.711655,0.606309,0.830841,0.579374,0.864865,0.765258,...,0.864724,0.729443,0.891747,0.602326,0.416667,0.238095,0.348718,0.901961,0.861981,0.861981
19731,0.074766,0.000000,0.919747,0.536006,0.701769,0.607836,0.825302,0.582178,0.864865,0.765258,...,0.864724,0.729443,0.887460,0.602326,0.421053,0.250000,0.361538,0.900452,0.985726,0.985726
19732,0.242991,0.142857,0.919747,0.538666,0.692651,0.627198,0.818378,0.603988,0.864865,0.771233,...,0.864724,0.729443,0.883173,0.602326,0.425439,0.261905,0.374359,0.898944,0.583979,0.583979
19733,0.383178,0.142857,0.919747,0.549491,0.677054,0.634717,0.805085,0.585294,0.864865,0.773794,...,0.864724,0.730581,0.878885,0.602326,0.429825,0.273810,0.387179,0.897436,0.126371,0.126371


<h3> DataSet: beijing_pm2.5 </h3>

In [134]:
df = pd.read_csv(r'.\preprocess\source\beijing_pm2.5\PRSA_data_2010.1.1-2014.12.31.csv')
df

Unnamed: 0,No,year,month,day,hour,pm2.5,DEWP,TEMP,PRES,cbwd,Iws,Is,Ir
0,1,2010,1,1,0,,-21,-11.0,1021.0,NW,1.79,0,0
1,2,2010,1,1,1,,-21,-12.0,1020.0,NW,4.92,0,0
2,3,2010,1,1,2,,-21,-11.0,1019.0,NW,6.71,0,0
3,4,2010,1,1,3,,-21,-14.0,1019.0,NW,9.84,0,0
4,5,2010,1,1,4,,-20,-12.0,1018.0,NW,12.97,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
43819,43820,2014,12,31,19,8.0,-23,-2.0,1034.0,NW,231.97,0,0
43820,43821,2014,12,31,20,10.0,-22,-3.0,1034.0,NW,237.78,0,0
43821,43822,2014,12,31,21,10.0,-22,-3.0,1034.0,NW,242.70,0,0
43822,43823,2014,12,31,22,8.0,-22,-4.0,1034.0,NW,246.72,0,0


In [135]:
df.isnull().sum() #info()

No          0
year        0
month       0
day         0
hour        0
pm2.5    2067
DEWP        0
TEMP        0
PRES        0
cbwd        0
Iws         0
Is          0
Ir          0
dtype: int64

In [136]:
# # filling or removing for NaN
df.interpolate(inplace=True) # 通过插值算法估算出缺失的数据，但只能填补在数据中间的缺失值。
df.dropna(axis=0, inplace=True)
df.drop(['No','year','month','day','hour','cbwd'], axis=1, inplace=True)
df

  df.interpolate(inplace=True) # 通过插值算法估算出缺失的数据，但只能填补在数据中间的缺失值。


Unnamed: 0,pm2.5,DEWP,TEMP,PRES,Iws,Is,Ir
24,129.0,-16,-4.0,1020.0,1.79,0,0
25,148.0,-15,-4.0,1020.0,2.68,0,0
26,159.0,-11,-5.0,1021.0,3.57,0,0
27,181.0,-7,-5.0,1022.0,5.36,1,0
28,138.0,-7,-5.0,1022.0,6.25,2,0
...,...,...,...,...,...,...,...
43819,8.0,-23,-2.0,1034.0,231.97,0,0
43820,10.0,-22,-3.0,1034.0,237.78,0,0
43821,10.0,-22,-3.0,1034.0,242.70,0,0
43822,8.0,-22,-4.0,1034.0,246.72,0,0


In [137]:
df.isnull().sum()

pm2.5    0
DEWP     0
TEMP     0
PRES     0
Iws      0
Is       0
Ir       0
dtype: int64

In [113]:
# scaling
df[df.columns.tolist()]=MinMaxScaler().fit_transform(df[df.columns.tolist()])

<h3> DataSet: electricity </h3>

In [212]:
df = pd.read_table(r'.\preprocess\source\electricity\Delete\electricity.txt',sep=',',header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,311,312,313,314,315,316,317,318,319,320
0,14.0,69.0,234.0,415.0,215.0,1056.0,29.0,840.0,226.0,265.0,...,676.0,372.0,80100.0,4719.0,5002.0,48.0,38.0,1558.0,182.0,2162.0
1,18.0,92.0,312.0,556.0,292.0,1363.0,29.0,1102.0,271.0,340.0,...,805.0,452.0,95200.0,4643.0,6617.0,65.0,47.0,2177.0,253.0,2835.0
2,21.0,96.0,312.0,560.0,272.0,1240.0,29.0,1025.0,270.0,300.0,...,817.0,430.0,96600.0,4285.0,6571.0,64.0,43.0,2193.0,218.0,2764.0
3,20.0,92.0,312.0,443.0,213.0,845.0,24.0,833.0,179.0,211.0,...,801.0,291.0,94500.0,4222.0,6365.0,65.0,39.0,1315.0,195.0,2735.0
4,22.0,91.0,312.0,346.0,190.0,647.0,16.0,733.0,186.0,179.0,...,807.0,279.0,91300.0,4116.0,6298.0,75.0,40.0,1378.0,191.0,2721.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26299,11.0,116.0,8.0,844.0,384.0,1590.0,51.0,1412.0,407.0,351.0,...,1897.0,1589.0,166500.0,9917.0,10412.0,324.0,21.0,1870.0,162.0,2773.0
26300,11.0,103.0,8.0,749.0,371.0,1366.0,47.0,1265.0,369.0,327.0,...,1374.0,1336.0,158800.0,6812.0,8956.0,302.0,20.0,1506.0,438.0,2755.0
26301,12.0,93.0,8.0,650.0,346.0,1282.0,48.0,1079.0,308.0,283.0,...,938.0,1311.0,154300.0,6602.0,5910.0,302.0,18.0,1864.0,621.0,2650.0
26302,10.0,92.0,8.0,646.0,349.0,1261.0,48.0,1009.0,288.0,292.0,...,833.0,1227.0,141900.0,6546.0,5502.0,259.0,33.0,2623.0,783.0,2719.0


In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26304 entries, 0 to 26303
Columns: 321 entries, 0 to 320
dtypes: float64(321)
memory usage: 64.4 MB


In [163]:
df.isna().sum().sum()

0

In [206]:
df.std(axis=0) # 計算 DataFrame df 中每個欄位的標準差。
# list(df.std(axis=0)) # 將標準差轉換為list列表。
print('最大值索引:', df.std(axis=0).idxmax())
print('最小值索引:', df.std(axis=0).idxmin())

最大值索引: 313
最小值索引: 121


In [208]:
# list(df.std(axis=0)).index(min(list(df.std(axis=0))))

In [210]:
np.argsort(list(df.std(axis=0)))[::-1] # 從大到小 排序後索引位置的 NumPy 陣列。

array([313, 155, 236, 315, 186, 290, 167, 314, 179, 308, 302, 296,  89,
       199, 175, 153, 233,  93, 276, 128, 292, 309, 150, 301,  91, 281,
        88,  92,  55, 306,  94, 135, 177, 133, 131,  34, 182, 181, 268,
       311, 166,  40, 172, 193, 187, 163, 159, 119, 262, 254, 226, 189,
       178, 271, 246, 304, 312, 169, 250, 168, 204, 280, 205, 162, 158,
       264, 293, 103, 230,  77,  95, 154, 132, 173, 320, 164, 318, 151,
       266, 297, 112, 294, 100, 255, 270, 300, 295,  96, 211, 184, 198,
       148,  87, 307, 174, 171, 183, 149, 289, 237, 303, 278, 305,  75,
       196, 185, 126, 157, 277, 160,  81,  15, 291, 176, 310, 279, 152,
       225,  59, 231, 251, 267, 212, 223, 210,  21, 109, 284, 124, 288,
       220, 106, 134, 319, 217, 161, 165,  58,  80, 116, 221,   5, 248,
       113, 316,  83, 240, 138, 105,  32, 141,   7,  41, 170, 272,  79,
       200, 263, 180, 282, 206, 245, 191, 156,  36,  18,  90, 219, 286,
       287, 202, 129, 222, 123, 146, 203, 253, 140, 244, 143, 25

In [164]:
selected_feature_idx = np.argsort(list(df.std(axis=0)))[::-1]
selected_feature_idx

array([313, 155, 236, 315, 186, 290, 167, 314, 179, 308, 302, 296,  89,
       199, 175, 153, 233,  93, 276, 128, 292, 309, 150, 301,  91, 281,
        88,  92,  55, 306,  94, 135, 177, 133, 131,  34, 182, 181, 268,
       311, 166,  40, 172, 193, 187, 163, 159, 119, 262, 254, 226, 189,
       178, 271, 246, 304, 312, 169, 250, 168, 204, 280, 205, 162, 158,
       264, 293, 103, 230,  77,  95, 154, 132, 173, 320, 164, 318, 151,
       266, 297, 112, 294, 100, 255, 270, 300, 295,  96, 211, 184, 198,
       148,  87, 307, 174, 171, 183, 149, 289, 237, 303, 278, 305,  75,
       196, 185, 126, 157, 277, 160,  81,  15, 291, 176, 310, 279, 152,
       225,  59, 231, 251, 267, 212, 223, 210,  21, 109, 284, 124, 288,
       220, 106, 134, 319, 217, 161, 165,  58,  80, 116, 221,   5, 248,
       113, 316,  83, 240, 138, 105,  32, 141,   7,  41, 170, 272,  79,
       200, 263, 180, 282, 206, 245, 191, 156,  36,  18,  90, 219, 286,
       287, 202, 129, 222, 123, 146, 203, 253, 140, 244, 143, 25

In [176]:
selected_feature_idx[0]

313

In [177]:
selected_feature_idx[1:11]

array([155, 236, 315, 186, 290, 167, 314, 179, 308, 302], dtype=int64)

In [165]:
df[df.columns.tolist()]=MinMaxScaler().fit_transform(df[df.columns.tolist()])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,311,312,313,314,315,316,317,318,319,320
0,0.100000,0.233108,0.389351,0.354701,0.393053,0.532258,0.162921,0.399619,0.376667,0.352394,...,0.117586,0.111377,0.104843,0.165162,0.125906,0.036090,0.185366,0.350191,0.132944,0.358244
1,0.128571,0.310811,0.519135,0.475214,0.533821,0.686996,0.162921,0.524263,0.451667,0.452128,...,0.140024,0.135329,0.124607,0.162502,0.166558,0.048872,0.229268,0.489323,0.184806,0.469760
2,0.150000,0.324324,0.519135,0.478632,0.497258,0.625000,0.162921,0.487631,0.450000,0.398936,...,0.142112,0.128743,0.126440,0.149972,0.165400,0.048120,0.209756,0.492920,0.159240,0.457995
3,0.142857,0.310811,0.519135,0.378632,0.389397,0.425907,0.134831,0.396289,0.298333,0.280585,...,0.139329,0.087126,0.123691,0.147767,0.160214,0.048872,0.190244,0.295572,0.142440,0.453190
4,0.157143,0.307432,0.519135,0.295726,0.347349,0.326109,0.089888,0.348716,0.310000,0.238032,...,0.140372,0.083533,0.119503,0.144057,0.158528,0.056391,0.195122,0.309733,0.139518,0.450870
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26299,0.078571,0.391892,0.013311,0.721368,0.702011,0.801411,0.286517,0.671741,0.678333,0.466755,...,0.329970,0.475749,0.217932,0.347088,0.262082,0.243609,0.102439,0.420319,0.118335,0.459486
26300,0.078571,0.347973,0.013311,0.640171,0.678245,0.688508,0.264045,0.601808,0.615000,0.434840,...,0.238998,0.400000,0.207853,0.238415,0.225433,0.227068,0.097561,0.338503,0.319942,0.456504
26301,0.085714,0.314189,0.013311,0.555556,0.632541,0.646169,0.269663,0.513321,0.513333,0.376330,...,0.163159,0.392515,0.201963,0.231065,0.148762,0.227068,0.087805,0.418971,0.453616,0.439105
26302,0.071429,0.310811,0.013311,0.552137,0.638026,0.635585,0.269663,0.480019,0.480000,0.388298,...,0.144895,0.367365,0.185733,0.229105,0.138492,0.194737,0.160976,0.589571,0.571950,0.450539


In [174]:
y = df.iloc[:,selected_feature_idx[0]].values
print(y.shape)
pd.DataFrame(y)

(26304,)


Unnamed: 0,0
0,0.104843
1,0.124607
2,0.126440
3,0.123691
4,0.119503
...,...
26299,0.217932
26300,0.207853
26301,0.201963
26302,0.185733


In [175]:
X = df.iloc[:,selected_feature_idx[1:11]].values
print(X.shape)
pd.DataFrame(X)

(26304, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.174265,0.371845,0.125906,0.266300,0.120671,0.303012,0.165162,0.198269,0.102060,0.080025
1,0.183098,0.351736,0.166558,0.236991,0.105973,0.290789,0.162502,0.200275,0.136878,0.106085
2,0.180630,0.354899,0.165400,0.252239,0.102229,0.285650,0.149972,0.193273,0.135655,0.104418
3,0.178782,0.338995,0.160214,0.254828,0.101976,0.288926,0.147767,0.179819,0.136827,0.104199
4,0.183910,0.324170,0.158528,0.251004,0.102608,0.285978,0.144057,0.189300,0.137235,0.103848
...,...,...,...,...,...,...,...,...,...,...
26299,0.315890,0.626097,0.262082,0.508301,0.268815,0.422783,0.347088,0.398466,0.280485,0.212258
26300,0.216960,0.626440,0.225433,0.333801,0.199449,0.278402,0.238415,0.189811,0.200907,0.121792
26301,0.208748,0.627152,0.148762,0.300668,0.186496,0.253465,0.231065,0.169866,0.175061,0.116659
26302,0.193561,0.543464,0.138492,0.282807,0.180450,0.246095,0.229105,0.187844,0.170779,0.112052


<h3> DataSet: metro_interstate </h3>

In [63]:
df = pd.read_csv(r'.\preprocess\source\metro_interstate\Metro_Interstate_Traffic_Volume.csv')
df

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918
...,...,...,...,...,...,...,...,...,...
48199,,283.45,0.0,0.0,75,Clouds,broken clouds,2018-09-30 19:00:00,3543
48200,,282.76,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 20:00:00,2781
48201,,282.73,0.0,0.0,90,Thunderstorm,proximity thunderstorm,2018-09-30 21:00:00,2159
48202,,282.09,0.0,0.0,90,Clouds,overcast clouds,2018-09-30 22:00:00,1450


In [64]:
df.drop(['holiday', 'date_time', 'weather_main', 'weather_description'], axis=1, inplace=True)
df

Unnamed: 0,temp,rain_1h,snow_1h,clouds_all,traffic_volume
0,288.28,0.0,0.0,40,5545
1,289.36,0.0,0.0,75,4516
2,289.58,0.0,0.0,90,4767
3,290.13,0.0,0.0,90,5026
4,291.14,0.0,0.0,75,4918
...,...,...,...,...,...
48199,283.45,0.0,0.0,75,3543
48200,282.76,0.0,0.0,90,2781
48201,282.73,0.0,0.0,90,2159
48202,282.09,0.0,0.0,90,1450


<h3> DataSet: solar_energy  </h3>

In [213]:
df = pd.read_table(r'.\preprocess\source\solar_energy\solar_AL.txt',sep=',',header=None)
# df.to_csv(r'.\preprocess\source\solar_energy\solar_AL.csv')
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,127,128,129,130,131,132,133,134,135,136
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
df = df.iloc[45:,:].reset_index()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,127,128,129,130,131,132,133,134,135,136
0,0.25,0.10,0.05,0.15,4.15,11.40,2.55,0.80,0.00,0.90,...,0.30,0.05,0.25,1.75,0.55,0.05,0.85,0.00,7.25,5.00
1,0.20,1.55,1.00,0.50,3.75,15.80,4.10,3.75,0.00,3.55,...,0.70,0.65,1.15,3.45,1.45,0.55,3.35,0.40,8.95,7.00
2,0.20,4.25,3.90,1.80,5.25,21.50,5.40,5.35,0.15,4.70,...,1.40,3.95,1.35,0.65,2.65,3.50,4.60,0.85,10.05,8.90
3,0.15,6.25,5.10,3.15,4.15,26.85,6.65,6.45,0.50,5.55,...,2.45,1.10,2.15,0.25,5.05,5.45,5.35,4.60,11.40,10.60
4,1.00,7.95,6.35,3.35,0.35,26.30,7.70,6.95,1.00,6.70,...,4.60,3.00,1.75,3.10,6.60,6.00,7.70,5.40,12.25,11.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52510,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
52511,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
52512,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
52513,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [77]:
selected_feature_idx = np.argsort(list(df.std(axis=0)))[::-1]
selected_feature_idx

array([  0,  74,   6,  41, 122,  24,  29,   1,  69,  32,  70,   5, 125,
       106, 105,  45,  71, 126,  81, 137,  23,  15,  77,  43,  79, 108,
        13, 123,  93,  67,  51,  75,  25,  17,  57,  56,  62, 110,  21,
       124,   9,  54,   4,  87,  33,  22,  76,  37, 121, 112, 135,  18,
        60, 119, 117, 129,  28,  50,  68,  39, 100,  66,  49, 118,  53,
       134,   2, 115,  42, 111,  14,  52,   3,  30,  20, 133,  40,  10,
        85,  19,  44,  63, 104,  27,  82, 132,  80,  11, 103,  78,  96,
       128,  73,  65, 107,  95,  92,   8,  48,  94,  59,  88,  34,  90,
        98,  86, 109,  64,  55,  35,  58,   7, 114,  83, 136,  46,  31,
        97,  89,  26,  99, 131,  36,  38, 101, 102,  47,  61,  84, 120,
        91,  72,  16, 113,  12, 127, 130, 116], dtype=int64)

In [91]:
df.columns = df.columns.astype(str) # 将列名转换为字符串
df[df.columns.tolist()]=MinMaxScaler().fit_transform(df[df.columns.tolist()])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,127,128,129,130,131,132,133,134,135,136
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52555,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52557,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
52558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
