导入必要库

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Optional: plotting defaults so that figures are easier to read.
# Apply the seaborn "whitegrid" style first, then override matplotlib defaults.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (12, 6),  # default figure size (width, height) in inches
    'font.size': 12,            # default font size
})

读取数据

In [5]:
# Path to the CSV file, relative to the notebook's working directory.
data_path = '../data/energydata_complete.csv'

try:
    df = pd.read_csv(data_path)
except FileNotFoundError:
    # Leave df as None so that later cells can detect the failed load
    # and skip their work instead of raising.
    df = None
    print(f"错误：文件未找到。请检查路径是否正确：{data_path}")
else:
    print("数据加载成功！")
    # Confirm the load by reporting the row and column counts.
    print(f"数据集包含 {len(df)} 行和 {len(df.columns)} 列。")

数据加载成功！
数据集包含 19735 行和 29 列。


In [6]:
# Initial overview of the loaded DataFrame; skipped when loading failed.
if df is None:
    print("DataFrame 未加载，无法进行初始数据概览。")
else:
    # 3.1 Preview the first five rows to get a feel for structure and content.
    # display() is used instead of print() for the rich notebook rendering.
    print("--- 3.1 查看前几行数据 ---")
    display(df.head(5))

    # 3.2 Report the number of rows and columns.
    print("\n--- 3.2 查看数据维度 ---")
    print(f"数据集包含 {len(df)} 行和 {len(df.columns)} 列。")

    # 3.3 List every column name as a plain Python list.
    print("\n--- 3.3 查看列名 ---")
    print("数据集列名：")
    print(df.columns.to_list())

    # 3.4 Dtypes, non-null counts and memory usage per column. Useful for
    # spotting columns that need a type conversion (notably 'date') and for
    # a first look at missing values.
    print("\n--- 3.4 查看数据类型和非空值信息 ---")
    df.info()

    # 3.5 Summary statistics of the numeric columns (count, mean, std,
    # min, max, quartiles); the 'Appliances' column is of particular interest.
    print("\n--- 3.5 初步描述性统计 ---")
    display(df.describe())

--- 3.1 查看前几行数据 ---


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,11-01-2016 17:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,11-01-2016 17:10,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.48,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,11-01-2016 17:20,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.37,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,11-01-2016 17:30,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.41039,45.41039
4,11-01-2016 17:40,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.13,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097



--- 3.2 查看数据维度 ---
数据集包含 19735 行和 29 列。

--- 3.3 查看列名 ---
数据集列名：
['date', 'Appliances', 'lights', 'T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3', 'T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6', 'T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9', 'T_out', 'Press_mm_hg', 'RH_out', 'Windspeed', 'Visibility', 'Tdewpoint', 'rv1', 'rv2']

--- 3.4 查看数据类型和非空值信息 ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19735 entries, 0 to 19734
Data columns (total 29 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   date         19735 non-null  object 
 1   Appliances   19735 non-null  int64  
 2   lights       19735 non-null  int64  
 3   T1           19735 non-null  float64
 4   RH_1         19735 non-null  float64
 5   T2           19735 non-null  float64
 6   RH_2         19735 non-null  float64
 7   T3           19735 non-null  float64
 8   RH_3         19735 non-null  float64
 9   T4           19735 non-null  float64
 10  RH_4         19735 non-null  float64
 11  T

Unnamed: 0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
count,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,...,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0,19735.0
mean,97.694958,3.801875,21.686571,40.259739,20.341219,40.42042,22.267611,39.2425,20.855335,39.026904,...,19.485828,41.552401,7.41258,755.522602,79.750418,4.039752,38.330834,3.760995,24.988033,24.988033
std,102.524891,7.935988,1.606066,3.979299,2.192974,4.069813,2.006111,3.254576,2.042884,4.341321,...,2.014712,4.151497,5.318464,7.399441,14.901088,2.451221,11.794719,4.195248,14.496634,14.496634
min,10.0,0.0,16.79,27.023333,16.1,20.463333,17.2,28.766667,15.1,27.66,...,14.89,29.166667,-5.0,729.3,24.0,0.0,1.0,-6.6,0.005322,0.005322
25%,50.0,0.0,20.76,37.333333,18.79,37.9,20.79,36.9,19.53,35.53,...,18.0,38.5,3.67,750.933333,70.333333,2.0,29.0,0.9,12.497889,12.497889
50%,60.0,0.0,21.6,39.656667,20.0,40.5,22.1,38.53,20.666667,38.4,...,19.39,40.9,6.92,756.1,83.666667,3.666667,40.0,3.43,24.897653,24.897653
75%,100.0,0.0,22.6,43.066667,21.5,43.26,23.29,41.76,22.1,42.156667,...,20.6,44.338095,10.4,760.933333,91.666667,5.5,40.0,6.57,37.583769,37.583769
max,1080.0,70.0,26.26,63.36,29.856667,56.026667,29.236,50.163333,26.2,51.09,...,24.5,53.326667,26.1,772.3,100.0,14.0,66.0,15.5,49.99653,49.99653


把 date 列转换为 datetime 对象