### 导入处理所需的库

In [30]:
%pip install openpyxl
%pip install pandas
%pip install numpy
%pip install sklearn 
%pip install statsmodels

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mCollecting sklearn
  Using cached sklearn-0.0.post7.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[18 lines of output][0m
  [31m   [0m The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
  [31m   [0m rather than 'sklearn' for pip commands.
  [31m   [0m 
  [31m   [0m Here is how to fix this error in the main use cases:
  [31m   [0m - use 'pip install scikit-learn' rather than 'pip install sklearn'
  [31m   [0m - replace 'sklearn' by 'scikit-learn' in your pip requirements files
  [31m   [0m   (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)

### 对所给的数据进行一定的处理 
#### 初步观察数据
> 1. 首先可以看出，一些数据对于本题可能的相关性并不大，如xlsx表中提供的分娩方式全都相同
> 2. 然后进行数据的加工，提取、清洗

In [31]:
path = './extr.xlsx'
df = pd.read_excel(path)
columns = ['母亲年龄', '婚姻状况', '教育程度', '妊娠时间（周数）', 'CBTS', 'EPDS', 'HADS', '婴儿行为特征', '整晚睡眠时间（时：分：秒）', '睡醒次数','入睡方式']
data = df[columns][:390]
data

Unnamed: 0,母亲年龄,婚姻状况,教育程度,妊娠时间（周数）,CBTS,EPDS,HADS,婴儿行为特征,整晚睡眠时间（时：分：秒）,睡醒次数,入睡方式
0,34,2,5,37.0,3,13,9,中等型,10:00:00,3.0,2.0
1,33,2,5,42.0,0,0,3,安静型,11:00:00,0.0,4.0
2,37,2,5,41.0,4,8,9,安静型,12:00:00,1.0,2.0
3,31,2,5,37.5,6,16,13,安静型,11:00:00,2.0,1.0
4,36,1,5,40.0,1,3,3,中等型,10:30:00,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...
385,27,2,5,42.0,12,20,14,中等型,09:00:00,1.0,2.0
386,31,2,3,39.0,4,7,12,中等型,11:00:00,2.0,4.0
387,31,2,5,41.6,0,3,1,安静型,10:30:00,2.0,1.0
388,27,2,3,40.0,1,10,5,安静型,06:00:00,2.0,1.0


#### 根据[匹兹堡睡眠质量指数(PSQI)](https://en.wikipedia.org/wiki/Pittsburgh_Sleep_Quality_Index)简单地计算睡眠效率即认定为睡眠质量

In [32]:
# 将整晚睡眠时间转换为datetime格式，对无效的时间数据进行清洗
data['整晚睡眠时间（时：分：秒）'] = pd.to_datetime(data['整晚睡眠时间（时：分：秒）'], format='%H:%M:%S', errors='coerce')

# 去除无效的时间数据
data = data.dropna(subset=['整晚睡眠时间（时：分：秒）'])

# 计算实际入睡时间（以小时为单位）
data['实际入睡时间'] = data['整晚睡眠时间（时：分：秒）'].apply(lambda x: x.hour + x.minute / 60 + x.second / 3600)

# 计算平均入睡时间
average_bedtime = data['实际入睡时间'].mean()

# 使用平均入睡时间作为标准入睡时间
data['标准入睡时间'] = average_bedtime

# 计算睡眠效率
data['睡眠效率'] = (data['实际入睡时间'] / data['标准入睡时间']) * 100

columns=['母亲年龄','婚姻状况',	'教育程度',	'妊娠时间（周数）',	'CBTS',	'EPDS',	'HADS',	'婴儿行为特征','睡眠效率']
data = data[columns]
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['实际入睡时间'] = data['整晚睡眠时间（时：分：秒）'].apply(lambda x: x.hour + x.minute / 60 + x.second / 3600)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['标准入睡时间'] = average_bedtime
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['睡眠效率'] = (data['实际入睡时间'] / data['标准入睡时间']) * 100


Unnamed: 0,母亲年龄,婚姻状况,教育程度,妊娠时间（周数）,CBTS,EPDS,HADS,婴儿行为特征,睡眠效率
0,34,2,5,37.0,3,13,9,中等型,98.449858
1,33,2,5,42.0,0,0,3,安静型,108.294843
2,37,2,5,41.0,4,8,9,安静型,118.139829
3,31,2,5,37.5,6,16,13,安静型,108.294843
4,36,1,5,40.0,1,3,3,中等型,103.372351
...,...,...,...,...,...,...,...,...,...
385,27,2,5,42.0,12,20,14,中等型,88.604872
386,31,2,3,39.0,4,7,12,中等型,108.294843
387,31,2,5,41.6,0,3,1,安静型,103.372351
388,27,2,3,40.0,1,10,5,安静型,59.069915


#### 稍稍处理数据，方便模型计算拟合

In [33]:
# 创建一个映射关系将字符串映射到数值
behavior_mapping = {'安静型': 0, '中等型': 1, '矛盾型': 2} 

# 使用映射关系将字符串转换为数值
data['婴儿行为特征']=data['婴儿行为特征'].map(behavior_mapping)
# display(data['婴儿行为特征'].dtype)
display(data)


# 转换'婴儿行为特征'和'睡眠效率'列的数据类型为浮点数
data['婴儿行为特征'] = data['婴儿行为特征'].astype(float)
data['睡眠效率'] = data['睡眠效率'].astype(float)

# 提取自变量和因变量
X = data[['母亲年龄', '婚姻状况', '教育程度', '妊娠时间（周数）', 'CBTS', 'EPDS', 'HADS']]
y_baby_behavior = data['婴儿行为特征']
y_sleep_efficiency = data['睡眠效率']

# 为自变量添加常数项
X = sm.add_constant(X)

# 拟合婴儿行为特征的线性回归模型
model_baby_behavior = sm.OLS(y_baby_behavior, X).fit()
print("婴儿行为特征模型拟合结果：")
print(model_baby_behavior.summary())

# 拟合睡眠效率的线性回归模型
model_sleep_efficiency = sm.OLS(y_sleep_efficiency, X).fit()
print("\n睡眠效率模型拟合结果：")
print(model_sleep_efficiency.summary())


Unnamed: 0,母亲年龄,婚姻状况,教育程度,妊娠时间（周数）,CBTS,EPDS,HADS,婴儿行为特征,睡眠效率
0,34,2,5,37.0,3,13,9,1,98.449858
1,33,2,5,42.0,0,0,3,0,108.294843
2,37,2,5,41.0,4,8,9,0,118.139829
3,31,2,5,37.5,6,16,13,0,108.294843
4,36,1,5,40.0,1,3,3,1,103.372351
...,...,...,...,...,...,...,...,...,...
385,27,2,5,42.0,12,20,14,1,88.604872
386,31,2,3,39.0,4,7,12,1,108.294843
387,31,2,5,41.6,0,3,1,0,103.372351
388,27,2,3,40.0,1,10,5,0,59.069915


婴儿行为特征模型拟合结果：
                            OLS Regression Results                            
Dep. Variable:                 婴儿行为特征   R-squared:                       0.037
Model:                            OLS   Adj. R-squared:                  0.019
Method:                 Least Squares   F-statistic:                     2.098
Date:                Tue, 22 Aug 2023   Prob (F-statistic):             0.0429
Time:                        00:44:09   Log-Likelihood:                -359.96
No. Observations:                 389   AIC:                             735.9
Df Residuals:                     381   BIC:                             767.6
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0411      0.744      