In [43]:

# Numpy: 提供数组支持
import numpy as np

# SciPy: 提供矩阵支持，科学计算
import scipy

# Matplotlib: 可视化工具
import matplotlib.pyplot as plt

# Pandas: 数据DataFrame
import pandas as pd

#导入随机森林
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTENC


In [44]:
data = pd.read_csv('../../data/processed.csv', encoding='utf-8')
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7988 entries, 0 to 7987
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               7988 non-null   int64  
 1   happiness        7988 non-null   int64  
 2   survey_type      7988 non-null   int64  
 3   province         7988 non-null   int64  
 4   city             7988 non-null   int64  
 5   county           7988 non-null   int64  
 6   survey_time      7988 non-null   int64  
 7   gender           7988 non-null   int64  
 8   birth            7988 non-null   int64  
 9   nationality      7988 non-null   int64  
 10  religion         7988 non-null   int64  
 11  religion_freq    7988 non-null   int64  
 12  edu              7988 non-null   int64  
 13  income           7988 non-null   float64
 14  political        7988 non-null   int64  
 15  floor_area       7988 non-null   float64
 16  height_cm        7988 non-null   float64
 17  weight_jin    

# 欠采样

In [45]:
X = data.drop('happiness', axis=1)  # 特征
y = data['happiness']  # 标签
print(pd.DataFrame(y).value_counts())

happiness
4            4818
5            1410
3            1159
2             497
1             104
Name: count, dtype: int64


In [46]:
sampling_strategy = {1: 104, 2: 497, 3: 1159, 4: 3000, 5: 1410}
rus = RandomUnderSampler(random_state=42, sampling_strategy=sampling_strategy)
X_resampled, y_resampled = rus.fit_resample(X, y)
print(pd.DataFrame(y_resampled).value_counts())


happiness
4            3000
5            1410
3            1159
2             497
1             104
Name: count, dtype: int64


# 数据增强SMOTE

In [47]:
X_resampled = X
y_resampled = y

In [48]:
use_resampling_strategy = {
    1: 2300,  
    2: 2300,  
    3: 2500,  
    4: 3000,  
    5: 3000   
}
no_resampling_strategy = {
    1:4700,
    2:4818,
    3:4800,
    4:4818,
    5:4850

}
# define pipeline
smote_nc = SMOTENC(sampling_strategy=no_resampling_strategy, categorical_features=[2, 4, 5, 9],\
               k_neighbors = 100, random_state = 42)
# transform the dataset
X, y = smote_nc.fit_resample(X_resampled, y_resampled)

print(pd.DataFrame(y).value_counts())



happiness
5            4850
2            4818
4            4818
3            4800
1            4700
Name: count, dtype: int64


In [49]:
#随机森林测试
clf = RandomForestClassifier(n_estimators=100,random_state=42)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_pred = clf.fit(x_train, y_train).predict(x_test)
print('准确率:', clf.score(x_test, y_test))
print('报告:', classification_report(y_test, y_pred))

准确率: 0.858482701125469
报告:               precision    recall  f1-score   support

           1       0.99      0.98      0.99       926
           2       0.95      0.91      0.93       964
           3       0.84      0.83      0.83       948
           4       0.67      0.84      0.74       976
           5       0.91      0.75      0.82       984

    accuracy                           0.86      4798
   macro avg       0.87      0.86      0.86      4798
weighted avg       0.87      0.86      0.86      4798



# 主成分分析PCA

In [50]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# 加载数据集
# data = old3_data
# X 在数据增强那
# 数据标准化
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA()


# 拟合PCA模型并转换数据
X_pca = pca.fit_transform(X_scaled)
explained_variance_ratios = pca.explained_variance_ratio_
# 选择方差解释率大于 0.01 的主成分
selected_components = np.where(explained_variance_ratios > 0.01)[0]

# 创建新的 PCA 对象，仅保留选择的主成分
pca_selected = PCA(n_components=len(selected_components))
X_pca_selected = pca_selected.fit_transform(X)  # 降维
# 打印结果
print("原始数据形状：", X.shape)
print("降维后数据形状：",X_pca_selected.shape)
print("方差解释率：", [f"{x:.5f}" for x in pca.explained_variance_ratio_])




原始数据形状： (23986, 36)
降维后数据形状： (23986, 32)
方差解释率： ['0.15969', '0.07649', '0.05409', '0.04618', '0.04575', '0.04470', '0.04011', '0.03410', '0.02949', '0.02798', '0.02672', '0.02581', '0.02514', '0.02391', '0.02305', '0.02225', '0.02158', '0.02123', '0.01964', '0.01937', '0.01901', '0.01812', '0.01754', '0.01637', '0.01627', '0.01577', '0.01514', '0.01365', '0.01341', '0.01314', '0.01071', '0.01016', '0.00953', '0.00856', '0.00820', '0.00713']


In [51]:
X_pca_selected.shape

(23986, 32)

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# 使用随机森林进行分类
x=X_pca_selected
#y 在数据增强那
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)
clf = RandomForestClassifier(n_estimators=100,random_state=42)
y_pred = clf.fit(X_train,y_train).predict(X_test)
print('准确率:',clf.score(X_test,y_test))
print('报告:',classification_report(y_test,y_pred))


准确率: 0.8080450187578158
报告:               precision    recall  f1-score   support

           1       0.98      0.97      0.97       926
           2       0.86      0.86      0.86       964
           3       0.77      0.70      0.73       948
           4       0.64      0.85      0.73       976
           5       0.87      0.67      0.76       984

    accuracy                           0.81      4798
   macro avg       0.82      0.81      0.81      4798
weighted avg       0.82      0.81      0.81      4798

