In [None]:
# 导入必要库
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including library deprecation notices -- consider narrowing the filter.
warnings.filterwarnings("ignore")

# Load the data (the CSV is decoded as Latin-1)
df = pd.read_csv("bi.csv", encoding="latin1")

# 1. Data overview
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())


In [None]:

# 2. Data cleaning
# Normalise the gender labels to 'Male' / 'Female'.
# Matching is done on a trimmed, lower-cased copy of the column so that case and
# whitespace variants also resolve. Series.map() yields NaN for any key that is
# missing from the dict, so values that are not recognised fall back to the
# original entry instead of being silently turned into missing data.
gender_labels = {'m': 'Male', 'male': 'Male', 'f': 'Female', 'female': 'Female'}
gender_key = df['gender'].astype(str).str.strip().str.lower()
df['gender'] = gender_key.map(gender_labels).fillna(df['gender'])

# Normalise the spelling of the high-school education level
df['prevEducation'] = df['prevEducation'].replace({'High School': 'HighSchool', 'Highschool': 'HighSchool'})

# Fill missing Python scores with the column median.
# The result is assigned back rather than calling fillna(inplace=True) on the
# column selection: that chained in-place form is deprecated in pandas 2.x and
# leaves df unchanged when Copy-on-Write is enabled.
df['Python'] = df['Python'].fillna(df['Python'].median())


In [None]:

# 3. Visual analysis
# Age distribution: histogram with a KDE overlay on an 8x5 figure
_, age_ax = plt.subplots(figsize=(8, 5))
sns.histplot(df['Age'], bins=15, kde=True, ax=age_ax)
age_ax.set_title("Age Distribution")
plt.show()

# Gender distribution: one bar per gender label
gender_ax = sns.countplot(data=df, x='gender')
gender_ax.set_title("Gender Distribution")
plt.show()

# Score distributions: Python on the left panel, DB on the right panel
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
for score_column, panel in zip(('Python', 'DB'), axes):
    sns.histplot(df[score_column], kde=True, ax=panel)
plt.show()


In [None]:

# 4. Correlation analysis
# Pairwise correlations between the selected columns, drawn as an annotated heatmap
corr_columns = ['Age', 'entryEXAM', 'studyHOURS', 'Python', 'DB']
corr = df[corr_columns].corr()
heatmap_ax = sns.heatmap(data=corr, annot=True, cmap='coolwarm')
heatmap_ax.set_title("Correlation Heatmap")
plt.show()


In [None]:

# 5. Machine learning: predict the Python score
# Feature columns and target column
features = ['Age', 'entryEXAM', 'studyHOURS', 'DB']
X = df.loc[:, features]
y = df.loc[:, 'Python']

# Hold out 20% of the rows for testing; the seed fixes the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict on the held-out rows and report the error metrics
y_pred = model.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print("MAE:", test_mae)
print("R² Score:", test_r2)

# Feature importances, largest first, shown as a bar chart
importance = pd.Series(data=model.feature_importances_, index=features)
importance = importance.sort_values(ascending=False)
importance_ax = importance.plot.bar()
importance_ax.set_title("Feature Importance")
plt.show()