## 使用整个数据集预测成绩
概率统计与随机过程

In [15]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import median_absolute_error, mean_squared_error, r2_score, median_absolute_error

import scipy

data = pd.read_csv('cs15_22.csv')

#划分训练-测试集，测试集比例为0.25
X_train, X_test, y_train, y_test = train_test_split(data, data['概率统计与随机过程'], test_size = 0.25, random_state=42)

In [16]:
import numpy as np
# Calculate mae and rmse
def evaluate_predictions(predictions, true):
    mae = np.mean(abs(predictions - true))
    rmse = np.sqrt(np.mean((predictions - true) ** 2))

    return mae, rmse

In [17]:
median_pred = X_train['概率统计与随机过程'].median()

median_preds = [median_pred for _ in range(len(X_test))]

true = X_test['概率统计与随机过程']

In [18]:
# Display the naive baseline metrics
mb_mae, mb_rmse = evaluate_predictions(median_preds, true)
print('Median Baseline  MAE: {:.4f}'.format(mb_mae))
print('Median Baseline RMSE: {:.4f}'.format(mb_rmse))

Median Baseline  MAE: 20.3258
Median Baseline RMSE: 32.5037


In [19]:
def calculate_rmsea(y_test, predictions):
    # Calculate the root mean square error of approximation (RMSEA)
    residuals = predictions - y_test
    chi_squared = np.sum(residuals ** 2)  # Chi-squared statistic
    degrees_of_freedom = len(y_test)  # Degrees of freedom
    n = len(y_test)  # Number of data points
    rmsea = np.sqrt(chi_squared / (degrees_of_freedom * (n - 1)))
    return rmsea

def evaluate(X_train, X_test, y_train, y_test):
    # Names of models
    model_name_list = ['Linear Regression', 'ElasticNet Regression',
                      'Random Forest', 'Extra Trees', 'SVM',
                       'Gradient Boosted', 'Baseline']
    X_train = X_train.drop('概率统计与随机过程', axis='columns')
    X_test = X_test.drop('概率统计与随机过程', axis='columns')
    
    # Instantiate the models
    model1 = LinearRegression()
    model2 = ElasticNet(alpha=1.0, l1_ratio=0.5)
    model3 = RandomForestRegressor(n_estimators=100)
    model4 = ExtraTreesRegressor(n_estimators=100)
    model5 = SVR(kernel='rbf', degree=3, C=1.0, gamma='auto')
    model6 = GradientBoostingRegressor(n_estimators=50)
    
    # Dataframe for results
    results = pd.DataFrame(columns=['mae', 'rmse', 'r2', 'rmsea'], index=model_name_list)

    # Train and predict with each model
    for i, model in enumerate([model1, model2, model3, model4, model5, model6]):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        # Metrics
        mae = median_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)
        rmsea = calculate_rmsea(y_test, predictions)
        # Insert results into the dataframe
        model_name = model_name_list[i]
        results.loc[model_name, :] = [mae, rmse, r2, rmsea]
    
    # Median Value Baseline Metrics
    baseline = np.median(y_train)
    baseline_mae = np.mean(abs(baseline - y_test))
    baseline_rmse = np.sqrt(np.mean((baseline - y_test) ** 2))
    baseline_residuals = y_test - baseline
    baseline_sse = np.sum(baseline_residuals ** 2)
    y_mean = np.mean(y_test)
    y_diff = y_test - y_mean
    sst = np.sum(y_diff ** 2)
    baseline_r2 = 1 - (baseline_sse / sst)
    baseline_rmsea = calculate_rmsea(y_test, baseline)
    results.loc['Baseline', :] = [baseline_mae, baseline_rmse, baseline_r2, baseline_rmsea]
    
    return results

In [20]:
results = evaluate(X_train, X_test, y_train, y_test)
print(results)

                             mae       rmse        r2     rmsea
Linear Regression       4.449837  10.785893  0.876986  0.572456
ElasticNet Regression   4.368515   10.76372  0.877491  0.571279
Random Forest               3.66   9.168348  0.911116  0.486605
Extra Trees                 3.64   8.830014  0.917555  0.468648
SVM                    10.628571  32.438434 -0.112658  1.721653
Gradient Boosted        3.824501   9.452611  0.905519  0.501692
Baseline               20.325843  32.503716 -0.117141  1.725118


## 只使用父节点课程预测成绩
概率统计与随机过程
### 父节点课程
* 线性代数与解析几何
* 电路
* 大学物理Ⅱ-2
* 大学物理实验Ⅰ-2
* 离散数学a
* 算法分析与设计
* 数字逻辑电路

In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import median_absolute_error, mean_squared_error, r2_score, median_absolute_error

import scipy

data = pd.read_csv('cs15_22_parentsOnly_Probability.csv')

#划分训练-测试集，测试集比例为0.25
X_train, X_test, y_train, y_test = train_test_split(data, data['概率统计与随机过程'], test_size = 0.25, random_state=42)

In [22]:
X_train.head()

Unnamed: 0,大学物理II-2,大学物理实验I-1,操作系统设计专题实验,数字逻辑电路,概率统计与随机过程,电路,离散数学a,线性代数与解析几何
974,84,83,0,68,61,70,81.0,81
620,88,87,87,90,86,82,89.0,0
777,87,87,92,84,72,78,82.0,91
427,92,83,98,83,73,88,93.0,81
199,91,98,98,78,83,93,96.0,86


In [23]:
import numpy as np
# Calculate mae and rmse
def evaluate_predictions(predictions, true):
    mae = np.mean(abs(predictions - true))
    rmse = np.sqrt(np.mean((predictions - true) ** 2))

    return mae, rmse

In [24]:
median_pred = X_train['概率统计与随机过程'].median()

median_preds = [median_pred for _ in range(len(X_test))]

true = X_test['概率统计与随机过程']

In [25]:
# Display the naive baseline metrics
mb_mae, mb_rmse = evaluate_predictions(median_preds, true)
print('Median Baseline  MAE: {:.4f}'.format(mb_mae))
print('Median Baseline RMSE: {:.4f}'.format(mb_rmse))

Median Baseline  MAE: 20.9635
Median Baseline RMSE: 33.1548


In [26]:
def calculate_rmsea(y_test, predictions):
    # Calculate the root mean square error of approximation (RMSEA)
    residuals = predictions - y_test
    chi_squared = np.sum(residuals ** 2)  # Chi-squared statistic
    degrees_of_freedom = len(y_test)  # Degrees of freedom
    n = len(y_test)  # Number of data points
    rmsea = np.sqrt(chi_squared / (degrees_of_freedom * (n - 1)))
    return rmsea

def evaluate(X_train, X_test, y_train, y_test):
    # Names of models
    model_name_list = ['Linear Regression', 'ElasticNet Regression',
                      'Random Forest', 'Extra Trees', 'SVM',
                       'Gradient Boosted', 'Baseline']
    X_train = X_train.drop('概率统计与随机过程', axis='columns')
    X_test = X_test.drop('概率统计与随机过程', axis='columns')
    
    # Instantiate the models
    model1 = LinearRegression()
    model2 = ElasticNet(alpha=1.0, l1_ratio=0.5)
    model3 = RandomForestRegressor(n_estimators=100)
    model4 = ExtraTreesRegressor(n_estimators=100)
    model5 = SVR(kernel='rbf', degree=3, C=1.0, gamma='auto')
    model6 = GradientBoostingRegressor(n_estimators=50)
    
    # Dataframe for results
    results = pd.DataFrame(columns=['mae', 'rmse', 'r2', 'rmsea'], index=model_name_list)

    # Train and predict with each model
    for i, model in enumerate([model1, model2, model3, model4, model5, model6]):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        # Metrics
        mae = median_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)
        rmsea = calculate_rmsea(y_test, predictions)
        # Insert results into the dataframe
        model_name = model_name_list[i]
        results.loc[model_name, :] = [mae, rmse, r2, rmsea]
    
    # Median Value Baseline Metrics
    baseline = np.median(y_train)
    baseline_mae = np.mean(abs(baseline - y_test))
    baseline_rmse = np.sqrt(np.mean((baseline - y_test) ** 2))
    baseline_residuals = y_test - baseline
    baseline_sse = np.sum(baseline_residuals ** 2)
    y_mean = np.mean(y_test)
    y_diff = y_test - y_mean
    sst = np.sum(y_diff ** 2)
    baseline_r2 = 1 - (baseline_sse / sst)
    baseline_rmsea = calculate_rmsea(y_test, baseline)
    results.loc['Baseline', :] = [baseline_mae, baseline_rmse, baseline_r2, baseline_rmsea]
    
    return results

In [27]:
results = evaluate(X_train, X_test, y_train, y_test)
print(results)

                             mae       rmse        r2     rmsea
Linear Regression       5.011523  11.712783  0.861423   0.62165
ElasticNet Regression   5.017783  11.714901  0.861373  0.621762
Random Forest              4.675  11.251693  0.872119  0.597178
Extra Trees                 4.56  10.846138  0.881172  0.575653
SVM                    11.167395  31.278561  0.011759  1.660094
Gradient Boosted        4.564793  10.936817  0.879177  0.580466
Baseline               20.963483   33.15477 -0.110354  1.759672
