## 使用整个数据集预测成绩
形式语言与编译

In [3]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.metrics import median_absolute_error, mean_squared_error, r2_score, median_absolute_error

import scipy

data = pd.read_csv('cs15_22.csv')

#划分训练-测试集，测试集比例为0.25
X_train, X_test, y_train, y_test = train_test_split(data, data['形式语言与编译'], test_size = 0.25, random_state=42)

In [4]:
import numpy as np
# Calculate mae and rmse
def evaluate_predictions(predictions, true):
    mae = np.mean(abs(predictions - true))
    rmse = np.sqrt(np.mean((predictions - true) ** 2))

    return mae, rmse

In [5]:
median_pred = X_train['形式语言与编译'].median()

median_preds = [median_pred for _ in range(len(X_test))]

true = X_test['形式语言与编译']

In [6]:
# Display the naive baseline metrics
mb_mae, mb_rmse = evaluate_predictions(median_preds, true)
print('Median Baseline  MAE: {:.4f}'.format(mb_mae))
print('Median Baseline RMSE: {:.4f}'.format(mb_rmse))

Median Baseline  MAE: 29.4298
Median Baseline RMSE: 39.8371


In [7]:
def calculate_rmsea(y_test, predictions):
    # Calculate the root mean square error of approximation (RMSEA)
    residuals = predictions - y_test
    chi_squared = np.sum(residuals ** 2)  # Chi-squared statistic
    degrees_of_freedom = len(y_test)  # Degrees of freedom
    n = len(y_test)  # Number of data points
    rmsea = np.sqrt(chi_squared / (degrees_of_freedom * (n - 1)))
    return rmsea

def evaluate(X_train, X_test, y_train, y_test):
    # Names of models
    model_name_list = ['Linear Regression', 'ElasticNet Regression',
                      'Random Forest', 'Extra Trees', 'SVM',
                       'Gradient Boosted', 'Baseline']
    X_train = X_train.drop('形式语言与编译', axis='columns')
    X_test = X_test.drop('形式语言与编译', axis='columns')
    
    # Instantiate the models
    model1 = LinearRegression()
    model2 = ElasticNet(alpha=1.0, l1_ratio=0.5)
    model3 = RandomForestRegressor(n_estimators=100)
    model4 = ExtraTreesRegressor(n_estimators=100)
    model5 = SVR(kernel='rbf', degree=3, C=1.0, gamma='auto')
    model6 = GradientBoostingRegressor(n_estimators=50)
    
    # Dataframe for results
    results = pd.DataFrame(columns=['mae', 'rmse', 'r2', 'rmsea'], index=model_name_list)

    # Train and predict with each model
    for i, model in enumerate([model1, model2, model3, model4, model5, model6]):
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        
        # Metrics
        mae = median_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        r2 = r2_score(y_test, predictions)
        rmsea = calculate_rmsea(y_test, predictions)
        # Insert results into the dataframe
        model_name = model_name_list[i]
        results.loc[model_name, :] = [mae, rmse, r2, rmsea]
    
    # Median Value Baseline Metrics
    baseline = np.median(y_train)
    baseline_mae = np.mean(abs(baseline - y_test))
    baseline_rmse = np.sqrt(np.mean((baseline - y_test) ** 2))
    baseline_residuals = y_test - baseline
    baseline_sse = np.sum(baseline_residuals ** 2)
    y_mean = np.mean(y_test)
    y_diff = y_test - y_mean
    sst = np.sum(y_diff ** 2)
    baseline_r2 = 1 - (baseline_sse / sst)
    baseline_rmsea = calculate_rmsea(y_test, baseline)
    results.loc['Baseline', :] = [baseline_mae, baseline_rmse, baseline_r2, baseline_rmsea]
    
    return results

In [8]:
results = evaluate(X_train, X_test, y_train, y_test)
print(results)

                             mae       rmse        r2     rmsea
Linear Regression       4.282028  14.925026   0.82932  0.792138
ElasticNet Regression   4.224681  14.882781  0.830285  0.789896
Random Forest              3.115  15.716339  0.810741  0.834137
Extra Trees                3.325  13.771957  0.854674   0.73094
SVM                    15.573684  40.001706 -0.226055   2.12307
Gradient Boosted        2.980118  15.294529  0.820764  0.811749
Baseline               29.429775  39.837063 -0.215983  2.114332
