In [1]:
import warnings
warnings.simplefilter('ignore')

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pylab as pl
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge, ARDRegression, Lasso, Ridge
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [15]:
df = pd.read_csv('../datasets/SEHIR/processed_dataset.csv')
df.head()

Unnamed: 0,Course Code,Course Title,Student Number,Department Code,Course Level,Letter Grade,Status,GPA,Standing,Completed Credits,Completed ECTS,GPA Student - Subject,Avg. Grade - Taken,Avg. Grade - Students_Subject,Semester,Theoritical,Practical,Course Credit,ECTS,Course Year
0,UNI 111,Critical Reading & Writing in Turkish I,240,SOC,Undergraduate,F,Unsuccessful,2.62,Freshman,18,30,2.616667,2.113636,2.703226,2011 - Fall,3,0,3,5,1
1,UNI 107,World Civilizations& Global Encounters I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,2.986364,2.703226,2011 - Fall,3,0,3,5,1
2,UNI 105,Understanding Society and Culture I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,3.211538,2.703226,2011 - Fall,3,0,3,5,1
3,UNI 103,Understanding Science and Technology,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,3.176,2.703226,2011 - Fall,3,0,3,5,1
4,UNI 105,Understanding Society and Culture I,240,SOC,Undergraduate,A,Successful,2.62,Freshman,18,30,2.616667,3.211538,2.703226,2011 - Fall,3,0,3,5,1


In [16]:
df_raw = df.copy()

In [17]:
df.drop([df.columns[0], df.columns[1], df.columns[2]], inplace=True, axis=1)   # dropping course details

In [18]:
# applying one-hot encoding on categorical features
df = pd.concat([df, pd.get_dummies(df['Course Year'], prefix='Course Year'), pd.get_dummies(df['Department Code'], prefix='Department Code'), pd.get_dummies(df['Course Level'], prefix='Course Level'), pd.get_dummies(df['Standing'], prefix='Standing'), pd.get_dummies(df['Status'], prefix='Status')], axis=1)
df.drop(['Course Year', 'Department Code', 'Course Level', 'Status', 'Standing'], axis=1, inplace=True)

In [19]:
columns = df.columns

In [20]:
le = LabelEncoder()
le.fit(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F'])

In [7]:
def standardize(X_train, X_test):
    X_train_cols = X_train.columns
    X_test_cols = X_test.columns
    sc = StandardScaler()
    fitted_sc = sc.fit(X_train)
    X_train_std = pd.DataFrame(fitted_sc.transform(X_train), columns=X_train_cols)
    X_test_std = pd.DataFrame(fitted_sc.transform(X_test), columns=X_test_cols)
    return X_train_std, X_test_std

In [8]:
def get_train_data(df, train_sem, columns):
    dataFrame = pd.DataFrame(columns=columns)
    
    # extracting instances from the dataset which should be in training data
    for sem in train_sem:
        dataFrame = pd.concat([dataFrame, df[df.iloc[:, 7] == sem]], ignore_index=True)
    
    X_train = dataFrame.drop('Semester', axis=1)
    y_train = le.transform(X_train.pop('Letter Grade'))
    return X_train, y_train

In [9]:
def fit_cluster(n_clusters, X_train, y_train, reg_model, cluster_model):
    reg_models = {}   # {cluster_label: fitted regression model (object), ...}
    clusters_dataset = {}  # {cluster_label: {'X': dataset (pd.DataFrame), 'y': true target values (list)}, ...}
    one_hot_depts = list(X_train.columns[16:51])
    
    # fitting a clustering model based on GPA, Completed Credits and Departments
    fitted_cluster_model = cluster_model(n_clusters=n_clusters).fit(X_train[['GPA', 'Completed Credits'] + one_hot_depts])
    print(['GPA', 'Completed Credits'] + one_hot_depts)
    cluster_labels = fitted_cluster_model.labels_
    
    # splitting the main dataset into sub-dataFrames based on their cluster label
    for i in range(len(cluster_labels)):
        clusters_dataset.setdefault(cluster_labels[i], {})
        clusters_dataset[cluster_labels[i]].setdefault('X', pd.DataFrame(columns=X_train.columns))
        clusters_dataset[cluster_labels[i]].setdefault('y', [])

        df_row = list(X_train.iloc[i, :])   # getting the corresponding row from main dataset
        cluster_data = clusters_dataset[cluster_labels[i]]   # getting the corresponding cluster data structure
        cluster_data['X'].loc[len(cluster_data['X'])] = df_row
        cluster_data['y'].append(y_train[i])
    
    # fitting a regression model to each clustering and storing the fitted models
    for cluster_label in clusters_dataset:
        reg_models.setdefault(cluster_label, 0)
        fitted_reg_model = reg_model.fit(clusters_dataset[cluster_label]['X'], clusters_dataset[cluster_label]['y'])
        reg_models[cluster_label] = fitted_reg_model

    print(fitted_cluster_model)
    return reg_models, fitted_cluster_model

In [10]:
def cluster_test_data(cluster_model, X_test, y_test):
    test_dfs = {}   # splitting the test dataset into sub-dataFrames based on their predicted cluster label
    one_hot_depts = list(X_test.columns[16:51])
    # predicting the cluster labels of test data using a cluster model fitted on the whole dataset so far
    predicted_clusters = cluster_model.predict(X_test[['GPA', 'Completed Credits'] + one_hot_depts])
    
    # splitting the test dataset based on their cluster label
    for i in range(len(predicted_clusters)):
        test_dfs.setdefault(predicted_clusters[i], {})
        test_dfs[predicted_clusters[i]].setdefault('X', pd.DataFrame(columns=X_test.columns))
        test_dfs[predicted_clusters[i]].setdefault('y', [])
        
        df_row = list(X_test.iloc[i, :])
        cluster_data = test_dfs[predicted_clusters[i]]
        cluster_data['X'].loc[len(cluster_data['X'])] = df_row
        cluster_data['y'].append(y_test[i])
        
    return test_dfs

In [11]:
def get_error_score(df, columns, reg_model, cluster_model):
    error_scores = {}   # storing error scores in a dict with shape: 
                        # {num_clusters (k=2,3,...,7): 
                          # {num_training_semesters (N=1,2,...,7): 
                              # {cluster_label: [RMSE, MAE], ...}, 
                          # ...}, 
                        #...}

    sorted_semesters = sorted(set(df.iloc[:, 7]))   # sorting semesters in a time series manner
    for num_clusters in range(10, 31, 5):
        error_scores.setdefault(str(num_clusters), {})
        for sem_idx in range(1, len(sorted_semesters)):
            error_scores[str(num_clusters)].setdefault(str(sem_idx), {'y_true': [], 'y_pred': []})
            
            # preparing the training data from the beginning of the dataset so far
            training_sem = sorted_semesters[:sem_idx]
            test_sem = sorted_semesters[sem_idx]
            X_train, y_train = get_train_data(df, training_sem, columns)
            
            # preparing the test dataframe
            X_test = df[df.iloc[:, 7] == test_sem]
            X_test.drop('Semester', axis=1, inplace=True)
            X_test.index = range(len(X_test))
            y_test = le.transform(X_test.pop('Letter Grade'))
            
            # standardizing the dataset for faster optimization
            X_train, X_test = standardize(X_train, X_test)
            
            # getting the cluster model fitted on training data and each clusters' regression model in a dict
            reg_models, fitted_cluster_model = fit_cluster(num_clusters, X_train, y_train, reg_model, cluster_model)
            
            # getting the clustered test data
            test_dfs = cluster_test_data(fitted_cluster_model, X_test, y_test)
            
            # for each cluster label in test dataFrames:
                # 1. retrieve the regression model fitted earlier on the same cluster label
                # 2. predict the test data from the same cluster label with the retrieved regression model
                
            for cluster_label in test_dfs:
                fitted_reg_model = reg_models[cluster_label]
                y_true = test_dfs[cluster_label]['y']
                y_pred = fitted_reg_model.predict(test_dfs[cluster_label]['X'])
                y_pred = list(y_pred)
                                
                error_scores[str(num_clusters)][str(sem_idx)]['y_true'] += y_true
                error_scores[str(num_clusters)][str(sem_idx)]['y_pred'] += y_pred
                
    return error_scores

In [12]:
def plot_error_scores(scores, title):
    best_scores = {}
    for n_clusters in scores:
        best_scores.setdefault(n_clusters, [100, 100, -100, -100])   # [rmse_train, rmse_test, r2_train, r2_test]
        for n_training_sem in scores[n_clusters]:
            clust_scores = scores[n_clusters][n_training_sem].values()
            for s in clust_scores:
                if s['train'][0] < best_scores[n_clusters][0]:
                    best_scores[n_clusters][0] = s['train'][0]
                elif s['train'][1] > best_scores[n_clusters][2]:
                    best_scores[n_clusters][2] = s['train'][1]
                elif s['test'][0] < best_scores[n_clusters][1]:
                    best_scores[n_clusters][1] = s['test'][0]
                elif s['test'][1] > best_scores[n_clusters][3]:
                    best_scores[n_clusters][3] = s['test'][1]
        
    fig, ax = plt.subplots(figsize=(15,5))
    x = np.arange(2,8)
    ax.plot(x, [best_scores[k][0] for k in sorted(best_scores)], marker='o', label='RMSE train', linestyle='--', linewidth=3, mew=3)
    ax.plot(x, [best_scores[k][1] for k in sorted(best_scores)], marker='x', label='RMSE test', linestyle='-', linewidth=3, mew=3)
    ax.plot(x, [best_scores[k][2] for k in sorted(best_scores)], marker='v', label='R\u00b2 train', linestyle='dotted', linewidth=3, mew=3)
    ax.plot(x, [best_scores[k][3] for k in sorted(best_scores)], marker='d', label='R\u00b2 test', linestyle='dashdot', linewidth=3, mew=3)
    
    ax.set_title(title, fontfamily='serif', fontsize=20)
    ax.set_yticklabels([round(i,1) for i in ax.get_yticks()], fontfamily='serif', fontsize=20)
    ax.set_xticks(x)
    ax.set_xticklabels(['k={}'.format(i) for i in range(2, 8)], fontfamily='serif', fontsize=20)
    ax.set_xlabel('Number of Clusters (k)', fontsize=20, fontfamily='serif')
    ax.set_ylabel('Error', fontsize=20, fontfamily='serif')
    ax.grid(True)
    ax.legend(prop={'size': 20, 'family': 'serif'}, loc='center', bbox_to_anchor=(0.5, -0.25), ncol=4)
    
#     plt.savefig('sample.svg', bbox='tight')

In [13]:
with open('../hyperparameters/tuned_hyperparams (student based).json') as fr:
    tuned_hyperparams = json.load(fr)

In [29]:
model_errors = {}

In [31]:
with open('../results/clustering_student_based_regression_results (with KMeans).json') as fr:
    results = json.load(fr)

In [33]:
hps = tuned_hyperparams['SVR']
errors = get_error_score(df, columns, SVR(C=hps['C'], kernel=hps['kernel'], epsilon=hps['epsilon']), KMeans)
model_errors['SVR'] = errors

['GPA', 'Completed Credits', 'Department Code_BGM/NT', 'Department Code_BGM/T', 'Department Code_CS', 'Department Code_CTV', 'Department Code_CULT/T', 'Department Code_ECE/NT', 'Department Code_ECE/T', 'Department Code_EE', 'Department Code_HIST', 'Department Code_HIST PhD', 'Department Code_HIST/T', 'Department Code_HUK', 'Department Code_IE', 'Department Code_ISE/NT', 'Department Code_ISE/T', 'Department Code_ISS', 'Department Code_ITM', 'Department Code_KHUK/T', 'Department Code_LIT', 'Department Code_MBA/NT', 'Department Code_MGT', 'Department Code_MTS/T', 'Department Code_PHIL', 'Department Code_POLS', 'Department Code_POLS/NT', 'Department Code_POLS/T', 'Department Code_PSY', 'Department Code_SOC', 'Department Code_SOC/T', 'Department Code_ÖHUK PhD', 'Department Code_ÖHUK/NT', 'Department Code_ÖHUK/T', 'Department Code_İHP', 'Course Level_Graduate', 'Course Level_Undergraduate']
KMeans(n_clusters=10)
['GPA', 'Completed Credits', 'Department Code_BGM/NT', 'Department Code_BGM/T', 

KeyboardInterrupt: 

In [None]:
plot_error_scores(results['SVR'], 'SupportVectorRegressor')

In [None]:
hps = tuned_hyperparams['BayesianRidge']
errors = get_error_score(df, columns, BayesianRidge(lambda_1=hps['lambda_1'], lambda_2=hps['lambda_2'],
                                                    alpha_1=hps['alpha_1'], alpha_2=hps['alpha_2']), KMeans)
model_errors['BayesianRidge'] = errors

In [None]:
plot_error_scores(results['BayesianRidge'], 'BayesianRidge')

In [None]:
hps = tuned_hyperparams['Lasso']
errors = get_error_score(df, columns, Lasso(alpha=hps['alpha']), KMeans)
model_errors['Lasso'] = errors

In [None]:
plot_error_scores(results['Lasso'], 'Lasso')

In [None]:
hps = tuned_hyperparams['Ridge']
errors = get_error_score(df, columns, Ridge(alpha=hps['alpha']), KMeans)
model_errors['Ridge'] = errors

In [None]:
plot_error_scores(results['Ridge'], 'Ridge')

In [None]:
hps = tuned_hyperparams['BaggingRegressor']
errors = get_error_score(df, columns, BaggingRegressor(n_estimators=hps['n_estimators']), KMeans)
model_errors['BaggingRegressor'] = errors

In [None]:
plot_error_scores(results['BaggingRegressor'], 'BaggingRegressor')

In [None]:
hps = tuned_hyperparams['AdaBoostRegressor']
scores = get_error_score(df, columns, AdaBoostRegressor(n_estimators=hps['n_estimators'], learning_rate=hps['learning_rate']), KMeans)
model_errors['AdaBoostRegressor'] = errors

In [None]:
plot_error_scores(results['AdaBoostRegressor'], 'AdaBoostRegressor')

In [None]:
hps = tuned_hyperparams['RandomForestRegressor']
errors = get_error_score(df, columns, RandomForestRegressor(n_estimators=hps['n_estimators']), KMeans)
model_errors['RandomForestRegressor'] = errors

In [None]:
plot_error_scores(results['RandomForestRegressor'], 'RandomForestRegressor')

In [None]:
hps = tuned_hyperparams['GradientBoostingRegressor']
errors = get_error_score(df, columns, GradientBoostingRegressor(learning_rate=hps['learning_rate'], loss=hps['loss'],
                                                                n_estimators=hps['n_estimators'], max_depth=hps['max_depth']), KMeans)
model_errors['GradientBoostingRegressor'] = errors

In [None]:
plot_error_scores(results['GradientBoostingRegressor'], 'GradientBoostingRegressor')

In [None]:
with open('clustering_student_based_regression_results (with KMeans).json', 'w') as fw:
    json.dump(model_errors, fw)

### yeni: maskeli subset uzerinde calisma (cf ile regresyon karsilastirmasi icin)

In [21]:
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso # Hangisini kullanıyorsan
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler

# 1. MASKEYİ YÜKLE
with open('sota_subset_mask.json', 'r') as f:
    valid_mask = json.load(f)

def get_error_score_masked_regression(df_encoded, df_raw, columns, model, mask, apply_scaling=False):
    """
    Regresyon modelleri için Common Subset hata hesabı.
    """
    error_scores = {}
    
    # Semester sütun indeksi kontrolü (Genelde 7. sütun)
    sorted_semesters = sorted(set(df_encoded.iloc[:, 7])) 
    
    print(f"Regression ({type(model).__name__}) Subset Evaluation running...")
    
    for sem_idx in range(1, len(sorted_semesters)):
        training_sem = sorted_semesters[:sem_idx]
        test_sem = sorted_semesters[sem_idx]
        
        # --- TRAIN VERİSİ ---
        X_train, y_train = get_train_data(df_encoded, training_sem, columns)
        
        # --- TEST VERİSİ ---
        # 1. Kimlik kontrolü için raw veri
        mask_test_rows = df_encoded.iloc[:, 7] == test_sem
        test_df_raw = df_raw[mask_test_rows].copy()
        
        if len(test_df_raw) == 0: continue
            
        # 2. Model için encoded veri
        X_test = df_encoded.loc[mask_test_rows, columns].copy()
        
        # Sütun eşitleme (Gereksiz sütunları atar)
        if hasattr(X_train, 'columns'):
             X_test = X_test[X_train.columns]
        
        # Gerçek Notlar (Regresyon hedefi)
        # NOT: Eğer y_train 'Label Encoded' (0,1,2..) ise y_test de öyle olmalı.
        # Eğer y_train 'Scale' edilmiş notlar ise (0.0 - 4.0 arası), ona dikkat et.
        # Senin yapında 'Letter Grade' üzerinden le.transform kullanılıyordu:
        y_test_full = le.transform(test_df_raw['Letter Grade'])
        
        # --- SCALING (Opsiyonel ama Regresyon için genelde iyidir) ---
        if apply_scaling:
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)
            
        # --- MODEL EĞİTİMİ ve TAHMİN ---
        model.fit(X_train, y_train)
        y_pred_full = model.predict(X_test)
        
        # --- MASKELEME (Common Subset Filtresi) ---
        y_true_filtered = []
        y_pred_filtered = []
        
        test_df_raw = test_df_raw.reset_index(drop=True)
        
        for i, row in test_df_raw.iterrows():
            student_id = str(row['Student Number'])
            course_code = row['Course Code']
            
            is_valid = False
            
            # Sadece CF'in cevaplayabildiği soruları al
            if test_sem in mask:
                if student_id in mask[test_sem]:
                    if course_code in mask[test_sem][student_id]:
                        is_valid = True
            
            if is_valid:
                y_true_filtered.append(y_test_full[i])
                y_pred_filtered.append(y_pred_full[i])
        
        # --- METRİK HESABI ---
        if len(y_true_filtered) > 0:
            rmse_test = round(np.sqrt(mean_squared_error(y_true_filtered, y_pred_filtered)), 3)
            mae_test = round(mean_absolute_error(y_true_filtered, y_pred_filtered), 3)
        else:
            rmse_test, mae_test = 0, 0
            
        error_scores.setdefault(sem_idx, {})
        error_scores[sem_idx]['rmse'] = [0, rmse_test]
        error_scores[sem_idx]['mae'] = [0, mae_test]
        
        # print(f"Sem {sem_idx}: RMSE {rmse_test}") # İstersen aç

    return error_scores

# --- KAYDETME FONKSİYONU (Diğerleriyle aynı) ---
def save_subset_results(results_dict, model_name, filename):
    test_rmse_list = []
    test_mae_list = []
    
    for sem in results_dict:
        rmse_val = results_dict[sem]['rmse'][1]
        mae_val = results_dict[sem]['mae'][1]
        if rmse_val > 0:
            test_rmse_list.append(rmse_val)
            test_mae_list.append(mae_val)
    
    avg_rmse = np.mean(test_rmse_list)
    avg_mae = np.mean(test_mae_list)
    
    final_output = {
        "Model": model_name,
        "RMSE": float(avg_rmse),
        "MAE": float(avg_mae),
        "Type": "Common Subset Evaluation"
    }
    
    with open(filename, 'w') as f:
        json.dump(final_output, f, indent=4)
        
    print(f"✅ {model_name} Kaydedildi. RMSE: {avg_rmse:.4f}")

# --- ÇALIŞTIR ---
# Gradient için örnek:
reg_model = GradientBoostingRegressor() # Veya Ridge(), Lasso()

# df_encoded: Sayısal veri
# original_df: ID'li ham veri
# feature_cols: Kullanılacak sütunlar
results_reg = get_error_score_masked_regression(
    df, df_raw, columns, reg_model, valid_mask, apply_scaling=True
)

save_subset_results(results_reg, "Gradient Boosting (Subset)", "results_regression_subset.json")

Regression (GradientBoostingRegressor) Subset Evaluation running...
✅ Gradient Boosting (Subset) Kaydedildi. RMSE: 1.4671
