In [1]:
import warnings
warnings.simplefilter('ignore')
import json
import pandas as pd
import numpy as np

In [3]:
# Start the JVM used by the weka bindings. Called with no arguments, so (per the
# DEBUG log) the max heap size is the default and Weka package support stays disabled.
import weka.core.jvm as jvm
jvm.start()

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\arpack_combined.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\core.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\mtj.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


In [5]:
# weka imports
from weka.core.converters import Loader
from weka.core.dataset import Instances, Attribute, Instance
import weka.core.dataset as weka_dataset
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter
import weka.core.serialization as serialization

In [7]:
# Load the processed SEHIR course/grade records (path is relative to the notebook).
# No index_col is given, so the CSV's unnamed first column is loaded as 'Unnamed: 0'.
df = pd.read_csv('../datasets/SEHIR/processed_dataset.csv')
df

Unnamed: 0,Course Code,Course Title,Student Number,Department Code,Course Level,Letter Grade,Status,GPA,Standing,Completed Credits,Completed ECTS,GPA Student - Subject,Avg. Grade - Taken,Avg. Grade - Students_Subject,Semester,Theoritical,Practical,Course Credit,ECTS,Course Year
0,UNI 111,Critical Reading & Writing in Turkish I,240,SOC,Undergraduate,F,Unsuccessful,2.62,Freshman,18,30,2.616667,2.113636,2.703226,2011 - Fall,3,0,3,5,1
1,UNI 107,World Civilizations& Global Encounters I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,2.986364,2.703226,2011 - Fall,3,0,3,5,1
2,UNI 105,Understanding Society and Culture I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,3.211538,2.703226,2011 - Fall,3,0,3,5,1
3,UNI 103,Understanding Science and Technology,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,3.176000,2.703226,2011 - Fall,3,0,3,5,1
4,UNI 105,Understanding Society and Culture I,240,SOC,Undergraduate,A,Successful,2.62,Freshman,18,30,2.616667,3.211538,2.703226,2011 - Fall,3,0,3,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48736,MGT 531,Stratejik Yönetim,1102,MBA/NT,Graduate,A-,Successful,3.80,Freshman,24,40,3.800000,3.391758,3.049526,2014 - Spring,3,0,3,5,5
48737,MGT 585,Operasyon Yönetimi,1102,MBA/NT,Graduate,A,Successful,3.80,Freshman,24,40,3.800000,3.440476,3.049526,2014 - Spring,3,0,3,5,5
48738,MGT 552,İnsan Kaynakları Yönetimi,1102,MBA/NT,Graduate,A,Successful,3.80,Freshman,24,40,3.800000,3.393077,3.049526,2014 - Spring,3,0,3,5,5
48739,MGT 574,Makroekonomik Göster. ve Pol. Anal.,1984,MBA/NT,Graduate,A-,Successful,3.20,Sophomore,24,40,3.200000,3.570769,3.049526,2014 - Spring,3,0,3,5,5


In [9]:
# Snapshot of the frame exactly as loaded, taken before the cells below modify `df`.
df_raw = df.copy()

In [11]:
# Drop course details (first 3 columns: 'Unnamed: 0', 'Course Code', 'Course Title').
# Built from the untouched df_raw instead of dropping from df in place: a positional
# in-place drop removes three *more* columns every time this cell is re-run.
df = df_raw.drop(columns=list(df_raw.columns[:3]))

In [13]:
# Keep the target column aside; it is re-attached to the encoded frame further down.
letter_grades = df['Letter Grade'].copy()

In [15]:
# Keep the semester labels aside before encoding; they are re-attached afterwards and
# later used to select training/test rows by semester.
semester_col = df['Semester'].copy()

In [17]:
# Feature frame for encoding: everything except the target ('Letter Grade') and the
# split key ('Semester'), so neither of them gets one-hot encoded.
df_temp = df.drop(columns=['Letter Grade', 'Semester'])

In [19]:
# One-hot encode categorical variables; each dummy column is named '<column>_<value>'.
# NOTE(review): 'Status' (Successful/Unsuccessful) looks like it is derived from the
# letter grade being predicted - confirm it is not leaking the target into the features.
categorical_cols = ['Course Year', 'Department Code', 'Course Level', 'Standing', 'Status']
dummy_frames = [pd.get_dummies(df[col], prefix=col) for col in categorical_cols]
df_encoded = pd.concat([df_temp, *dummy_frames], axis=1)

In [21]:
# Remove the raw categorical columns now that their dummy columns are in place
df_encoded.drop(
    columns=['Course Year', 'Department Code', 'Course Level', 'Status', 'Standing'],
    inplace=True,
)

In [23]:
# Re-attach the target and the split key as the last two columns
# ('Letter Grade' first, then 'Semester').
df_encoded = df_encoded.assign(**{
    'Letter Grade': letter_grades,
    'Semester': semester_col,
})

In [25]:
# From here on `df` is the encoded frame (same object as df_encoded, not a copy).
df = df_encoded

In [27]:
# Column index of the encoded frame. NOTE(review): no later cell in this notebook reads
# `columns` (get_error_score uses df.columns directly) - candidate for removal.
columns = df.columns

In [29]:
# Grade labels in the order used for the nominal class attribute; a label's position
# in this list is its numeric code.
grade_order = 'A+ A A- B+ B B- C+ C C- D+ D D- F'.split()
numeric_to_grade = dict(enumerate(grade_order))
grade_to_numeric = {grade: idx for idx, grade in numeric_to_grade.items()}

In [31]:
def create_weka_attributes(df_columns, class_attribute="Letter Grade", use_numeric_for_binary=False):
    """
    Build the list of Weka attributes (the dataset schema) for the given column names.

    Parameters:
    - df_columns: iterable of column names, in the order the attributes should appear.
                  A column named 'Semester' is skipped.
    - class_attribute: name of the target column; it becomes a nominal attribute whose
                       labels are `grade_order`.
    - use_numeric_for_binary: If True, one-hot encoded columns become numeric (for NaiveBayesMultinomial)
                              If False, they stay nominal with labels '0'/'1' (for regular NaiveBayes)

    Returns:
    - list of Weka attributes, one per kept column, in input order. The class attribute
      is not moved, so callers that rely on class_is_last() must pass it as the last column.
    """
    # Prefixes produced by the one-hot encoding step. They are matched at the START of
    # the name only, so an unrelated column that merely contains one of these strings
    # is not mistaken for a dummy column.
    one_hot_prefixes = ('Course Year_', 'Department Code_', 'Course Level_',
                        'Standing_', 'Status_')
    attributes = []

    for col in df_columns:
        if col == class_attribute:
            att = weka_dataset.Attribute.create_nominal(col, grade_order)
        elif col == 'Semester':
            continue
        elif (isinstance(col, str) and col.startswith(one_hot_prefixes)
              and not use_numeric_for_binary):
            # For regular NaiveBayes: binary nominal attribute
            att = weka_dataset.Attribute.create_nominal(col, ['0', '1'])
        else:
            # Plain numeric feature, or a one-hot column kept numeric for
            # NaiveBayesMultinomial
            att = weka_dataset.Attribute.create_numeric(col)

        attributes.append(att)

    return attributes

In [33]:
def pandas_to_weka_with_schema(df, attributes, dataset_name="dataset", class_attribute="Letter Grade"):
    """
    Convert pandas DataFrame to Weka Instances using predefined attributes.

    Parameters:
    - df: source DataFrame; columns are matched to attributes by name.
    - attributes: list of Weka attributes defining the schema (and value order) of the result.
    - dataset_name: name passed to Instances.create_instances.
    - class_attribute: name of the nominal target column holding grade strings.

    Returns:
    - Weka Instances with one instance per DataFrame row. The class index is not set here.

    Encoding rules:
    - attribute with no matching column -> 0 (nominal) / 0.0 (numeric)
    - class attribute -> label index of the stripped grade string; grades outside
      `grade_order` are reported with a printed warning and encoded as 'F'
    - other nominal attributes -> label index of the value (numbers are rendered as
      integer strings such as '0'/'1'); labels the attribute does not know map to index 0
    - numeric attributes -> float(value)
    """
    dataset = weka_dataset.Instances.create_instances(dataset_name, attributes, len(df))

    # Resolve name, type and source position of every attribute once instead of
    # querying the attribute object again for every row. The trailing dict caches
    # label -> index lookups per attribute.
    column_pos = {name: pos for pos, name in enumerate(df.columns)}
    plan = []
    for att in attributes:
        att_name = att.name
        plan.append((att, att.is_nominal, att_name == class_attribute,
                     column_pos.get(att_name), {}))

    # NumPy scalars are listed explicitly: np.bool_ and np.integer are not subclasses
    # of int, so a plain (int, float) check would render True as the label 'True',
    # which the attribute does not know and which would silently become index 0.
    numeric_types = (int, float, np.integer, np.floating, np.bool_)

    # itertuples avoids building a Series per row and keeps each column's own dtype.
    for row in df.itertuples(index=False, name=None):
        values = []
        for att, is_nominal, is_class, pos, cache in plan:
            if pos is None:
                # Column absent from this frame: neutral default
                values.append(0 if is_nominal else 0.0)
                continue

            val = row[pos]

            if not is_nominal:
                values.append(float(val))
                continue

            if is_class:
                # For class attribute, use the actual grade string
                label = str(val).strip()
                if label not in grade_order:
                    print(f"Warning: Unknown grade '{label}', defaulting to F")
                    label = 'F'
            else:
                # For one-hot encoded or other nominal
                label = str(int(val)) if isinstance(val, numeric_types) else str(val)

            idx_val = cache.get(label)
            if idx_val is None:
                idx_val = cache[label] = att.index_of(label)

            if not is_class and idx_val == -1:
                # Unknown label on a feature attribute falls back to the first label
                idx_val = 0
            values.append(idx_val)

        dataset.add_instance(weka_dataset.Instance.create_instance(values))

    return dataset

In [35]:
def get_train_data(df, train_sem):
    """
    Collect the rows of `df` that belong to the given semesters.

    Rows are grouped in the order the semesters are listed in `train_sem` (not in their
    original order in `df`) and the result gets a fresh 0..n-1 index.

    Parameters:
    - df: DataFrame with a 'Semester' column.
    - train_sem: iterable of semester labels to keep.

    Returns:
    - DataFrame with the matching rows. When `train_sem` is empty the result is a
      zero-row frame that still carries df's columns, so schema-dependent callers
      (e.g. `.drop('Semester', axis=1)`) keep working.
    """
    selected = [df[df['Semester'] == sem] for sem in train_sem]

    if not selected:
        # pd.concat raises on an empty list; return an empty slice with the same columns
        return df.iloc[0:0].reset_index(drop=True)

    return pd.concat(selected, ignore_index=True)

In [37]:
def _collect_predictions(classifier, data):
    """Return (actual, predicted) class-index arrays for every instance of a Weka dataset."""
    class_index = data.class_index
    actual = []
    predicted = []
    for i in range(data.num_instances):
        inst = data.get_instance(i)
        actual.append(int(inst.get_value(class_index)))
        predicted.append(int(classifier.classify_instance(inst)))
    return np.array(actual), np.array(predicted)


def _rmse_mae(actual, predicted):
    """Return (RMSE, MAE) between two class-index arrays, each rounded to 3 decimals."""
    diff = actual - predicted
    rmse = round(np.sqrt(np.mean(diff ** 2)), 3)
    mae = round(np.mean(np.abs(diff)), 3)
    return rmse, mae


def get_error_score(df, classifier_name):
    """
    Evaluate a Weka classifier with an expanding window over semesters.

    For every split k >= 1 the classifier is trained on the first k semesters and
    tested on semester k+1. Errors are computed on class indices (positions in
    `grade_order`).

    Parameters:
    - df: encoded DataFrame containing the features, 'Letter Grade' and 'Semester'.
    - classifier_name: Weka class name, e.g. "weka.classifiers.bayes.NaiveBayes". If it
                       contains 'NaiveBayesMultinomial', one-hot columns are built as
                       numeric attributes instead of binary nominal ones.

    Returns:
    - dict {split_index: {'rmse': [train, test], 'mae': [train, test]}}
    """
    class_attribute = "Letter Grade"
    error_scores = {}

    # NOTE(review): semesters are ordered by plain string sort of their labels
    # (e.g. '2011 - Fall'); confirm this matches chronological order.
    sorted_semesters = sorted(set(df['Semester']))

    # Detect if we're using NaiveBayesMultinomial
    is_multinomial = 'NaiveBayesMultinomial' in classifier_name

    # All columns except Semester, with the class column forced to the end so that
    # class_is_last() below always selects it, whatever the column order of `df`.
    feature_columns = [col for col in df.columns
                       if col not in ('Semester', class_attribute)]
    all_columns = feature_columns + [class_attribute]

    # Create attributes with appropriate types based on classifier
    attributes = create_weka_attributes(all_columns, class_attribute,
                                        use_numeric_for_binary=is_multinomial)

    for sem_idx in range(1, len(sorted_semesters)):
        training_sem = sorted_semesters[:sem_idx]
        test_sem = sorted_semesters[sem_idx]

        # reindex drops 'Semester' and gives both frames the same columns and order
        train_df_for_weka = get_train_data(df, training_sem).reindex(
            columns=all_columns, fill_value=0)
        test_df_for_weka = df[df['Semester'] == test_sem].reindex(
            columns=all_columns, fill_value=0)

        # Convert to Weka format
        train_data = pandas_to_weka_with_schema(train_df_for_weka, attributes,
                                                "train_data", class_attribute)
        test_data = pandas_to_weka_with_schema(test_df_for_weka, attributes,
                                               "test_data", class_attribute)

        # Set class index
        train_data.class_is_last()
        test_data.class_is_last()

        # Build classifier
        classifier = Classifier(classname=classifier_name)
        classifier.build_classifier(train_data)

        # Score the model on the data it was fitted on and on the held-out semester
        rmse_train, mae_train = _rmse_mae(*_collect_predictions(classifier, train_data))
        rmse_test, mae_test = _rmse_mae(*_collect_predictions(classifier, test_data))

        print("\nResults:")
        print(f"Train RMSE: {rmse_train}, Train MAE: {mae_train}")
        print(f"Test RMSE: {rmse_test}, Test MAE: {mae_test}")

        error_scores[sem_idx] = {
            'rmse': [rmse_train, rmse_test],
            'mae': [mae_train, mae_test],
        }

    return error_scores

In [39]:
# Expanding-window evaluation with Weka's NaiveBayes; get_error_score prints one
# "Results" block per test semester and returns the scores keyed by split index.
print("Running Naive Bayes Classifier")
error_scores_gaussian = get_error_score(df, "weka.classifiers.bayes.NaiveBayes")

Running Naive Bayes Classifier

Results:
Train RMSE: 2.109, Train MAE: 1.439
Test RMSE: 2.248, Test MAE: 1.597

Results:
Train RMSE: 1.942, Train MAE: 1.354
Test RMSE: 2.186, Test MAE: 1.579

Results:
Train RMSE: 1.97, Train MAE: 1.381
Test RMSE: 2.227, Test MAE: 1.595

Results:
Train RMSE: 2.038, Train MAE: 1.433
Test RMSE: 2.24, Test MAE: 1.608

Results:
Train RMSE: 2.038, Train MAE: 1.427
Test RMSE: 2.353, Test MAE: 1.661

Results:
Train RMSE: 2.064, Train MAE: 1.445
Test RMSE: 2.276, Test MAE: 1.623

Results:
Train RMSE: 2.054, Train MAE: 1.442
Test RMSE: 2.346, Test MAE: 1.696


In [41]:
# Collect per-model scores under a display name; the dict is shown as the cell output
model_results = {'GaussianNB': error_scores_gaussian}
model_results

{'GaussianNB': {1: {'rmse': [2.109, 2.248], 'mae': [1.439, 1.597]},
  2: {'rmse': [1.942, 2.186], 'mae': [1.354, 1.579]},
  3: {'rmse': [1.97, 2.227], 'mae': [1.381, 1.595]},
  4: {'rmse': [2.038, 2.24], 'mae': [1.433, 1.608]},
  5: {'rmse': [2.038, 2.353], 'mae': [1.427, 1.661]},
  6: {'rmse': [2.064, 2.276], 'mae': [1.445, 1.623]},
  7: {'rmse': [2.054, 2.346], 'mae': [1.442, 1.696]}}}

In [43]:
# Same evaluation with NaiveBayesMultinomial. Because the class name contains
# 'NaiveBayesMultinomial', get_error_score builds the one-hot columns as numeric attributes.
print("Running Multinomial Naive Bayes Classifier")
error_scores_multinomial = get_error_score(df, "weka.classifiers.bayes.NaiveBayesMultinomial")

Running Multinomial Naive Bayes Classifier

Results:
Train RMSE: 2.514, Train MAE: 1.686
Test RMSE: 3.284, Test MAE: 2.38

Results:
Train RMSE: 2.561, Train MAE: 1.773
Test RMSE: 3.17, Test MAE: 2.246

Results:
Train RMSE: 2.716, Train MAE: 1.907
Test RMSE: 3.329, Test MAE: 2.367

Results:
Train RMSE: 2.745, Train MAE: 1.962
Test RMSE: 3.259, Test MAE: 2.36

Results:
Train RMSE: 2.926, Train MAE: 2.09
Test RMSE: 3.457, Test MAE: 2.499

Results:
Train RMSE: 3.007, Train MAE: 2.172
Test RMSE: 3.421, Test MAE: 2.566

Results:
Train RMSE: 2.912, Train MAE: 2.101
Test RMSE: 3.404, Test MAE: 2.534


In [45]:
# Add the multinomial scores next to the earlier entry and display the combined dict
model_results['MultinomialNB'] = error_scores_multinomial
model_results

{'GaussianNB': {1: {'rmse': [2.109, 2.248], 'mae': [1.439, 1.597]},
  2: {'rmse': [1.942, 2.186], 'mae': [1.354, 1.579]},
  3: {'rmse': [1.97, 2.227], 'mae': [1.381, 1.595]},
  4: {'rmse': [2.038, 2.24], 'mae': [1.433, 1.608]},
  5: {'rmse': [2.038, 2.353], 'mae': [1.427, 1.661]},
  6: {'rmse': [2.064, 2.276], 'mae': [1.445, 1.623]},
  7: {'rmse': [2.054, 2.346], 'mae': [1.442, 1.696]}},
 'MultinomialNB': {1: {'rmse': [2.514, 3.284], 'mae': [1.686, 2.38]},
  2: {'rmse': [2.561, 3.17], 'mae': [1.773, 2.246]},
  3: {'rmse': [2.716, 3.329], 'mae': [1.907, 2.367]},
  4: {'rmse': [2.745, 3.259], 'mae': [1.962, 2.36]},
  5: {'rmse': [2.926, 3.457], 'mae': [2.09, 2.499]},
  6: {'rmse': [3.007, 3.421], 'mae': [2.172, 2.566]},
  7: {'rmse': [2.912, 3.404], 'mae': [2.101, 2.534]}}}

In [149]:
# Persist the collected scores next to the notebook. The per-split keys are ints,
# which json.dump writes as strings ("1", "2", ...), so they load back as str.
with open('naive_bayes_results.json', 'w') as fw:
    json.dump(model_results, fw)