In [1]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.special import softmax as scipy_softmax

In [4]:
# start java virtual machine in order to run weka
import weka.core.jvm as jvm
jvm.start(max_heap_size="2g")

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\arpack_combined.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\core.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\mtj.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\python-weka-wrapper.jar', 'C:\\Users\\casperrrr\\anaconda3\\Lib\\site-packages\\weka\\lib\\weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=2g
DEBUG:weka.core.jvm:Package support disabled


In [6]:
# weka imports
from weka.plot.classifiers import Classifier
from weka.core.converters import Loader

In [8]:
df = pd.read_csv('../datasets/SEHIR/processed_dataset.csv')
df

Unnamed: 0,Course Code,Course Title,Student Number,Department Code,Course Level,Letter Grade,Status,GPA,Standing,Completed Credits,Completed ECTS,GPA Student - Subject,Avg. Grade - Taken,Avg. Grade - Students_Subject,Semester,Theoritical,Practical,Course Credit,ECTS,Course Year
0,UNI 111,Critical Reading & Writing in Turkish I,240,SOC,Undergraduate,F,Unsuccessful,2.62,Freshman,18,30,2.616667,2.113636,2.703226,2011 - Fall,3,0,3,5,1
1,UNI 107,World Civilizations& Global Encounters I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,2.986364,2.703226,2011 - Fall,3,0,3,5,1
2,UNI 105,Understanding Society and Culture I,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,3.211538,2.703226,2011 - Fall,3,0,3,5,1
3,UNI 103,Understanding Science and Technology,338,PSY,Undergraduate,A,Successful,3.68,Freshman,18,30,3.683333,3.176000,2.703226,2011 - Fall,3,0,3,5,1
4,UNI 105,Understanding Society and Culture I,240,SOC,Undergraduate,A,Successful,2.62,Freshman,18,30,2.616667,3.211538,2.703226,2011 - Fall,3,0,3,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48736,MGT 531,Stratejik Yönetim,1102,MBA/NT,Graduate,A-,Successful,3.80,Freshman,24,40,3.800000,3.391758,3.049526,2014 - Spring,3,0,3,5,5
48737,MGT 585,Operasyon Yönetimi,1102,MBA/NT,Graduate,A,Successful,3.80,Freshman,24,40,3.800000,3.440476,3.049526,2014 - Spring,3,0,3,5,5
48738,MGT 552,İnsan Kaynakları Yönetimi,1102,MBA/NT,Graduate,A,Successful,3.80,Freshman,24,40,3.800000,3.393077,3.049526,2014 - Spring,3,0,3,5,5
48739,MGT 574,Makroekonomik Göster. ve Pol. Anal.,1984,MBA/NT,Graduate,A-,Successful,3.20,Sophomore,24,40,3.200000,3.570769,3.049526,2014 - Spring,3,0,3,5,5


In [10]:
df_raw = df.copy()

In [12]:
df.drop([df.columns[0], df.columns[1], df.columns[2]], inplace=True, axis=1)   # dropping course details

In [14]:
# explicitly cast categorical features to category
categorical_cols = [
    'Course Year',
    'Department Code',
    'Course Level',
    'Standing',
    'Status'
]

for col in categorical_cols:
    df[col] = df[col].astype('category')

columns = df.columns

In [16]:
le = LabelEncoder()
le.fit(['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F'])

In [18]:
def get_train_data(df, train_sem, columns):
    dataFrame = pd.DataFrame(columns=columns)
    
    for sem in train_sem:
        dataFrame = pd.concat(
            [dataFrame, df[df.iloc[:, 11] == sem]],
            ignore_index=True
        )

    X = dataFrame.drop('Semester', axis=1)
    y = X.pop('Letter Grade')   # keep as categorical

    return X, y

Weka requires arff type, this function is explicitly for weka

In [21]:
def dataframe_to_arff(df, filename, relation_name="dataset"):
    with open(filename, "w") as f:
        f.write(f"@relation {relation_name}\n\n")

        for col in df.columns:
            if df[col].dtype.name == "category" or df[col].dtype == object:
                values = sorted(df[col].dropna().unique())
                values = [str(v).replace(" ", "_") for v in values]
                f.write(f"@attribute '{col}' {{{','.join(values)}}}\n")
            else:
                f.write(f"@attribute '{col}' numeric\n")

        f.write("\n@data\n")

        for _, row in df.iterrows():
            row_vals = []
            for v in row:
                if pd.isna(v):
                    row_vals.append("?")
                else:
                    row_vals.append(str(v).replace(" ", "_"))
            f.write(",".join(row_vals) + "\n")

# Neural Network Weka

In [24]:
def get_error_score_weka(df, columns):
    error_scores = {}
    sorted_semesters = sorted(set(df.iloc[:, 11]))

    for sem_idx in range(1, len(sorted_semesters)):
        train_sem = sorted_semesters[:sem_idx]
        test_sem  = sorted_semesters[sem_idx]
        print(f"train_sem: {train_sem}")
        print(f"test_sem: {test_sem}")

        # train data
        X_train, y_train = get_train_data(df, train_sem, columns)
        train_df = X_train.copy()
        train_df['Letter Grade'] = y_train.astype(str)

        # test data
        test_df = df[df.iloc[:, 11] == test_sem].copy()
        test_df.drop('Semester', axis=1, inplace=True)
        y_test = test_df.pop('Letter Grade')
        test_df['Letter Grade'] = y_test.astype(str)

        # convert to arff
        dataframe_to_arff(train_df, "train.arff", "train")
        dataframe_to_arff(test_df, "test.arff", "test")


        loader = Loader(classname="weka.core.converters.ArffLoader")

        train_data = loader.load_file("train.arff")
        train_data.class_is_last()
        
        test_data = loader.load_file("test.arff")
        test_data.class_is_last()

        
        mlp = Classifier(
            "weka.classifiers.functions.MultilayerPerceptron",
            options=["-L","0.3","-M","0.2","-N","5000","-H","1"]
        )
        
        mlp.build_classifier(train_data)
        
        # predictions
        y_pred_test = []
        for inst in test_data:
            pred_index = int(mlp.classify_instance(inst))
            # Convert index to class label string
            y_pred_test.append(test_data.class_attribute.value(pred_index))
        
        y_pred_train = []
        for inst in train_data:
            pred_index = int(mlp.classify_instance(inst))
            y_pred_train.append(train_data.class_attribute.value(pred_index))


        # Encode strings back if needed (same le used during ARFF preparation)
        y_test_enc = le.transform(y_test)
        y_pred_test_enc = le.transform(y_pred_test)

        y_train_enc = le.transform(y_train)
        y_pred_train_enc = le.transform(y_pred_train)

        rmse_test = round(
            np.sqrt(mean_squared_error(y_test_enc, y_pred_test_enc)), 3
        )
        mae_test = round(
            mean_absolute_error(y_test_enc, y_pred_test_enc), 3
        )

        rmse_train = round(
            np.sqrt(mean_squared_error(y_train_enc, y_pred_train_enc)), 3
        )
        mae_train = round(
            mean_absolute_error(y_train_enc, y_pred_train_enc), 3
        )

        error_scores[sem_idx] = {
            "rmse": [rmse_train, rmse_test],
            "mae":  [mae_train, mae_test]
        }

        print(f"Semester {sem_idx}: RMSE(test)={rmse_test}, MAE(test)={mae_test}")

    return error_scores

In [111]:
error_scores = get_error_score_weka(df, columns)

train_sem: ['2011 - Fall']
test_sem: 2011 - Spring


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 1: RMSE(test)=2.608, MAE(test)=1.941
train_sem: ['2011 - Fall', '2011 - Spring']
test_sem: 2012 - Fall


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 2: RMSE(test)=2.292, MAE(test)=1.668
train_sem: ['2011 - Fall', '2011 - Spring', '2012 - Fall']
test_sem: 2012 - Spring


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 3: RMSE(test)=2.869, MAE(test)=2.164
train_sem: ['2011 - Fall', '2011 - Spring', '2012 - Fall', '2012 - Spring']
test_sem: 2013 - Fall


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 4: RMSE(test)=3.083, MAE(test)=2.298
train_sem: ['2011 - Fall', '2011 - Spring', '2012 - Fall', '2012 - Spring', '2013 - Fall']
test_sem: 2013 - Spring


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 5: RMSE(test)=7.706, MAE(test)=6.743
train_sem: ['2011 - Fall', '2011 - Spring', '2012 - Fall', '2012 - Spring', '2013 - Fall', '2013 - Spring']
test_sem: 2014 - Fall


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 6: RMSE(test)=7.732, MAE(test)=6.942
train_sem: ['2011 - Fall', '2011 - Spring', '2012 - Fall', '2012 - Spring', '2013 - Fall', '2013 - Spring', '2014 - Fall']
test_sem: 2014 - Spring


  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(
  dataFrame = pd.concat(


Semester 7: RMSE(test)=8.165, MAE(test)=7.591


In [113]:
error_scores

{1: {'rmse': [2.246, 2.608], 'mae': [1.596, 1.941]},
 2: {'rmse': [2.087, 2.292], 'mae': [1.536, 1.668]},
 3: {'rmse': [2.672, 2.869], 'mae': [1.991, 2.164]},
 4: {'rmse': [2.791, 3.083], 'mae': [2.071, 2.298]},
 5: {'rmse': [2.178, 7.706], 'mae': [1.598, 6.743]},
 6: {'rmse': [2.271, 7.732], 'mae': [1.611, 6.942]},
 7: {'rmse': [3.644, 8.165], 'mae': [2.773, 7.591]}}

In [117]:
scores = {'NN': error_scores}

In [119]:
with open('nn_results.json', 'w') as fw:
    json.dump(scores, fw)

In [28]:
jvm.stop()