code added

CAHLR · Jul 16, 2018 · d9a28f5 · d9a28f5
1 parent aa009d3
commit d9a28f5
Show file tree

Hide file tree

Showing 19 changed files with 2,727 additions and 2 deletions.
diff --git a/AFM/afm_keras.py b/AFM/afm_keras.py
@@ -0,0 +1,67 @@
+from keras.models import Model, Sequential
+from keras.layers import Input, Dense
+from keras.callbacks import EarlyStopping
+from sklearn.metrics import mean_squared_error
+from math import sqrt
+import numpy as np
+np.random.seed(29)
+"""
+Input: X_train 2-D array
+       Y_train 1-D array
+"""
+
+class AFMK:
+    """Logistic-regression AFM trained with Keras as a single sigmoid unit.
+
+    ``fit`` trains the model and scores it with AIC/BIC; ``predict`` reports
+    the mean per-group RMSE of the predicted probabilities.
+    """
+
+    def __init__(self):
+        # Hyper-parameters forwarded unchanged to ``model.fit``.
+        self.batch_size = 64
+        self.epochs = 500
+        self.validation_split = 0.2
+
+    def fit(self, X_train, Y_train):
+        """Train the model and compute its information criteria.
+
+        Args:
+            X_train: 2-D array-like of features, one row per response.
+            Y_train: 1-D array-like of targets.
+
+        Returns:
+            Tuple ``(model, AIC, BIC)``. Both criteria are derived from a
+            Gaussian log-likelihood of the residuals on the training data.
+        """
+        # Convert up front: Keras reads a plain Python list as "one array per
+        # model input", so nested lists would be rejected by ``model.fit``.
+        X_train = np.asarray(X_train, dtype=float)
+        Y_train = np.asarray(Y_train, dtype=float)
+
+        model = Sequential()
+        model.add(Dense(1, input_dim=X_train.shape[1], activation='sigmoid'))
+        model.compile(optimizer='rmsprop',
+                      loss='binary_crossentropy',
+                      metrics=['accuracy'])
+
+        earlyStopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='auto')
+        model.fit(X_train, Y_train, verbose=0, batch_size=self.batch_size,
+                  epochs=self.epochs, callbacks=[earlyStopping],
+                  validation_split=self.validation_split, shuffle=True)
+
+        # The single output unit is column 0 of the prediction matrix.
+        acc_y = model.predict(X_train)[:, 0].astype(float)
+        SSR = float(np.sum((Y_train.ravel() - acc_y) ** 2))
+        N = len(Y_train)
+        s2 = SSR / float(N)
+        # Gaussian log-likelihood with variance estimate s2.
+        L = N * np.log(1.0 / np.sqrt(2 * np.pi * s2)) - (1.0 / (2 * s2)) * SSR
+        n_params = model.count_params()
+        AIC = 2 * n_params - 2 * L
+        BIC = n_params * np.log(N) - 2 * L
+        return model, AIC, BIC
+
+    def predict(self, X_test, Y_test, model, d_t):
+        """Return the mean per-group RMSE of ``model`` on the test data.
+
+        Args:
+            X_test: 2-D array-like of features.
+            Y_test: 1-D array-like of targets aligned with ``X_test``.
+            model: fitted model as returned by ``fit``.
+            d_t: dict mapping a group key to the list of row positions in
+                ``X_test``/``Y_test`` that belong to that group.
+        """
+        acc_y = model.predict(np.asarray(X_test, dtype=float))[:, 0].astype(float)
+        return self.rmse_avg(model, acc_y, d_t, Y_test)
+
+    def rmse_avg(self, model, acc_y, d_t, Y_test):
+        """Average the RMSE computed separately for every non-empty group.
+
+        ``model`` is unused; it is kept so the signature stays unchanged.
+        """
+        # Fancy indexing with a list of positions needs ndarrays, not lists.
+        Y_test = np.asarray(Y_test)
+        acc_y = np.asarray(acc_y)
+        rmse = [
+            sqrt(mean_squared_error(Y_test[rows], acc_y[rows]))
+            for rows in d_t.values()
+            if len(rows) > 0
+        ]
+        return np.mean(rmse)
+
+if __name__ == "__main__":
+
+    # Smoke test on a tiny hand-written dataset. Arrays are used rather than
+    # nested lists because Keras reads a Python list as one array per input.
+    x = np.array([[1,0,0,1], [1,0,0,0],[0,1,1,0], [1,0,1,0]])
+    y = np.array([0, 1, 0, 1])
+    obj = AFMK()
+    m = obj.fit(x,y)
+    print ("Model Fitted")
diff --git a/AFM/afm_liblinear.py b/AFM/afm_liblinear.py
@@ -0,0 +1,53 @@
+try:
+    from AFM.liblinear.python.liblinearutil import *
+except Exception:
+    # Best-effort import: the module stays importable when liblinear is not
+    # available, but ``train``/``problem``/``predict`` are then undefined and
+    # AFML.fit / AFML.predict raise NameError when called.
+    pass
+
+import numpy as np
+from sklearn.metrics import mean_squared_error
+from math import sqrt
+"""
+X_train: 2-D array
+Y_train: 1-D array
+"""
+class AFML:
+    """Logistic-regression AFM trained with liblinear.
+
+    ``fit`` trains the model and scores it with AIC/BIC; ``predict`` reports
+    the mean per-group RMSE of the predicted probabilities.
+    """
+
+    def fit(self, X_train, Y_train):
+        """Train the model and compute its information criteria.
+
+        Args:
+            X_train: 2-D array-like of features, one row per response.
+            Y_train: 1-D array-like of targets.
+
+        Returns:
+            Tuple ``(m, AIC, BIC)``. Both criteria are derived from a
+            Gaussian log-likelihood of the residuals on the training data.
+        """
+        m = train(problem(Y_train, X_train), '-s 6')
+        labels, p_acc, y_val = predict(Y_train, X_train, m, '-b 1')
+        # Column of the per-label estimates that corresponds to label 1.
+        index_one = list(m.get_labels()).index(1)
+        acc_y = np.array([row[index_one] for row in y_val], dtype=float)
+        # Sum (not mean) of squared residuals: s2 below already divides by N.
+        SSR = float(np.sum((np.asarray(Y_train, dtype=float) - acc_y) ** 2))
+        N = len(Y_train)
+        s2 = SSR / float(N)
+        # Gaussian log-likelihood with variance estimate s2.
+        L = N * np.log(1.0 / np.sqrt(2 * np.pi * s2)) - (1.0 / (2 * s2)) * SSR
+        # One weight per feature column plus one extra parameter.
+        n_params = 1 + len(X_train[0])
+        AIC = 2 * n_params - 2 * L
+        BIC = n_params * np.log(N) - 2 * L
+        return m, AIC, BIC
+
+    def predict(self, X_test, Y_test, m, d_t):
+        """Return the mean per-group RMSE of model ``m`` on the test data.
+
+        Args:
+            X_test: 2-D array-like of features.
+            Y_test: 1-D array-like of targets aligned with ``X_test``.
+            m: fitted model as returned by ``fit``.
+            d_t: dict mapping a group key to the list of row positions in
+                ``X_test``/``Y_test`` that belong to that group.
+        """
+        labels, p_acc, y_val = predict(Y_test, X_test, m, '-b 1')
+        index_one = list(m.get_labels()).index(1)
+        acc_y = np.array([row[index_one] for row in y_val], dtype=float)
+        return self.rmse_avg(m, acc_y, d_t, Y_test)
+
+    def rmse_avg(self, model, acc_y, d_t, Y_test):
+        """Average the RMSE computed separately for every non-empty group.
+
+        ``model`` is unused; it is kept so the signature stays unchanged.
+        """
+        # Fancy indexing with a list of positions needs ndarrays, not lists.
+        Y_test = np.asarray(Y_test)
+        acc_y = np.asarray(acc_y)
+        rmse = [
+            sqrt(mean_squared_error(Y_test[rows], acc_y[rows]))
+            for rows in d_t.values()
+            if len(rows) > 0
+        ]
+        return np.mean(rmse)
+
+if __name__ == "__main__":
+
+    # Smoke test on a tiny hand-written dataset.
+    x = [[1,0,0,1], [1,0,0,0],[0,1,1,0], [1,0,1,0]]
+    y = [0, 1, 0, 1]
+    # This file defines AFML; AFMK is not defined here.
+    obj = AFML()
+    m = obj.fit(x,y)
+    print ("Model Trained")
diff --git a/AFM/liblinear b/AFM/liblinear
diff --git a/AFM/load_data.py b/AFM/load_data.py
@@ -0,0 +1,149 @@
+"""
+Main function to be called for the prediction :-
+
+        train_predict using the given four below inputs.
+
+Modules used :-
+
+        __init__ :-
+                new_path: variable that stores current directory
+
+        read_load :-
+                f(): function used for the data
+
+Functions :-
+
+        one_hot(data) :- takes data as input and returns the one hot representation
+                         using student, skill, opportunity and correctness of that
+                         response.
+                         also returns the dictionary of userids and their index to
+                         separate test and training data from the data matrix.
+
+        train_afm() :- trains the logistic regression model on liblinear and returns the model.
+
+                inputs :-
+                        X_train :- training data in one hot representation
+                        Y_train :- target of the training set
+                        model_disc - if model is saved on disc
+                                     (False / name of model)
+                        model_save - if want to save the model
+                                     (False / name of model to save)
+                output :-
+                        prints and returns the accuracy on training and testing data
+
+        load_data() :- loads the training and test set in the one hot representation format using
+                        one hot function in this module.
+
+                inputs :-
+                        dtype - (train/test/both) which data you want
+                        data - data in terms of dataframe if different from original data.
+                        utype - ("" / "sub") user type on which model should be trained and tested
+
+                output :-
+                        return training rows, testing rows and the pandas dataframe as the train and test set of logistic regression
+
+        save_data() :-
+                        l, l1, X, fname, utype=""
+                        saves the training and testing data in the Data/ directory with given fname
+
+        read_data() :-  fname, utype
+                        reads and returns the training and testing data in the Data/ directory with given fname
+
+        "make" should be executed in liblinear/python directory
+        liblinear should be in "__init__new_path + Data" directory
+
+"""
+
+import random
+import pandas as pd
+import numpy as np
+from collections import defaultdict
+import h5py
+import sys
+import os
+
+from datetime import datetime
+from dateutil.relativedelta import relativedelta
+
+def one_hot(data, d_u):
+    """Encode responses as skill/opportunity matrices split into train and test.
+
+    Args:
+        data: table with columns "skill_name", "user_id", "correct" and
+            "Opportunity". Several skills (and their opportunity counts) in
+            one row are separated by "~~".
+        d_u: dict mapping every user id in ``data`` to "train" or "test".
+
+    Returns:
+        Tuple ``(X_train, X_test)`` where
+        ``X_train = [skills, opportunities, correct]`` for the train rows and
+        ``X_test = [(d_t, s_t), skills, opportunities, correct]`` for the
+        test rows. ``d_t`` maps each test user to positions within the test
+        subset; ``s_t`` maps each skill to row numbers in ``data``.
+    """
+    skill_train = list(data["skill_name"])
+    user_ids = list(data["user_id"])
+    opportunity = list(data["Opportunity"])
+    n_rows = len(data)
+
+    # One matrix column per distinct skill, in sorted order.
+    total_skills = sorted({name for skill in skill_train for name in skill.split('~~')})
+    d = {name: col for col, name in enumerate(total_skills)}
+
+    skill_onehot = np.zeros([n_rows, len(total_skills)])
+    opportunity_onehot = np.zeros([n_rows, len(total_skills)])
+    Y = np.reshape(np.array(list(data["correct"])), (n_rows, 1))
+
+    d_t = {user: [] for user, split in d_u.items() if split == "test"}
+    s_t = {name: [] for name in total_skills}
+    l, l1 = [], []
+
+    for row, (skill, opp, user) in enumerate(zip(skill_train, opportunity, user_ids)):
+        for multi_skill, op in zip(skill.split('~~'), str(opp).split('~~')):
+            col = d[multi_skill]
+            skill_onehot[row][col] = 1
+            opportunity_onehot[row][col] = int(op)
+            s_t[multi_skill].append(row)
+        split = d_u[user]
+        if split == "train":
+            l.append(row)
+        elif split == "test":
+            # Position inside the test subset, i.e. the row index of the
+            # matrices returned in X_test, not the row number in ``data``.
+            d_t[user].append(len(l1))
+            l1.append(row)
+
+    X_train = [skill_onehot[l, :], opportunity_onehot[l, :], Y[l, 0]]
+    X_test = [(d_t, s_t), skill_onehot[l1, :], opportunity_onehot[l1, :], Y[l1, 0]]
+    return X_train, X_test
+
+def load_data(data, user_train, user_test):
+    """Restrict ``data`` to the requested users and encode it with ``one_hot``.
+
+    Args:
+        data: table with a "user_id" column (plus the columns ``one_hot``
+            reads).
+        user_train: iterable of user ids to use for training.
+        user_test: iterable of user ids to use for testing; a user listed in
+            both ends up labelled "test".
+
+    Returns:
+        The ``(X_train, X_test)`` pair produced by ``one_hot``.
+    """
+    users = set(data["user_id"])
+    # Keep only the requested users that actually occur in the data.
+    user_train = sorted(users & set(user_train))
+    user_test = sorted(users & set(user_test))
+    print (len(user_test), len(user_train), len(users))
+
+    d_u = dict.fromkeys(user_train, 'train')
+    d_u.update(dict.fromkeys(user_test, 'test'))
+
+    selected = data[data["user_id"].isin(user_train + user_test)]
+    return one_hot(selected, d_u)
+
+def save_hd5f(fname, dname, data):
+    """Write ``data`` as dataset ``dname`` into a new HDF5 file ``fname``.
+
+    The file is opened in 'w' mode, so an existing file is overwritten.
+    """
+    print ("HDF5 Saving Started")
+    # The context manager closes the file even if create_dataset raises.
+    with h5py.File(fname, 'w') as h5f:
+        h5f.create_dataset(dname, data=data)
+    print ("HDF5 Saving Done")
+
+def save_data(X_train, X_test, fname, utype=""):
+    """Store the train and test sets as HDF files under ``Saved/Model/afm/``.
+
+    Files are named ``<fname>.<utype>train`` and ``<fname>.<utype>test`` and
+    written with keys "train" and "test" respectively.
+    """
+    # ``__init__`` is never imported at module level, so referencing it raised
+    # NameError; import the project module that provides ``new_path`` here.
+    import __init__
+
+    # NOTE(review): read_data() reads from "Data/afm/" while this writes to
+    # "Saved/Model/afm/" - confirm which directory is intended.
+    base = __init__.new_path + "Saved/Model/afm/" + fname + "." + utype
+    pd.DataFrame(X_train).to_hdf(base + "train", "train")
+    pd.DataFrame(X_test).to_hdf(base + "test", "test")
+
+def read_data(fname, utype):
+    """Load the train and test sets stored as HDF files under ``Data/afm/``.
+
+    Reads ``<fname>.<utype>train`` (key "train") and ``<fname>.<utype>test``
+    (key "test") and returns both as NumPy arrays ``(X_train, X_test)``.
+    """
+    # ``__init__`` is never imported at module level, so referencing it raised
+    # NameError; import the project module that provides ``new_path`` here.
+    import __init__
+
+    base = __init__.new_path + "Data/afm/" + fname + "." + utype
+    X_train = pd.read_hdf(base + "train", "train")
+    X_test = pd.read_hdf(base + "test", "test")
+    # DataFrame.as_matrix() was removed in pandas 1.0; .values yields the
+    # same ndarray and exists in old and new pandas alike.
+    return X_train.values, X_test.values
diff --git a/DAFM/__pycache__/dafm.cpython-35.pyc b/DAFM/__pycache__/dafm.cpython-35.pyc
diff --git a/DAFM/__pycache__/load_data.cpython-35.pyc b/DAFM/__pycache__/load_data.cpython-35.pyc