Skip to content

Commit

Permalink
code added
Browse files Browse the repository at this point in the history
  • Loading branch information
anant-dadu committed Jul 16, 2018
1 parent aa009d3 commit d9a28f5
Show file tree
Hide file tree
Showing 19 changed files with 2,727 additions and 2 deletions.
67 changes: 67 additions & 0 deletions AFM/afm_keras.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from keras.models import Model, Sequential
from keras.layers import Input, Dense
from keras.callbacks import EarlyStopping
from sklearn.metrics import mean_squared_error
from math import sqrt
import numpy as np
np.random.seed(29)
"""
Input: X_train 2-D array
Y_train 1-D array
"""

class AFMK:
    """Logistic regression (one sigmoid ``Dense`` unit) trained with Keras.

    Parameters
    ----------
    batch_size : int
        Mini-batch size passed to ``model.fit``.
    epochs : int
        Maximum number of training epochs.
    validation_split : float
        Fraction of the training data that Keras holds out to monitor
        ``val_loss`` for early stopping.
    patience : int
        Number of epochs without ``val_loss`` improvement before training
        is stopped.
    """

    def __init__(self, batch_size=64, epochs=500, validation_split=0.2,
                 patience=5):
        self.batch_size = batch_size
        self.epochs = epochs
        self.validation_split = validation_split
        self.patience = patience

    def fit(self, X_train, Y_train):
        """Train the model and score it with AIC and BIC.

        Parameters
        ----------
        X_train : 2-D array-like, one row per observation.
        Y_train : 1-D array-like of targets.

        Returns
        -------
        (model, AIC, BIC)
            The fitted Keras model and both information criteria, computed
            from a Gaussian log-likelihood of the training residuals.
        """
        # Keras interprets a plain Python list as a list of separate model
        # inputs, so nested lists must be converted to arrays first.
        X_train = np.asarray(X_train)
        Y_train = np.asarray(Y_train)

        model = Sequential()
        model.add(Dense(1, input_dim=X_train.shape[1], activation='sigmoid'))
        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])

        early_stopping = EarlyStopping(monitor='val_loss',
                                       patience=self.patience,
                                       verbose=1, mode='auto')
        model.fit(X_train, Y_train, verbose=0, batch_size=self.batch_size,
                  epochs=self.epochs, callbacks=[early_stopping],
                  validation_split=self.validation_split, shuffle=True)

        # Column 0 is the only output unit of the network.
        acc_y = np.asarray(model.predict(X_train), dtype=float)[:, 0]
        residuals = np.ravel(Y_train) - acc_y
        SSR = float(np.sum(residuals ** 2))
        N = len(Y_train)
        s2 = SSR / float(N)
        # Gaussian log-likelihood with variance estimate s2.
        L = N * np.log(1.0 / np.sqrt(2 * np.pi * s2)) - (1.0 / (2 * s2)) * SSR
        n_params = model.count_params()
        AIC = 2 * n_params - 2 * L
        BIC = n_params * np.log(N) - 2 * L
        return model, AIC, BIC

    def predict(self, X_test, Y_test, model, d_t):
        """Return the RMSE of ``model`` on the test data, averaged over groups.

        ``d_t`` maps a group key to the list of row positions in
        ``X_test`` / ``Y_test`` that belong to that group.
        """
        X_test = np.asarray(X_test)
        acc_y = np.asarray(model.predict(X_test), dtype=float)[:, 0]
        return self.rmse_avg(model, acc_y, d_t, Y_test)

    def rmse_avg(self, model, acc_y, d_t, Y_test):
        """Average the per-group RMSE between ``Y_test`` and ``acc_y``.

        Groups with no rows are skipped.  ``model`` is accepted for
        interface compatibility and is not used.  Returns NaN when every
        group is empty.
        """
        # Fancy indexing with a list of rows needs arrays, not lists.
        Y_test = np.asarray(Y_test)
        acc_y = np.asarray(acc_y)
        rmse = [
            sqrt(mean_squared_error(Y_test[rows], acc_y[rows]))
            for rows in d_t.values()
            if len(rows) != 0
        ]
        if not rmse:
            # Same value np.mean([]) would give, without the RuntimeWarning.
            return float('nan')
        return np.mean(rmse)

if __name__ == "__main__":
    # Smoke test: fit the model on four toy observations.
    # Arrays are used because Keras treats a nested Python list as a list
    # of separate model inputs rather than as one 2-D input.
    x = np.array([[1, 0, 0, 1], [1, 0, 0, 0], [0, 1, 1, 0], [1, 0, 1, 0]])
    y = np.array([0, 1, 0, 1])
    obj = AFMK()
    # fit returns the trained model together with its AIC and BIC.
    m, aic, bic = obj.fit(x, y)
    print ("Model Fitted")
53 changes: 53 additions & 0 deletions AFM/afm_liblinear.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
try:
from AFM.liblinear.python.liblinearutil import *
except:
pass

import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
"""
X_train: 2-D array
Y_train: 1-D array
"""
class AFML:
    """Logistic regression trained with liblinear (solver option ``-s 6``)."""

    def fit(self, X_train, Y_train):
        """Train the model and score it with AIC and BIC.

        Parameters
        ----------
        X_train : 2-D array-like, one row per observation.
        Y_train : 1-D array-like of targets; the label ``1`` must occur,
            otherwise looking up its probability column raises ValueError.

        Returns
        -------
        (m, AIC, BIC)
            The liblinear model and both information criteria, computed
            from a Gaussian log-likelihood of the training residuals with
            ``1 + number of features`` parameters.
        """
        m = train(problem(Y_train, X_train), '-s 6')
        _, _, y_val = predict(Y_train, X_train, m, '-b 1')
        # Probability columns follow the model's label order.
        index_one = list(m.get_labels()).index(1)
        acc_y = np.array([float(row[index_one]) for row in y_val])
        # SSR must be the *sum* of squared residuals; using the mean here
        # would make s2 = SSR / N too small by a factor of N.
        residuals = np.ravel(np.asarray(Y_train, dtype=float)) - acc_y
        SSR = float(np.sum(residuals ** 2))
        N = len(Y_train)
        s2 = SSR / float(N)
        # Gaussian log-likelihood with variance estimate s2.
        L = N * np.log(1.0 / np.sqrt(2 * np.pi * s2)) - (1.0 / (2 * s2)) * SSR
        n_params = 1 + len(X_train[0])
        AIC = 2 * n_params - 2 * L
        BIC = n_params * np.log(N) - 2 * L
        return m, AIC, BIC

    def predict(self, X_test, Y_test, m, d_t):
        """Return the RMSE of model ``m`` on the test data, averaged over groups.

        ``d_t`` maps a group key to the list of row positions in
        ``X_test`` / ``Y_test`` that belong to that group.
        """
        _, _, y_val = predict(Y_test, X_test, m, '-b 1')
        index_one = list(m.get_labels()).index(1)
        acc_y = np.array([float(row[index_one]) for row in y_val])
        return self.rmse_avg(m, acc_y, d_t, Y_test)

    def rmse_avg(self, model, acc_y, d_t, Y_test):
        """Average the per-group RMSE between ``Y_test`` and ``acc_y``.

        Groups with no rows are skipped.  ``model`` is accepted for
        interface compatibility and is not used.  Returns NaN when every
        group is empty.
        """
        # Fancy indexing with a list of rows needs arrays, not lists.
        Y_test = np.asarray(Y_test)
        acc_y = np.asarray(acc_y)
        rmse = [
            sqrt(mean_squared_error(Y_test[rows], acc_y[rows]))
            for rows in d_t.values()
            if len(rows) != 0
        ]
        if not rmse:
            # Same value np.mean([]) would give, without the RuntimeWarning.
            return float('nan')
        return np.mean(rmse)

if __name__ == "__main__":
    # Smoke test: train the liblinear model on four toy observations.
    x = [[1, 0, 0, 1], [1, 0, 0, 0], [0, 1, 1, 0], [1, 0, 1, 0]]
    y = [0, 1, 0, 1]
    # This module defines AFML; AFMK is not defined here.
    obj = AFML()
    # fit returns the trained model together with its AIC and BIC.
    m, aic, bic = obj.fit(x, y)
    print ("Model Trained")
1 change: 1 addition & 0 deletions AFM/liblinear
Submodule liblinear added at 67f274
149 changes: 149 additions & 0 deletions AFM/load_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""
Main function to be called for the prediction :-
train_predict using the given four below inputs.
Modules used :-
__init__ :-
new_path: variable that stores current directory
read_load :-
f(): function used for the data
Functions :-
one_hot(data) :- takes data as input and returns the one hot representation
using student, skill, opportunity and correctness of that
response.
also returns the dictionary of userids and their index to
separate test and training data from the data matrix.
train_afm() :- trains the logistic regression model with liblinear and returns the model.
inputs :-
X_train :- training data in one hot representation
Y_train :- target of the training set
model_disc - if model is saved on disc
(False / name of model)
model_save - if want to save the model
(False / name of model to save)
output :-
prints and returns the accuracy on training and testing data
load_data() :- loads the training and test set in the one hot representation format using
one hot function in this module.
inputs :-
dtype - (train/test/both) which data you want
data - data in terms of dataframe if different from original data.
utype - ("" / "sub") user type on which the model should be trained and tested
output :-
return training rows, testing rows and the pandas dataframe as the train and test set of logistic regression
save_data() :-
l, l1, X, fname, utype=""
saves the training and testing data in the Saved/Model/afm/ directory (under __init__.new_path) with the given fname
read_data() :- fname, utype
reads and returns the training and testing data from the Data/afm/ directory (under __init__.new_path) with the given fname
"make" should be executed in liblinear/python directory
liblinear should be in "__init__new_path + Data" directory
"""

import random
import pandas as pd
import numpy as np
from collections import defaultdict
import h5py
import sys
import os

from datetime import datetime
from dateutil.relativedelta import relativedelta

def one_hot(data, d_u):
    """Encode skills and opportunity counts per row and split by user.

    Parameters
    ----------
    data : table with the columns ``skill_name``, ``user_id``,
        ``Opportunity`` and ``correct``.  A row may carry several skills
        joined by ``~~``; its ``Opportunity`` value then holds the matching
        counts, also joined by ``~~``.
    d_u : dict mapping every user id in ``data`` to ``"train"`` or
        ``"test"``.  A user id missing from ``d_u`` raises KeyError.

    Returns
    -------
    X_train : list ``[skill_onehot, opportunity_onehot, Y]`` restricted to
        the rows of training users.
    X_test : list ``[(d_t, s_t), skill_onehot, opportunity_onehot, Y]``
        restricted to the rows of test users.  ``d_t`` maps each test user
        to that user's row positions *within the test set*; ``s_t`` maps
        each skill to the row positions *within ``data``* where it occurs.
    """
    skill_column = list(data["skill_name"])
    opportunity_column = list(data["Opportunity"])
    user_ids = list(data["user_id"])
    n_rows = len(data)

    # str.split returns the whole string when the delimiter is absent, so
    # single-skill rows need no special case.
    row_skills = [skill.split('~~') for skill in skill_column]
    total_skills = sorted({name for names in row_skills for name in names})
    skill_index = {name: col for col, name in enumerate(total_skills)}

    skill_onehot = np.zeros([n_rows, len(total_skills)])
    opportunity_onehot = np.zeros([n_rows, len(total_skills)])
    Y = np.reshape(np.array(list(data["correct"])), (n_rows, 1))

    d_t = {user: [] for user, split in d_u.items() if split == "test"}
    s_t = {name: [] for name in total_skills}

    train_rows, test_rows = [], []
    for row, (names, opp, user) in enumerate(
            zip(row_skills, opportunity_column, user_ids)):
        # zip stops at the shorter sequence, so surplus skills or counts
        # on a row are ignored.
        for name, count in zip(names, str(opp).split('~~')):
            col = skill_index[name]
            skill_onehot[row][col] = 1
            # Go through float so counts stored as "3.0" are accepted too.
            opportunity_onehot[row][col] = int(float(count))
            # NOTE(review): s_t stores positions in `data`, while d_t stores
            # positions in the test subset - confirm callers expect this.
            s_t[name].append(row)

        split = d_u[user]
        if split == "test":
            # Position of this row inside the test subset.
            d_t[user].append(len(test_rows))
            test_rows.append(row)
        elif split == "train":
            train_rows.append(row)

    X_train = [skill_onehot[train_rows, :],
               opportunity_onehot[train_rows, :],
               Y[train_rows, 0]]
    X_test = [(d_t, s_t),
              skill_onehot[test_rows, :],
              opportunity_onehot[test_rows, :],
              Y[test_rows, 0]]
    return X_train, X_test

def load_data(data, user_train, user_test):
    """Build the train and test encodings for the given user split.

    Only user ids that actually occur in ``data["user_id"]`` are kept.
    A user listed in both ``user_train`` and ``user_test`` is treated as
    a test user.  Prints the number of test users, training users and
    distinct users in ``data``, then returns the ``(X_train, X_test)``
    pair produced by ``one_hot`` on the rows of the selected users.
    """
    known_users = set(data["user_id"])
    train_users = sorted(known_users & set(user_train))
    test_users = sorted(known_users & set(user_test))
    print (len(test_users), len(train_users), len(known_users))

    # Test assignments are applied last so they override "train".
    d_u = dict.fromkeys(train_users, 'train')
    d_u.update(dict.fromkeys(test_users, 'test'))

    selected = data["user_id"].isin(train_users + test_users)
    X_train, X_test = one_hot(data[selected], d_u)
    return X_train, X_test

def save_hd5f(fname, dname, data):
    """Write ``data`` as dataset ``dname`` into a new HDF5 file ``fname``.

    The file is opened in ``'w'`` mode, so an existing file of the same
    name is overwritten.  Progress messages are printed before and after.
    """
    print ("HDF5 Saving Started")
    # The context manager closes the file even if create_dataset raises.
    with h5py.File(fname, 'w') as h5f:
        h5f.create_dataset(dname, data=data)
    print ("HDF5 Saving Done")

def save_data(X_train, X_test, fname, utype=""):
    """Store the training and test sets as HDF files.

    Both sets are wrapped in a ``DataFrame`` and written below
    ``__init__.new_path + "Saved/Model/afm/"`` as ``<fname>.<utype>train``
    (key ``"train"``) and ``<fname>.<utype>test`` (key ``"test"``).
    """
    # NOTE(review): `__init__` is not imported in the visible part of this
    # module - confirm it is in scope before calling this function.
    # NOTE(review): read_data reads from "Data/afm/", not "Saved/Model/afm/"
    # - confirm which directory is intended.
    prefix = __init__.new_path + "Saved/Model/afm/" + fname + "." + utype
    train_frame = pd.DataFrame(X_train)
    train_frame.to_hdf(prefix + "train", "train")
    test_frame = pd.DataFrame(X_test)
    test_frame.to_hdf(prefix + "test", "test")

def read_data(fname, utype=""):
    """Load the training and test sets stored as HDF files.

    Reads ``<fname>.<utype>train`` (key ``"train"``) and
    ``<fname>.<utype>test`` (key ``"test"``) from below
    ``__init__.new_path + "Data/afm/"`` and returns both as NumPy arrays
    in the order ``(train, test)``.
    """
    # NOTE(review): `__init__` is not imported in the visible part of this
    # module - confirm it is in scope before calling this function.
    prefix = __init__.new_path + "Data/afm/" + fname + "." + utype
    X_train = pd.read_hdf(prefix + "train", "train")
    X_test = pd.read_hdf(prefix + "test", "test")
    # DataFrame.as_matrix() was removed from pandas; .values is the
    # equivalent that works on old and new versions alike.
    return X_train.values, X_test.values
Binary file added DAFM/__pycache__/dafm.cpython-35.pyc
Binary file not shown.
Binary file added DAFM/__pycache__/load_data.cpython-35.pyc
Binary file not shown.
Loading

0 comments on commit d9a28f5

Please sign in to comment.