In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as py
import tensorflow as tf
import seaborn as sns
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras import regularizers
from sklearn.model_selection import train_test_split
from collections import defaultdict
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the diabetes dataset into a DataFrame.
# NOTE(review): the path is absolute and machine-specific.
with open('/Users/adityagoyal/Desktop/ggml/diabetes_prediction_dataset.csv') as csv_file:
    df = pd.read_csv(csv_file)

In [3]:
# Replace the smoking_history categories with integer codes (in place on df)
# and print which code each category received.
le_smoker = LabelEncoder()
smoking_codes = le_smoker.fit_transform(df['smoking_history'])
df['smoking_history'] = smoking_codes
smoker_mapping = dict(zip(le_smoker.classes_, le_smoker.transform(le_smoker.classes_)))
print("Label mapping:", smoker_mapping)

Label mapping: {'No Info': np.int64(0), 'current': np.int64(1), 'ever': np.int64(2), 'former': np.int64(3), 'never': np.int64(4), 'not current': np.int64(5)}


In [5]:
# Replace the gender categories with integer codes (in place on df) and print
# which code each category received.
# NOTE(review): the recorded output below shows an identity mapping
# (0 -> 0, 1 -> 1, 2 -> 2), i.e. the cell was last run on an already-encoded
# column; re-run from a freshly loaded df to see the original category names.
le_gender = LabelEncoder()
gender_codes = le_gender.fit_transform(df['gender'])
df['gender'] = gender_codes
gender_mapping = dict(zip(le_gender.classes_, le_gender.transform(le_gender.classes_)))
print("Label mapping:", gender_mapping)

Label mapping: {np.int64(0): np.int64(0), np.int64(1): np.int64(1), np.int64(2): np.int64(2)}


In [6]:
# Feature matrix: every column of df except the last one.
X = df.iloc[:, :-1].to_numpy()

In [7]:
# Target vector: the last column of df as a NumPy array.
y = df.iloc[:,-1].to_numpy()

In [None]:
def split(x,y):
  """Partition the data into training, cross-validation and test subsets.

  70% of the samples go to training; the remaining samples are divided
  evenly between the cross-validation and test sets. Both splits are made
  with ``random_state=1``.

  Returns:
    (x_train, y_train, x_cv, y_cv, x_test, y_test)
  """
  x_train, x_holdout, y_train, y_holdout = train_test_split(
      x, y, train_size=0.7, random_state=1)
  x_cv, x_test, y_cv, y_test = train_test_split(
      x_holdout, y_holdout, train_size=0.5, random_state=1)
  return x_train, y_train, x_cv, y_cv, x_test, y_test

In [None]:

def modelF(x_train,y_train,learningRate, regRate):
  """Build, compile and fit a small dense classifier.

  Architecture: two hidden ReLU layers of 4 units each followed by a 2-unit
  softmax output; every layer's kernel carries an L2 penalty of ``regRate``.
  The model is trained for 50 epochs with Adam at ``learningRate`` and a
  sparse categorical cross-entropy loss.

  Returns:
    (learningRate, regRate, model): the two hyper-parameters echoed back
    together with the fitted model.
  """
  hidden_first = Dense(units=4, activation='relu',
                       kernel_regularizer=regularizers.l2(regRate))
  hidden_second = Dense(units=4, activation='relu',
                        kernel_regularizer=regularizers.l2(regRate))
  output_layer = Dense(units=2, activation='softmax',
                       kernel_regularizer=regularizers.l2(regRate))
  model = Sequential([hidden_first, hidden_second, output_layer])

  adam = tf.keras.optimizers.Adam(learning_rate=learningRate)
  model.compile(optimizer=adam, loss=SparseCategoricalCrossentropy())
  model.fit(x_train, y_train, epochs=50)

  return learningRate, regRate, model

In [None]:
def trainAndCv(model,x_cv,y_cv,x_train,y_train):
  """Predict on the training and cross-validation sets and score both.

  Args:
    model: object whose ``predict(x)`` returns one row of class scores per
      sample (2-D, samples x classes).
    x_cv, y_cv: cross-validation features and integer class labels.
    x_train, y_train: training features and integer class labels.
      Labels may be 1-D ``(n,)`` or column vectors ``(n, 1)``.

  Returns:
    (train_err, cv_err, y_train_pred, y_cv_pred): the errors are the
    fraction of misclassified samples; the predictions are float arrays of
    shape ``(n, 1)`` holding the arg-max class index of every sample.
  """
  # Arg-max over the class axis in one vectorised call. The float cast and
  # the (n, 1) reshape keep the return format callers already receive.
  y_train_pred = np.argmax(model.predict(x_train), axis=1).astype(float).reshape(-1, 1)
  y_cv_pred = np.argmax(model.predict(x_cv), axis=1).astype(float).reshape(-1, 1)

  # Compare flattened arrays: an (n, 1) prediction against (n,) labels would
  # broadcast to an (n, n) matrix and produce a meaningless error rate.
  train_err = np.mean(y_train_pred.ravel() != np.asarray(y_train).ravel())
  cv_err = np.mean(y_cv_pred.ravel() != np.asarray(y_cv).ravel())

  return train_err,cv_err, y_train_pred,y_cv_pred

In [None]:
def testError(model,x_test,y_test):
    y_test_prob = model.predict(x_test)
    y_test_pred = np.zeros(len(y_test_prob))
    
    i=0
    for j in y_test_prob:
        y_test_pred[i] = np.argmax(j)
        i+=1
    y_test_pred = y_test_pred.reshape(len(y_test_prob),1)
    test_err = np.mean(y_test_pred!=y_test)
    return test_err,y_test_pred

In [None]:
def bestModel(modelErrors,overfitCriteria):
    """Pick the lowest-CV-error model among those with a small train/eval gap.

    Each entry of ``modelErrors`` is indexed as ``[train_err, cv_err,
    test_err]``. A model's gap score is
    ``(train - cv)**2 + (train - test)**2``; only models whose score is
    strictly below ``overfitCriteria`` are considered. Among those, the model
    with the smallest CV error wins (it must also be below the starting bound
    of 100). Ties on CV error go to the smaller gap score, then to the
    smaller index.

    Returns:
        Index of the chosen model in ``modelErrors``, or -1 if no model
        qualifies.
    """
    # Pair every gap score with the position of the model it belongs to.
    scored = [
        ((row[0] - row[1]) ** 2 + (row[0] - row[2]) ** 2, position)
        for position, row in enumerate(modelErrors)
    ]
    # Ascending by gap score; equal scores stay in ascending index order.
    scored.sort()

    chosen = -1
    lowestCv = 100
    for gap, position in scored:
        if not gap < overfitCriteria:
            continue
        candidateCv = modelErrors[position][1]
        # Strict comparison: the first candidate reaching a given CV error keeps it.
        if candidateCv < lowestCv:
            lowestCv = candidateCv
            chosen = position

    return chosen

In [None]:
# Hyper-parameter grid: every learning rate is paired with every L2 rate.
SAMPLE_LEARNING_RATES = [0.01,0.02]
SAMPLE_REG_RATES = [0.01,0.03]
models = []
# NOTE(review): the errors fed to bestModel are misclassification fractions in
# [0, 1], so its gap score (sum of two squared differences) cannot exceed 2;
# a threshold of 28 therefore never excludes a model. Confirm the intended value.
OVERFIT_CRITERIA = 28

ARRAY_SIZE = len(SAMPLE_LEARNING_RATES)*len(SAMPLE_REG_RATES)

# One slot per (learning rate, regularisation rate) combination.
train_errors = np.zeros(ARRAY_SIZE)
cv_errors = np.zeros(ARRAY_SIZE)
test_errors =  np.zeros(ARRAY_SIZE)
model_errors = np.zeros((ARRAY_SIZE,3))   # columns: train, cv, test
learningRates = np.zeros(ARRAY_SIZE)
regRates = np.zeros(ARRAY_SIZE)

# split() uses a fixed random_state, so it returns the same partition on every
# call; compute it once instead of once per configuration.
x_train,y_train,x_cv,y_cv,x_test,y_test = split(X,y)

# Same visiting order as nested loops: learning rate outer, reg rate inner.
hyperparam_grid = [
    (learning_rate, reg_rate)
    for learning_rate in SAMPLE_LEARNING_RATES
    for reg_rate in SAMPLE_REG_RATES
]

for i, (learning_rate, reg_rate) in enumerate(hyperparam_grid):
    lRate,regRate,model = modelF(x_train,y_train,learning_rate,reg_rate)
    train_err,cv_err,train_pred,cv_pred = trainAndCv(model,x_cv,y_cv,x_train,y_train)
    test_err,test_pred = testError(model,x_test,y_test)
    models.append(model)

    test_errors[i] = test_err
    train_errors[i] = train_err
    cv_errors[i] = cv_err
    model_errors[i] = [train_err,cv_err,test_err]

    learningRates[i] = lRate
    regRates[i] = regRate

In [None]:
print(f"Errors of all the models: \n{model_errors}\n")

indexOfBest = bestModel(model_errors,OVERFIT_CRITERIA)

# bestModel returns -1 when no model passes the overfit threshold. Indexing
# with -1 would silently report (and later save) the *last* model as the best
# one, so fail loudly instead.
if indexOfBest < 0:
    raise ValueError("No model satisfied OVERFIT_CRITERIA; cannot select a best model.")

print("Summary of the best model: ")
models[indexOfBest].summary()

print("Additional info about best model-: \n")
print(f"Error: {model_errors[indexOfBest]}")
print(f"Index: {indexOfBest}")
print(f"Learning Rate: {learningRates[indexOfBest]}")
print(f"regularization rate: {regRates[indexOfBest]}\n\n")

In [None]:
import pickle as pl

In [None]:
# Persist the selected model to model.pkl in the current working directory.
# NOTE(review): pickling a Keras model may not survive library upgrades;
# consider the framework's own save format -- confirm before relying on this file.
with open('model.pkl','wb') as model_file:
    pl.dump(models[indexOfBest], model_file)

In [53]:
# Reload the pickled model from disk into `model`.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model.pkl that this notebook produced.
with open('model.pkl','rb') as model_file:
    model = pl.load(model_file)

In [56]:
# Score one hand-written sample of 8 feature values.
# NOTE(review): presumably in the same column order as X -- confirm.
sample = np.array([[0,25,1,1,4,28.9,6.5,120]])
model.predict(sample)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step


array([[0.99656624, 0.00343379]], dtype=float32)

In [None]:
# Display the feature matrix X.
X

In [15]:
# Frequency of each blood_glucose_level value among rows where diabetes == 1.
diabetic_rows = df['diabetes'] == 1
df.loc[diabetic_rows, 'blood_glucose_level'].value_counts()

blood_glucose_level
280    729
160    696
130    692
300    674
159    666
145    662
200    647
126    636
240    636
260    635
140    625
220    603
155    599
Name: count, dtype: int64

In [52]:
# Check the container type of the first entry in the model's weight list.
layer_weights = model.get_weights()
type(layer_weights[0])

numpy.ndarray

In [None]:
# Each element in the array represents the weights or the biases of a layer.
# Each row represents the weights coming from one unit of the previous layer.
# Each column represents the weights of one unit in the current layer.
# relu(matrix.T @ input + col(b))     # applied 2 times (2 hidden layers)
# softmax(matrix.T @ input + col(b))  # applied last (1 output layer)

# softmax(arr[])
# take the exponential of each logit
# softmax_i = e^(z_i) / sum_j e^(z_j)

In [None]:
# (softmax)(2 into 4)(relu)(4 into 4)(relu)(4 into 8) (8 into 1)

In [None]:
# ggml_relu

In [None]:
# First of all, computation has to be done layer by layer in ggml for the neural network I am trying to implement... For a particular layer, what would be the difference if I initialise different result tensor
# metadata for each unit instead of together...

# good question to think about

In [None]:
# Does 2-D work with 1-D?
# Is 1-D row- or column-oriented by default?

In [None]:
# Transpose the weight array at index 5 and flatten it into a 1-D vector.
model.get_weights()[5].T.reshape(-1)

In [None]:
# Transpose of the weight array at index 2.
model.get_weights()[2].T