In [1]:
from Helper_functions import *
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from keras import regularizers

Using TensorFlow backend.


# Helper functions

In [2]:
def train(measure,units,layerz,shape,num,split):
    """Fit a regression network on one dependent measure and print its best MAE.

    The feature matrices are not parameters: this function reads the
    module-level ``train_data`` and ``test_data`` names, so both must be
    bound before it is called.

    Args:
        measure: label sequence; ``measure[:split]`` is fitted on and
            ``measure[split:]`` is used as the validation labels.
        units: forwarded to ``build_model_R``.
        layerz: forwarded to ``build_model_R``.
        shape: forwarded to ``build_model_R`` as the input dimension.
        num: number of training epochs.
        split: index separating training labels from validation labels.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``.
    """
    train_label, test_label = measure[:split], measure[split:]
    model = build_model_R(units, layerz, "mae", shape)

    history = model.fit(
        train_data,
        train_label,
        epochs=num,
        batch_size=100,
        validation_data=(test_data, test_label),
        verbose=0,
    )

    # Per-epoch curves recorded by fit(); report the best value and its epoch index.
    train_mae = history.history['mean_absolute_error']
    val_mae = history.history['val_mean_absolute_error']
    print("minimum Training MAE: ", np.min(train_mae))
    print("at: ", np.argmin(train_mae))
    print("minimum Validation MAE: ", np.min(val_mae))
    print("at: ", np.argmin(val_mae))
    return model, history

def train_C(measure,units,layerz,shape,num,split):
    """Fit a binary classifier on one dependent measure and print its best accuracy.

    The feature matrices are not parameters: this function reads the
    module-level ``train_data`` and ``test_data`` names, so both must be
    bound before it is called.

    Args:
        measure: label sequence; ``measure[0:split]`` is fitted on and
            ``measure[split:]`` is used as the validation labels.
        units: forwarded to ``build_model_C``.
        layerz: forwarded to ``build_model_C``.
        shape: forwarded to ``build_model_C`` as the input dimension.
        num: number of training epochs.
        split: index separating training labels from validation labels.

    Returns:
        ``(model, history)``: the fitted model and the object returned by
        ``model.fit``.
    """
    train_label = measure[0:split]
    test_label = measure[split:]
    model = build_model_C(units,layerz,"accuracy",shape)

    history = model.fit(
        train_data,train_label, epochs = num, batch_size = 100 ,validation_data = (test_data,test_label), verbose = 0 )

    # Accuracy is better when higher, so the best epoch is the arg-max; the
    # printed labels say "maximum" to match what is actually computed.
    print("maximum Training accuracy: ", np.max(history.history['acc']))
    print("at: ", np.argmax(history.history['acc']))
    print("maximum Validation accuracy: ", np.max(history.history['val_acc']))
    print("at: ", np.argmax(history.history['val_acc']))
    return model,history


def build_model_R(units,layerz,metric,shape,lozz= 'logcosh'):
    """Build and compile a fully connected regression network.

    The stack is: one ReLU ``Dense(units)`` input layer taking vectors of
    length ``shape``, then ``layerz`` further ReLU ``Dense(units)`` layers,
    then a single-unit ``Dense`` output with no activation.

    ``models`` and ``layers`` are not imported explicitly in this notebook;
    presumably they arrive via ``from Helper_functions import *`` (TODO confirm).

    Args:
        units: number of units in every hidden layer.
        layerz: number of hidden layers added after the input layer.
        metric: metric name passed to ``compile`` (wrapped in a list).
        shape: length of one input feature vector.
        lozz: loss passed to ``compile``; defaults to ``'logcosh'``.

    Returns:
        The compiled model (optimizer ``'rmsprop'``).
    """
    model = models.Sequential()
    model.add(layers.Dense(units, activation = 'relu',
                           input_shape = (shape,)))
    for _ in range(layerz):
        model.add(layers.Dense(units, activation = 'relu'))
    model.add(layers.Dense(1)) #linear layer
    model.compile(optimizer = 'rmsprop', loss = lozz, metrics =[metric] )
    return model


def build_model_C(units,layerz,metric,shape,lozz = 'binary_crossentropy'):
    """Build and compile a fully connected binary classifier.

    The stack is: one L2-regularised (0.001) ReLU ``Dense(units)`` input
    layer taking vectors of length ``shape``; then ``layerz`` repetitions of
    ``Dropout(0.4)`` followed by a ReLU ``Dense(units)``; then a single
    sigmoid output unit.

    Args:
        units: number of units in every hidden layer.
        layerz: number of Dropout + Dense pairs added after the input layer.
        metric: metric name passed to ``compile`` (wrapped in a list).
        shape: length of one input feature vector.
        lozz: loss passed to ``compile``; defaults to ``'binary_crossentropy'``.

    Returns:
        The compiled model (optimizer ``'Adam'``).
    """
    net = models.Sequential()

    # Only this first layer carries the weight penalty.
    first_hidden = layers.Dense(
        units,
        activation='relu',
        kernel_regularizer=regularizers.l2(0.001),
        input_shape=(shape,),
    )
    net.add(first_hidden)

    for _ in range(layerz):
        net.add(layers.Dropout(0.4))
        net.add(layers.Dense(units, activation='relu'))

    net.add(layers.Dense(1, activation="sigmoid"))
    net.compile(optimizer='Adam', loss=lozz, metrics=[metric])
    return net

def smooth_points(histories, factor=0.9):
    """Exponentially smooth the validation and training MAE curves.

    The first point of each curve is kept as is; every later point becomes
    ``previous_smoothed * factor + point * (1 - factor)``. The two curves are
    walked in lockstep, so the output is as long as the shorter input.

    Args:
        histories: mapping with ``'val_mean_absolute_error'`` and
            ``'mean_absolute_error'`` sequences.
        factor: weight given to the previous smoothed value.

    Returns:
        dict with the same two keys holding the smoothed lists.
    """
    keep = 1 - factor
    val_smooth, train_smooth = [], []
    paired = zip(histories['val_mean_absolute_error'], histories['mean_absolute_error'])

    for idx, (val_pt, train_pt) in enumerate(paired):
        if idx == 0:
            # Seed both running averages with the raw first observation.
            val_smooth.append(val_pt)
            train_smooth.append(train_pt)
            continue
        val_smooth.append(val_smooth[-1] * factor + val_pt * keep)
        train_smooth.append(train_smooth[-1] * factor + train_pt * keep)

    return {
        'val_mean_absolute_error': val_smooth,
        'mean_absolute_error': train_smooth,
    }


def plot(histories):
    """Plot validation and training MAE per epoch in two side-by-side panels.

    The first 10 epochs are dropped from both curves. The left panel shows
    ``histories['val_mean_absolute_error']`` and the right panel shows
    ``histories['mean_absolute_error']`` (in red).

    Args:
        histories: mapping with ``'val_mean_absolute_error'`` and
            ``'mean_absolute_error'`` sequences, one value per epoch.

    Returns:
        None; the figure is created through ``plt.subplots``.
    """
    skip = 10
    val_mae = histories['val_mean_absolute_error']
    train_mae = histories['mean_absolute_error']
    epoch = range(1, len(val_mae) + 1)[skip:]
    f, axes = plt.subplots(1, 2, figsize=(12,12))

    # Each label names the series actually drawn: the 'val_' key is the validation curve.
    axes[0].plot(epoch, val_mae[skip:], label='Validation')
    axes[1].plot(epoch, train_mae[skip:], 'r', label='Training')

    # Both panels show MAE, so they share axis labels and each gets its legend.
    for ax in axes:
        ax.set_xlabel("Epoch")
        ax.set_ylabel("MAE")
        ax.legend()

    
def plot_C(histories):
    """Plot validation and training accuracy per epoch in two side-by-side panels.

    The first 10 epochs are dropped from both curves. The left panel shows
    ``histories['val_acc']`` and the right panel shows ``histories['acc']``
    (in red).

    Args:
        histories: mapping with ``'val_acc'`` and ``'acc'`` sequences, one
            value per epoch.

    Returns:
        None; the figure is created through ``plt.subplots``.
    """
    skip = 10
    val_acc = histories['val_acc']
    train_acc = histories['acc']
    epoch = range(1, len(val_acc) + 1)[skip:]
    f, axes = plt.subplots(1, 2, figsize=(12,12))

    # Each label names the series actually drawn: the 'val_' key is the validation curve.
    axes[0].plot(epoch, val_acc[skip:], label='Validation')
    axes[1].plot(epoch, train_acc[skip:], 'r', label='Training')

    # Both panels show accuracy, so they share axis labels and each gets its legend.
    for ax in axes:
        ax.set_xlabel("Epoch")
        ax.set_ylabel("accuracy")
        ax.legend()

    
def preprocess1(path1,path2,ratio,shuffle=False,seed=None):
    """Load two CSV frames, stack them, split into train/test and standardise.

    Features are the columns from ``"dateorder"`` through ``"liked"`` and
    labels are the columns from ``"liked"`` onwards; rows of the second file
    are appended below those of the first. The leading ``ratio`` fraction of
    rows becomes the training set, and both sets are standardised with the
    training set's per-column mean and standard deviation.

    Args:
        path1: path of the first CSV file.
        path2: path of the second CSV file.
        ratio: fraction of rows (from the top) used for training.
        shuffle: when True, rows (features and labels together) are permuted
            before the split. Defaults to False, which keeps file order.
        seed: seed for the permutation; only used when ``shuffle`` is True.

    Returns:
        ``(train_data, test_data, dependent_measures, split)`` where
        ``dependent_measures`` maps measure names to full-length 1-D label
        columns and ``split`` is the row index separating train from test.
    """
    pair1_df = pd.read_csv(path1)
    pair2_df = pd.read_csv(path2)
    frames = (pair1_df, pair2_df)

    # NOTE(review): label-based .loc slices include BOTH endpoints, so the
    # "liked" column ends up in the features as well as being labels[:, 0].
    # Left as is to keep the returned feature width stable; confirm whether
    # this overlap is intended.
    data = np.concatenate(
        [df.loc[:,"dateorder":"liked"].values.astype("float") for df in frames],
        axis = 0)
    labels = np.concatenate(
        [df.loc[:,"liked":].values.astype("float") for df in frames],
        axis = 0)

    # Rows stay in file order unless the caller opts in to shuffling; one
    # shared permutation keeps features and labels aligned.
    if shuffle:
        order = np.random.RandomState(seed).permutation(data.shape[0])
        data = data[order]
        labels = labels[order]

    # NOTE(review): fsaidyes reads column 6, the same column as saidyes.
    # The CSV schema is not visible here; verify the intended index.
    dependent_measures = dict(
    liked = labels[:,0], sexatt = labels[:,1], likyes = labels[:,2],
    fliked = labels[:,3], fsexatt = labels[:,4] , flikyes = labels[:,5],
    saidyes = labels[:,6], fsaidyes = labels[:,6], match = labels[:,7]
    )

    # partition train and test
    split = int(ratio * data.shape[0])
    train_data = data[0:split]
    test_data = data[split:]

    # normalize with training statistics only
    mean = train_data.mean(axis = 0)
    std = train_data.std(axis = 0)
    # A constant training column has std == 0; dividing by it would fill the
    # column with NaN/inf, so such columns are only centred.
    std[std == 0] = 1.0

    train_data -= mean
    train_data /= std

    test_data -= mean
    test_data /= std

    return train_data,test_data, dependent_measures,split

    

# Preprocessing

In [4]:
# Inputs for the combined (pair1 + pair2) data set and the train fraction.
path1 = "./pair1Dataframe.csv"
path2 = "./pair2Dataframe.csv"
ratio = 0.8

# Binds the module-level train_data / test_data that train() and train_C() read.
train_data, test_data, dependent_measures, split = preprocess1(path1, path2, ratio)


In [3]:
# NOTE(review): `preprocess` is not defined anywhere in the visible notebook
# (only `preprocess1(path1, path2, ratio)` is), so this cell raises NameError
# on a fresh kernel. If it did run, it would rebind `ratio`,
# `dependent_measures` and `split`, which other cells also read.
path, ratio = "./pair2Dataframe.csv", 0.8
train_data2, test_data2, dependent_measures,split  = preprocess(path,ratio)

NameError: name 'preprocess' is not defined

# Benchmark (random)

In [5]:
# One random integer rating in [1, 10) per training row; unseeded, so it
# differs between runs.
random_predict = np.random.randint(low=1, high=10, size=train_data.shape[0])

In [6]:
measures = ['liked', 'sexatt', "likyes", 'fliked', 'fsexatt', "flikyes"]
for measure in measures:
    # Share of training labels (truncated to int) that equal the training
    # mean (also truncated to int), i.e. a constant-prediction baseline.
    train_labels = dependent_measures[measure][0:split]
    constant_guess = np.mean(train_labels).astype('int64')
    hits = train_labels.astype('int64') == constant_guess
    score = hits.sum() / train_data.shape[0]
    print(measure, score)

liked 0.16707317073170733
sexatt 0.1274390243902439
likyes 0.0975609756097561
fliked 0.14634146341463414
fsexatt 0.1048780487804878
flikyes 0.0676829268292683


# Type 2

## Training (using Classification)

In [3]:
# NOTE(review): `preprocess` is not defined anywhere in the visible notebook
# (only `preprocess1(path1, path2, ratio)` is), so this cell raises NameError
# on a fresh kernel. It is written to rebind `train_data`, `test_data`,
# `dependent_measures` and `split`; since it fails, the training cells that
# follow use whatever those names were bound to earlier.
path, ratio = "./pair2Dataframe.csv", 0.8
train_data, test_data, dependent_measures,split  = preprocess(path,ratio)

NameError: name 'preprocess' is not defined

In [None]:
# Classifier for the "match" measure: 5 extra hidden blocks, 50000 epochs.
units = 64
layerz = 5
shape = train_data.shape[1]
num = 50000

match_model_C, history = train_C(dependent_measures["match"], units, layerz, shape, num, split)
plot_C(history.history)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.


In [None]:
# Classifier for the "saidyes" measure: 3 extra hidden blocks, 50000 epochs.
units = 64
layerz = 3
shape = train_data.shape[1]
num = 50000

saidyes_model_C, history = train_C(dependent_measures["saidyes"], units, layerz, shape, num, split)
plot_C(history.history)

In [None]:
# Classifier for the "fsaidyes" measure: 3 extra hidden blocks, 10000 epochs.
units = 64
layerz = 3
shape = train_data.shape[1]
num = 10000

fsaidyes_model_C, history = train_C(dependent_measures["fsaidyes"], units, layerz, shape, num, split)
plot_C(history.history)