## Regression

In [2]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
import pandas as pd

# --- Simulation settings -------------------------------------------------
# Size of the user x item matrix and width of the latent vectors.
n_user, n_item, n_rank = 100, 1000, 10
# Number of blob centres used when sampling latent user / item vectors.
n_user_group, n_item_group = 5, 10
# Number of explicit side features drawn per user and per item.
n_user_feature, n_item_feature = 5, 5
# Scale applied to the standard-normal noise added to every matrix entry.
noise_sigma = 0.01

## MF part
# Latent vectors are sampled as tight blobs (std 0.1, centres inside [-1, 1]);
# make_blobs returns the sampled points together with their cluster labels.
_blob_options = dict(n_features=n_rank, cluster_std=0.1, center_box=(-1, 1))
user_id, user_group = make_blobs(n_samples=n_user, centers=n_user_group,
                                 random_state=0, **_blob_options)
item_id, item_group = make_blobs(n_samples=n_item, centers=n_item_group,
                                 random_state=1, **_blob_options)
# Contract the latent axis of both arrays: entry (u, i) is the inner product
# of user u's and item i's latent vectors.
full_matrix_mf = np.tensordot(user_id, item_id, axes=(1, 1))

## user item features
# NOTE(review): the generator is reset to seed 0 before each draw, so with equal
# feature counts the first n_user rows of item_feature repeat user_feature
# exactly -- confirm this coupling is intended before relying on the data.
np.random.seed(0)
user_feature = np.random.uniform(0, 1, size=(n_user, n_user_feature))
np.random.seed(0)
item_feature = np.random.uniform(0, 1, size=(n_item, n_item_feature))

# Only the first three columns of each feature matrix influence the target.
u0, u1, u2 = user_feature[:, 0], user_feature[:, 1], user_feature[:, 2]
i0, i1, i2 = item_feature[:, 0], item_feature[:, 1], item_feature[:, 2]

# Per-user effect: linear in the first feature plus an exponential decay in the
# sum of the second and third.
user_feature_effect = 5 * u0 + 0.5 * np.exp(-4 * (u1 + u2) + 4)
# Per-item effect: quadratic in the first feature plus a sine of the product of
# the second and third.
item_feature_effect = 5 * i0 ** 2 + 5 * np.sin(2 * np.pi * i1 * i2)
# Broadcast a column of user effects against a row of item effects.
full_matrix_feature = user_feature_effect[:, np.newaxis] + item_feature_effect[np.newaxis, :]

# full matrix
# Target = down-weighted latent part + feature effects + Gaussian noise.
np.random.seed(0)
gaussian_noise = noise_sigma * np.random.normal(0, 1, (n_user, n_item))
full_matrix = 0.1*full_matrix_mf + full_matrix_feature + gaussian_noise
print(full_matrix)
# Float 0/1 indicator of positive entries; it has to be float so that it can
# hold NaN once the mask is applied below.
binary_full_matrix = (full_matrix > 0).astype(float)

# Hide every entry whose uniform draw is at or below missing_rate.
# NOTE(review): this draw also restarts from seed 0, i.e. it replays the same
# uniform stream as the feature matrices -- confirm that is acceptable.
missing_rate = 0.9
np.random.seed(0)
input_mask_data = np.random.uniform(0, 1, size=binary_full_matrix.shape)
binary_full_matrix[input_mask_data <= missing_rate] = np.nan
# Mirror the hidden positions onto the real-valued matrix.
unobserved = np.isnan(binary_full_matrix)
full_matrix[unobserved] = np.nan

# Collect every observed (non-NaN) entry as a (row index, column index, value)
# triplet. np.nonzero reports positions in row-major order, which is the order a
# nested loop over rows and then columns would visit them, and the stacked
# result keeps shape (0, 3) even when no entry is observed.
observed_rows, observed_cols = np.nonzero(~np.isnan(full_matrix))
pair = np.column_stack(
    (observed_rows, observed_cols, full_matrix[observed_rows, observed_cols])
)
# Prepend a running index column so each feature row carries its own id.
# The ids end up stored as floats because the feature arrays are float.
user_feature = np.insert(user_feature, 0, values=np.arange(user_feature.shape[0]), axis=1)
item_feature = np.insert(item_feature, 0, values=np.arange(item_feature.shape[0]), axis=1)

# Derive the column names from the array width instead of assuming exactly
# five features, so a different feature count does not break the DataFrame.
user_columns = ['u_id'] + ['uf_{}'.format(k) for k in range(1, user_feature.shape[1])]
item_columns = ['i_id'] + ['if_{}'.format(k) for k in range(1, item_feature.shape[1])]
user_feature_d = pd.DataFrame(user_feature, columns=user_columns)
item_feature_d = pd.DataFrame(item_feature, columns=item_columns)

pair_d = pd.DataFrame(pair, columns=['user_id', 'item_id', 'target'])

# Attach the user features, then the item features, to every observed pair.
pu = pd.merge(pair_d, user_feature_d, left_on='user_id', right_on='u_id')
puv = pd.merge(pu, item_feature_d, left_on='item_id', right_on='i_id')
# The join keys duplicate user_id / item_id. `columns=` is spelled out because
# pandas 2.0 rejects the axis when it is passed positionally.
puv = puv.drop(columns=['u_id', 'i_id'])

target = puv.target
user_id = puv.user_id
item_id = puv.item_id
# Final column order: feature columns first, then both ids, then the target.
data = puv.drop(columns=['target', 'user_id', 'item_id'])
data = pd.concat([data, user_id, item_id, target], axis=1)

# index=False is the documented way to leave the row index out of the file.
data.to_csv('sim_0.9.csv', index=False)


[[ 6.44172921  8.058718   10.73569269 ...  6.6408403   4.68765424
   4.69435679]
 [ 6.89543819  8.57366631 11.2136446  ...  7.14182168  5.19376159
   5.1820424 ]
 [ 7.7849071   9.49865454 12.11508605 ...  8.04812638  6.10585564
   6.16380269]
 ...
 [ 6.84998095  8.41626529 11.12021372 ...  6.9849165   5.05959146
   4.94358187]
 [ 5.04195495  6.78265269  9.36182525 ...  5.14289549  3.17819307
   3.05990522]
 [ 5.78011576  7.59402613 10.23005484 ...  6.22908345  4.26426756
   4.10112939]]


## Classification

In [2]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# --- Simulation settings -------------------------------------------------
# Size of the user x item matrix and width of the latent vectors.
n_user, n_item, n_rank = 100, 1000, 10
# Number of blob centres used when sampling latent user / item vectors.
n_user_group, n_item_group = 5, 10
# Number of explicit side features drawn per user and per item.
n_user_feature, n_item_feature = 5, 5
# Scale applied to the standard-normal noise added to every matrix entry.
noise_sigma = 0.01

## MF part
# Latent vectors are sampled as tight blobs (std 0.1, centres inside [-1, 1]);
# make_blobs returns the sampled points together with their cluster labels.
_blob_options = dict(n_features=n_rank, cluster_std=0.1, center_box=(-1, 1))
user_id, user_group = make_blobs(n_samples=n_user, centers=n_user_group,
                                 random_state=0, **_blob_options)
item_id, item_group = make_blobs(n_samples=n_item, centers=n_item_group,
                                 random_state=1, **_blob_options)
# Contract the latent axis of both arrays: entry (u, i) is the inner product
# of user u's and item i's latent vectors.
full_matrix_mf = np.tensordot(user_id, item_id, axes=(1, 1))

## user item features
# NOTE(review): the generator is reset to seed 0 before each draw, so with equal
# feature counts the first n_user rows of item_feature repeat user_feature
# exactly -- confirm this coupling is intended before relying on the data.
np.random.seed(0)
user_feature = np.random.uniform(0, 1, size=(n_user, n_user_feature))
np.random.seed(0)
item_feature = np.random.uniform(0, 1, size=(n_item, n_item_feature))

# Only the first three columns of each feature matrix influence the target.
u0, u1, u2 = user_feature[:, 0], user_feature[:, 1], user_feature[:, 2]
i0, i1, i2 = item_feature[:, 0], item_feature[:, 1], item_feature[:, 2]

# Per-user effect: linear in the first feature plus an exponential decay in the
# sum of the second and third.
user_feature_effect = 5 * u0 + 0.5 * np.exp(-4 * (u1 + u2) + 4)
# Per-item effect: quadratic in the first feature plus a sine of the product of
# the second and third.
item_feature_effect = 5 * i0 ** 2 + 5 * np.sin(2 * np.pi * i1 * i2)
# Broadcast a column of user effects against a row of item effects.
full_matrix_feature = user_feature_effect[:, np.newaxis] + item_feature_effect[np.newaxis, :]

# full matrix
# Score = latent part + feature effects + Gaussian noise.
np.random.seed(0)
gaussian_noise = noise_sigma * np.random.normal(0, 1, (n_user, n_item))
full_matrix = full_matrix_mf + full_matrix_feature + gaussian_noise

# Float 0/1 indicator of positive scores; further down it is only used to
# carry the NaN mask, which is why it has to be a float array.
binary_full_matrix = (full_matrix > 0).astype(float)

# MinMaxScaler rescales each column, so transposing before and after the call
# rescales every row (one row per user) of the score matrix.
model = MinMaxScaler()
full_matrix = model.fit_transform(full_matrix.T).T

# Turn the scaled scores into labels: 1 at or above 0.6, otherwise 0. Both
# masks are taken before writing; entries set to 1 could never match the
# second mask, so the outcome equals applying the two rules in sequence.
at_or_above_cut = full_matrix >= 0.6
below_cut = full_matrix < 0.6
full_matrix[at_or_above_cut] = 1
full_matrix[below_cut] = 0

# Final clamp into [0, 1]; the labels written above are already 0 or 1.
full_matrix[full_matrix < 0] = 0
full_matrix[full_matrix > 1] = 1

# p_0 = 1- full_matrix
# for i in range(full_matrix.shape[0]):
#     for j in range(full_matrix.shape[1]):
#         full_matrix[i][j] = np.random.choice([0,1],p=np.array([full_matrix[i][j],p_0[i][j]]).ravel())

# Hide every entry whose uniform draw is at or below missing_rate.
# NOTE(review): this draw restarts from seed 0, i.e. it replays the same uniform
# stream as the feature matrices -- confirm that is acceptable.
missing_rate = 0.9
np.random.seed(0)
input_mask_data = np.random.uniform(0, 1, size=binary_full_matrix.shape)
binary_full_matrix[input_mask_data <= missing_rate] = np.nan
# Mirror the hidden positions onto the label matrix.
unobserved = np.isnan(binary_full_matrix)
full_matrix[unobserved] = np.nan

# Collect every observed (non-NaN) entry as a (row index, column index, value)
# triplet. np.nonzero reports positions in row-major order, which is the order a
# nested loop over rows and then columns would visit them, and the stacked
# result keeps shape (0, 3) even when no entry is observed.
observed_rows, observed_cols = np.nonzero(~np.isnan(full_matrix))
pair = np.column_stack(
    (observed_rows, observed_cols, full_matrix[observed_rows, observed_cols])
)
# Prepend a running index column so each feature row carries its own id.
# The ids end up stored as floats because the feature arrays are float.
user_feature = np.insert(user_feature, 0, values=np.arange(user_feature.shape[0]), axis=1)
item_feature = np.insert(item_feature, 0, values=np.arange(item_feature.shape[0]), axis=1)

# Derive the column names from the array width instead of assuming exactly
# five features, so a different feature count does not break the DataFrame.
user_columns = ['u_id'] + ['uf_{}'.format(k) for k in range(1, user_feature.shape[1])]
item_columns = ['i_id'] + ['if_{}'.format(k) for k in range(1, item_feature.shape[1])]
user_feature_d = pd.DataFrame(user_feature, columns=user_columns)
item_feature_d = pd.DataFrame(item_feature, columns=item_columns)

pair_d = pd.DataFrame(pair, columns=['user_id', 'item_id', 'target'])

# Attach the user features, then the item features, to every observed pair.
pu = pd.merge(pair_d, user_feature_d, left_on='user_id', right_on='u_id')
puv = pd.merge(pu, item_feature_d, left_on='item_id', right_on='i_id')
# The join keys duplicate user_id / item_id. `columns=` is spelled out because
# pandas 2.0 rejects the axis when it is passed positionally.
puv = puv.drop(columns=['u_id', 'i_id'])

target = puv.target
user_id = puv.user_id
item_id = puv.item_id
# Final column order: feature columns first, then both ids, then the target.
data = puv.drop(columns=['target', 'user_id', 'item_id'])
data = pd.concat([data, user_id, item_id, target], axis=1)

# index=False is the documented way to leave the row index out of the file.
data.to_csv('sim_binary_0.9_2.csv', index=False)