## Regression

In [35]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
import pandas as pd
from itertools import product

# Simulate a sparse user-item rating matrix with a real-valued target and
# export the observed (user, item) pairs, joined with their side features,
# to 'sim_0.9.csv'.

# Simulation settings.
n_user = 100
n_item = 1000
n_rank = 10
n_user_group = 5
n_item_group = 10
n_user_feature = 5
n_item_feature = 5
noise_sigma = 0.01

## MF part
# Latent factors are drawn from clustered blobs; their inner products give the
# matrix-factorisation component of the signal.
user_latent, user_group = make_blobs(n_samples=n_user, centers=n_user_group, n_features=n_rank,
                                     cluster_std=0.1, center_box=(-1, 1), random_state=0)
item_latent, item_group = make_blobs(n_samples=n_item, centers=n_item_group, n_features=n_rank,
                                     cluster_std=0.1, center_box=(-1, 1), random_state=1)
full_matrix_mf = np.tensordot(user_latent, item_latent, axes=[1, 1])

## user item features
np.random.seed(1)
user_feature = np.random.uniform(0, 1, (n_user, n_user_feature))
np.random.seed(0)
item_feature = np.random.uniform(0, 1, (n_item, n_item_feature))

# User x item interaction terms, computed by broadcasting instead of a Python
# loop over every pair. The operation order matches the scalar formulas
# 0.5*exp(-4*(u+v)+4) and 5*sin(2*pi*u*v).
inter_i1_u2 = 0.5 * np.exp(-4 * np.add.outer(user_feature[:, 2], item_feature[:, 1]) + 4)
inter_i2_u1 = 5 * np.sin(np.multiply.outer(2 * np.pi * user_feature[:, 1], item_feature[:, 2]))

# Main effects (linear in user column 0, quadratic in item column 0) plus the
# two interaction terms.
full_matrix_feature = (5 * user_feature[:, 0].reshape(-1, 1)
                       + (5 * item_feature[:, 0] ** 2).reshape(1, -1)
                       + inter_i1_u2 + inter_i2_u1)
assert full_matrix_feature.shape == (n_user, n_item)

# full matrix
np.random.seed(0)
full_matrix = 0.1 * full_matrix_mf + full_matrix_feature + noise_sigma * np.random.normal(0, 1, (n_user, n_item))
print(full_matrix)
binary_full_matrix = (full_matrix > 0).astype(float)

# Hide entries whose uniform draw is <= missing_rate; NaN marks "unobserved".
missing_rate = 0.9
np.random.seed(0)
input_mask_data = np.random.uniform(0, 1, binary_full_matrix.shape)
binary_full_matrix[input_mask_data <= missing_rate] = np.nan
full_matrix[np.isnan(binary_full_matrix)] = np.nan

# One row per observed entry: (user index, item index, target), in row-major order.
observed_rows, observed_cols = np.nonzero(~np.isnan(full_matrix))
pair = np.column_stack([observed_rows, observed_cols, full_matrix[observed_rows, observed_cols]])

# Prepend an id column to each feature table so it can be joined on.
user_feature = np.insert(user_feature, 0, values=np.arange(user_feature.shape[0]), axis=1)
item_feature = np.insert(item_feature, 0, values=np.arange(item_feature.shape[0]), axis=1)

user_feature_d = pd.DataFrame(
    user_feature, columns=['u_id'] + ['uf_{}'.format(k) for k in range(1, n_user_feature + 1)])
item_feature_d = pd.DataFrame(
    item_feature, columns=['i_id'] + ['if_{}'.format(k) for k in range(1, n_item_feature + 1)])

pair_d = pd.DataFrame(pair, columns=['user_id', 'item_id', 'target'])

pu = pd.merge(pair_d, user_feature_d, left_on='user_id', right_on='u_id')
puv = pd.merge(pu, item_feature_d, left_on='item_id', right_on='i_id')
# `columns=` / `axis=` are passed by keyword: pandas >= 2.0 rejects a positional axis.
puv = puv.drop(columns=['u_id', 'i_id'])

# Reorder columns: features first, then user_id, item_id, target.
target = puv.target
user_id = puv.user_id
item_id = puv.item_id
data = puv.drop(columns=['target', 'user_id', 'item_id'])
data = pd.concat([data, user_id, item_id, target], axis=1)
assert data.shape[1] == n_user_feature + n_item_feature + 3

data.to_csv('sim_0.9.csv', index=False)


[[ 7.11543722  4.91853929 11.17624764 ...  3.93190744 17.9309684
  16.99133486]
 [ 5.52978497  8.00198594  7.47153809 ...  8.36153068  4.87766179
   4.6576346 ]
 [ 6.77306402  2.99561535  9.8204663  ...  2.30691438 11.52009062
  10.97671608]
 ...
 [10.66664316 14.75199909 13.7965958  ... 14.33173802 16.3550069
  15.4749298 ]
 [ 2.75340956  1.04324924  5.7059259  ...  1.86860187  9.64693103
   9.23106361]
 [ 4.17390719  2.69391806  7.00289632 ...  4.07243355 10.37411099
  10.02865874]]


## Classification

In [80]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Simulate a sparse user-item matrix with a binary target and export the
# observed (user, item) pairs, joined with their side features, to
# 'sim_binary_0.9_2.csv'.

# Simulation settings.
n_user = 100
n_item = 1000
n_rank = 10
n_user_group = 5
n_item_group = 10
n_user_feature = 5
n_item_feature = 5
noise_sigma = 0.01

## MF part
# Latent factors are drawn from clustered blobs; their inner products give the
# matrix-factorisation component of the signal.
user_latent, user_group = make_blobs(n_samples=n_user, centers=n_user_group, n_features=n_rank,
                                     cluster_std=0.1, center_box=(-1, 1), random_state=0)
item_latent, item_group = make_blobs(n_samples=n_item, centers=n_item_group, n_features=n_rank,
                                     cluster_std=0.1, center_box=(-1, 1), random_state=1)
full_matrix_mf = np.tensordot(user_latent, item_latent, axes=[1, 1])

## user item features
np.random.seed(1)
user_feature = np.random.uniform(0, 1, (n_user, n_user_feature))
np.random.seed(0)
item_feature = np.random.uniform(0, 1, (n_item, n_item_feature))

# User x item interaction terms, computed by broadcasting. This avoids relying
# on `itertools.product`, which this cell never imports. The operation order
# matches the scalar formulas 0.5*exp(-4*(u+v)+4) and 5*sin(2*pi*u*v).
inter_i1_u2 = 0.5 * np.exp(-4 * np.add.outer(user_feature[:, 2], item_feature[:, 1]) + 4)
inter_i2_u1 = 5 * np.sin(np.multiply.outer(2 * np.pi * user_feature[:, 1], item_feature[:, 2]))

# Main effects (linear in user column 0, quadratic in item column 0) plus the
# two interaction terms. An earlier, purely additive variant (separate user
# and item effects with no cross terms) was kept here as a dead string literal
# and has been removed.
full_matrix_feature = (5 * user_feature[:, 0].reshape(-1, 1)
                       + (5 * item_feature[:, 0] ** 2).reshape(1, -1)
                       + inter_i1_u2 + inter_i2_u1)
assert full_matrix_feature.shape == (n_user, n_item)

# full matrix
np.random.seed(0)
full_matrix = 10 * full_matrix_mf + full_matrix_feature + noise_sigma * np.random.normal(0, 1, (n_user, n_item))

# Only used below as a carrier for the NaN mask.
binary_full_matrix = (full_matrix > 0).astype(float)

# MinMaxScaler rescales each column; fitting on the transpose therefore
# rescales each user's row to [0, 1].
scaler = MinMaxScaler()
full_matrix = scaler.fit_transform(full_matrix.T).T

# Binarise at the midpoint of the scaled range. The result is already within
# {0, 1}, so no further clipping is needed.
full_matrix = (full_matrix >= 0.5).astype(float)

# Hide entries whose uniform draw is <= missing_rate; NaN marks "unobserved".
missing_rate = 0.9
np.random.seed(0)
input_mask_data = np.random.uniform(0, 1, binary_full_matrix.shape)
binary_full_matrix[input_mask_data <= missing_rate] = np.nan
full_matrix[np.isnan(binary_full_matrix)] = np.nan

# One row per observed entry: (user index, item index, target), in row-major order.
observed_rows, observed_cols = np.nonzero(~np.isnan(full_matrix))
pair = np.column_stack([observed_rows, observed_cols, full_matrix[observed_rows, observed_cols]])

# Prepend an id column to each feature table so it can be joined on.
user_feature = np.insert(user_feature, 0, values=np.arange(user_feature.shape[0]), axis=1)
item_feature = np.insert(item_feature, 0, values=np.arange(item_feature.shape[0]), axis=1)

user_feature_d = pd.DataFrame(
    user_feature, columns=['u_id'] + ['uf_{}'.format(k) for k in range(1, n_user_feature + 1)])
item_feature_d = pd.DataFrame(
    item_feature, columns=['i_id'] + ['if_{}'.format(k) for k in range(1, n_item_feature + 1)])

pair_d = pd.DataFrame(pair, columns=['user_id', 'item_id', 'target'])

pu = pd.merge(pair_d, user_feature_d, left_on='user_id', right_on='u_id')
puv = pd.merge(pu, item_feature_d, left_on='item_id', right_on='i_id')
# `columns=` / `axis=` are passed by keyword: pandas >= 2.0 rejects a positional axis.
puv = puv.drop(columns=['u_id', 'i_id'])

# Reorder columns: features first, then user_id, item_id, target.
target = puv.target
user_id = puv.user_id
item_id = puv.item_id
data = puv.drop(columns=['target', 'user_id', 'item_id'])
data = pd.concat([data, user_id, item_id, target], axis=1)
assert data.shape[1] == n_user_feature + n_item_feature + 3

data.to_csv('sim_binary_0.9_2.csv', index=False)

In [75]:
# Ad-hoc inspection: select the entries of full_matrix that exceed 0.43194.
# NOTE(review): the recorded output is a shape tuple, which this expression
# cannot produce, and this cell's execution count (75) is lower than that of
# the cell that builds full_matrix (80). The output looks stale; re-run on a
# fresh kernel or remove this cell.
full_matrix[full_matrix>0.43194]

(50000,)