In [2]:
pip install jovian --quiet

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip3 install jovian



In [4]:
#library imports
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from scipy import sparse
import sys
import jovian

In [5]:
actual_set = pd.read_csv('actual_set.csv')
app_installs = pd.read_csv('app_installs.csv')
app_metadata = pd.read_csv('app_metadata.csv')
app_usage = pd.read_csv('app_usage.csv')
user_metadata = pd.read_csv('user_metadata.csv')
validation_data = pd.read_csv('validation_data.csv')

In [6]:
user_metadata.head()

Unnamed: 0,uid,device,device_category,state,city,network_type,user_lang,space_available
0,94698,SM-M215F,Mid,Assam,Dibrugarh,4G,en-US,94.32
1,257076,SM-J400F,Mass,West Bengal,Siliguri,,,
2,283805,SM-J415F,Mass,Jharkhand,Deogarh,wifi,en-US,11.45
3,248262,SM-A336E,High,Gujarat,Ahmedabad,4G,en-US,91.2
4,283806,SM-M205F,Mid,National Capital Territory of Delhi,Delhi,4G,en-US,42.62


In [7]:
app_usage = app_usage.drop('app_use_date', axis =1)

In [8]:
app_usage.head()

Unnamed: 0,uid,item_id,time_spent
0,185459,601235,2180211
1,164721,601235,4850939
2,22949,601235,1026713
3,1773,601235,419773
4,12539,601235,1609444


In [9]:
#Filtering users who spent time atleast more than 10 seconds
app_usage = app_usage.loc[app_usage['time_spent'] > 10000] 

#Converting milliseconds to minutes
app_usage['time_spent'] = app_usage['time_spent'].div(60000).round(2)

app_usage.head(5)

Unnamed: 0,uid,item_id,time_spent
0,185459,601235,36.34
1,164721,601235,80.85
2,22949,601235,17.11
3,1773,601235,7.0
4,12539,601235,26.82


In [10]:
app_usage.rename(columns = {'time_spent':'rating'}, inplace = True) 

In [11]:
app_usage.head(5)

Unnamed: 0,uid,item_id,rating
0,185459,601235,36.34
1,164721,601235,80.85
2,22949,601235,17.11
3,1773,601235,7.0
4,12539,601235,26.82


In [12]:
app_usage.isnull().sum()

uid        0
item_id    0
rating     0
dtype: int64

In [13]:
Counter(app_usage.rating)

Counter({36.34: 285,
         80.85: 109,
         17.11: 749,
         7.0: 1863,
         26.82: 441,
         14.88: 874,
         25.61: 467,
         6.85: 1872,
         36.51: 277,
         0.89: 9987,
         1.74: 6374,
         1.23: 8285,
         0.96: 9630,
         0.34: 16738,
         3.33: 3790,
         6.42: 2108,
         0.18: 24403,
         7.19: 1753,
         0.58: 13389,
         0.65: 12525,
         1.0: 9434,
         0.54: 14657,
         0.99: 9502,
         1.35: 7702,
         12.02: 1096,
         0.51: 14319,
         7.14: 1915,
         10.98: 1230,
         2.72: 4449,
         17.62: 787,
         5.54: 2320,
         2.34: 5227,
         12.89: 1056,
         0.42: 14971,
         0.93: 9891,
         2.35: 5127,
         52.42: 198,
         1.03: 9461,
         14.04: 944,
         0.2: 22879,
         3.9: 3214,
         8.49: 1514,
         1.63: 6750,
         5.72: 2272,
         1.28: 8037,
         2.16: 5450,
         5.51: 2277,
      

In [14]:
Counter(app_usage.groupby(['uid']).count()['item_id'])

Counter({26: 5518,
         6: 3730,
         20: 5782,
         28: 5441,
         83: 179,
         23: 5753,
         3: 2630,
         41: 3249,
         86: 137,
         71: 426,
         16: 5468,
         40: 3416,
         27: 5367,
         34: 4369,
         17: 5539,
         30: 4982,
         66: 622,
         76: 299,
         2: 2391,
         18: 5719,
         42: 3058,
         54: 1533,
         95: 89,
         51: 1795,
         31: 4982,
         49: 2074,
         22: 5719,
         7: 4134,
         24: 5644,
         37: 3984,
         13: 5045,
         39: 3604,
         12: 4930,
         9: 4332,
         19: 5665,
         35: 4243,
         14: 5330,
         38: 3816,
         62: 827,
         4: 2960,
         11: 4653,
         36: 4215,
         50: 2008,
         48: 2310,
         25: 5591,
         45: 2666,
         21: 5719,
         8: 4119,
         15: 5555,
         57: 1270,
         98: 61,
         33: 4622,
         29: 5198,
         4

In [15]:
#Average number of ratings per user
np.mean(app_usage.groupby(['uid']).count()['item_id'])

28.649443781612867

In [16]:
train_df, valid_df = train_test_split(app_usage, test_size=0.2)

#resetting indices to avoid indexing errors in the future
train_df = train_df.reset_index()[['uid', 'item_id', 'rating']]
valid_df = valid_df.reset_index()[['uid', 'item_id', 'rating']]

In [17]:
def encode_column(column):
    """ Encodes a pandas column with continous IDs"""
    keys = column.unique()
    key_to_id = {key:idx for idx,key in enumerate(keys)}
    return key_to_id, np.array([key_to_id[x] for x in column]), len(keys)

In [18]:
def encode_df(user_df):
    """Encodes rating data with continuous user and an item ids"""
    
    item_ids, user_df['item_id'], num_item = encode_column(user_df['item_id'])
    user_ids, user_df['uid'], num_users = encode_column(user_df['uid'])
    return user_df, num_users, num_item, user_ids, item_ids

In [19]:
user_df, num_users, num_item, user_ids, item_ids = encode_df(train_df)
print("Number of users :", num_users)
print("Number of apps :", num_item)
user_df.head()

Number of users : 237456
Number of apps : 43509


Unnamed: 0,uid,item_id,rating
0,0,0,4.45
1,1,1,6.23
2,2,2,3.68
3,3,3,6.96
4,4,4,47.56


In [20]:
def create_embeddings(n, K):
    """
    Creates a random numpy matrix of shape n, K with uniform values in (0, 11/K)
    n: number of items/users
    K: number of factors in the embedding 
    """
    return 11*np.random.random((n, K)) / K

In [21]:
def create_sparse_matrix(df, rows, cols, column_name="rating"):
    """ Returns a sparse utility matrix""" 
    return sparse.csc_matrix((df[column_name].values,(df['uid'].values, df['item_id'].values)),shape=(rows, cols))

In [22]:
user_df, num_users, num_item, user_ids, item_ids = encode_df(train_df)
Y = create_sparse_matrix(user_df, num_users, num_item)

In [23]:
Y.todense()

matrix([[21.81,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
        [ 0.  ,  6.23,  0.  , ...,  0.  ,  0.  ,  0.  ],
        [ 0.  ,  0.  ,  3.68, ...,  0.  ,  0.  ,  0.  ],
        ...,
        [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
        [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ],
        [ 0.  ,  0.  ,  0.  , ...,  0.  ,  0.  ,  0.  ]])

In [24]:
def predict(df, emb_user, emb_item):
    """ This function computes df["prediction"] without doing (U*V^T).
    
    Computes df["prediction"] by using elementwise multiplication of the corresponding embeddings and then 
    sum to get the prediction u_i*v_j. This avoids creating the dense matrix U*V^T.
    """
    df['prediction'] = np.sum(np.multiply(emb_item[df['item_id']],emb_user[df['uid']]), axis=1)
    return df

In [25]:
lmbda = 0.0002

In [26]:
def cost(df, emb_user, emb_item):
    """ Computes mean square error"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_item.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_item), emb_user.shape[0], emb_item.shape[0], 'prediction')
    return np.sum((Y-predicted).power(2))/df.shape[0] 

In [27]:
def gradient(df, emb_user, emb_item):
    """ Computes the gradient for user and item embeddings"""
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_item.shape[0])
    predicted = create_sparse_matrix(predict(df, emb_user, emb_item), emb_user.shape[0], emb_item.shape[0], 'prediction')
    delta =(Y-predicted)
    grad_user = (-2/df.shape[0])*(delta*emb_item) + 2*lmbda*emb_user
    grad_item = (-2/df.shape[0])*(delta.T*emb_user) + 2*lmbda*emb_item
    return grad_user, grad_item

In [28]:
def gradient_descent(df, emb_user, emb_item, iterations=2000, learning_rate=0.01, df_val=None):
    """ 
    Computes gradient descent with momentum (0.9) for given number of iterations.
    emb_user: the trained user embedding
    emb_item: the trained item embedding
    """
    Y = create_sparse_matrix(df, emb_user.shape[0], emb_item.shape[0])
    beta = 0.9
    grad_user, grad_item = gradient(df, emb_user, emb_item)
    v_user = grad_user
    v_item = grad_item
    for i in range(iterations):
        grad_user, grad_item = gradient(df, emb_user, emb_item)
        v_user = beta*v_user + (1-beta)*grad_user
        v_item = beta*v_item + (1-beta)*grad_item
        emb_user = emb_user - learning_rate*v_user
        emb_item = emb_item - learning_rate*v_item
        if(not (i+1)%50):
            print("\niteration", i+1, ":")
            print("train mse:",  cost(df, emb_user, emb_item))
            if df_val is not None:
                print("validation mse:",  cost(df_val, emb_user, emb_item))
    return emb_user, emb_item

In [29]:
emb_user = create_embeddings(num_users, 3)
emb_item = create_embeddings(num_item, 3)
emb_user, emb_item = gradient_descent(user_df, emb_user, emb_item, iterations=800, learning_rate=1)


iteration 50 :
train mse: 6015.186376456335

iteration 100 :
train mse: 5687.048604988224

iteration 150 :
train mse: 5489.0217810815975

iteration 200 :
train mse: 5305.234511131506

iteration 250 :
train mse: 5132.559546858968

iteration 300 :
train mse: 4971.033879431151


KeyboardInterrupt: 

In [None]:
def encode_new_data(valid_df, user_ids, item_ids):
    """ Encodes valid_df with the same encoding as train_df.
    """
    df_val_chosen = valid_df['item_id'].isin(item_ids.keys()) & valid_df['uid'].isin(user_ids.keys())
    valid_df = valid_df[df_val_chosen]
    valid_df['item_id'] =  np.array([item_ids[x] for x in valid_df['item_id']])
    valid_df['uid'] = np.array([user_ids[x] for x in valid_df['uid']])
    return valid_df

In [None]:
print("before encoding:", valid_df.shape)
valid_df = encode_new_data(valid_df, user_ids, item_ids)
print("after encoding:", valid_df.shape)

In [None]:
train_mse = cost(train_df, emb_user, emb_item)
val_mse = cost(valid_df, emb_user, emb_item)
print(train_mse, val_mse)

In [None]:
valid_df.head()

In [None]:
sub = valid_df[['uid','item_id']].copy()
sub.head()

In [None]:
sub = sub[sub.uid.isin(validation_data.uid)].astype({"uid":'object', "item_id":'object'})

In [None]:
sub.info()

In [None]:
sub = sub.astype({"uid":'object', "item_id":'object'})



In [None]:
sub.to_csv("submission_from_starter_script.csv", index = False, header = True)

In [30]:
import pandas as pd
sample_submission=pd.read_csv('sample_submission.csv')
stest=pd.read_csv('submission_from_starter_script.csv')
common = sample_submission.merge(stest, on=["uid"])
result = sample_submission[~sample_submission.uid.isin(common.uid)]
new=pd.concat([stest, result], ignore_index = True)
new.to_csv("submission_from_starter_script_modified.csv",index=False)