# Objective 
## 1. Build User-User collab Filtering
## 2. build item-item collab filtering
## 3. Add functunality to choose data-set size and perform collaborative filtering on that dataset
## 4. Build matrix Ordered according to preference of user 

In [1]:
#Import Libraries
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import pairwise_distances 
import math
from numba import jit
from datetime import datetime
import time 

In [2]:
#Used at the very end for getting Joke recommendation ordering for a user
@jit
def prepare_data(matrix_data):
    m = matrix_data.shape[0]  # Get number of Jokes
    n = matrix_data.shape[1]  # Get number of Users
    jokeids = np.reshape(matrix_data[:,0],[m,1])  # Get joke id in column format
    ones = np.ones((m,1))    #Initialize Ones ( Will be used later)
    data = np.ones((1,3))  # Initialize a temporary matrix
    for i in range(1,n):  # Loop through users
        user = i * ones  # Get user column
        ratings = np.reshape(matrix_data[:,i] , [m,1])  #Get ratings for the user
        temp = np.concatenate((user, jokeids,ratings), axis =1)   # Concatenate User, Jokeid and Ratings
        data = np.concatenate((data,temp))  # Append to data matrix
    data = np.delete(data,0,0)   # Remove first row of ones
   
    return data

### Part 1 
- Load Data 
- Choose data set size
- Remove NaN values
- Split data into train/test

In [3]:
#Load data-
df1 = np.load('fulldata_nn.npy')

In [4]:
#Remove NAN valued rows
df1 =  df1[~np.isnan(df1).any(axis=1)] 

In [5]:
#Select Dataset size
m = df1.shape[0]
dataset_size = 0.5
indx = math.floor(dataset_size*m)
df1 = df1[0:indx,:]

In [6]:
#Shuffle data-set
np.random.shuffle(df1)

In [7]:
#Extract User, JokeID as X and Ratings as Y
X = df1[:,0:-1]
Y = np.reshape(df1[:,-1] , [-1,1])
X_train,X_test, Y_train,Y_test = train_test_split(X,Y, test_size = 0.1)
Y_train = np.reshape(Y_train, [-1,1])
Y_test = np.reshape(Y_test,[-1,1])

In [8]:
#Convert to a matrix form 
@jit
def convert_to_matrix_1(dataset):
    #Find number of user and jokes
    n_users_base = np.unique(dataset[:,0]).shape[0]
    n_jokes_base = np.unique(dataset[:,1]).shape[0]
    #Initialize New matrix
    train_matrix = np.zeros((n_users_base, n_jokes_base))
    print(train_matrix.shape)
    for i in range(dataset.shape[0]):
        user = (dataset[i,0]-1).astype(int)
        joke = (dataset[i,1]-1).astype(int)
        rating = dataset[i,2]
        train_matrix[user,joke]=rating
    return train_matrix
    

In [9]:
#initialize
trainset = np.concatenate((X_train,Y_train), axis =1)
testset = np.concatenate((X_test,Y_test),axis =1 )


In [10]:
#Convert train set to matrix
train_matrix = convert_to_matrix_1(trainset)

(28556, 100)


In [11]:
#Get sparisity
zero_rating = (trainset[:,2]==0).sum()
sparsity = (train_matrix==0).sum()-zero_rating
print("Sparsity = ",sparsity)

Sparsity =  994306


In [12]:
tot_values = train_matrix.shape[0]*train_matrix.shape[1]
print("Total True value = ",tot_values-sparsity)

Total True value =  1861294


In [13]:
#Create copy
data = np.copy(train_matrix)

In [14]:
del df1,m, X,Y, train_matrix

### Run User-User Collaborative filtering
- Distance metric = Pearson Correlation( Cosine is used, but predictor centers the values by subtracting mean, hence pearson correlation)
- n_similar is set to 300, this metric can be changed

In [15]:
#Find user-user similarity 
user_similarity = pairwise_distances(data, metric='cosine')

In [16]:
#Predict user - user using top30
@jit
def predict_user_user(train_matrix, user_similarity, n_similar=30):
    similar_n = user_similarity.argsort()[:,-n_similar:][:,::-1]
    n_users_base = train_matrix.shape[0]
    n_items_base = train_matrix.shape[1]
    pred = np.zeros((n_users_base,n_items_base))
    
    for i,users in enumerate(similar_n):
        similar_users_indexes = users
        similarity_n = user_similarity[i,similar_users_indexes]
        matrix_n = train_matrix[similar_users_indexes,:]
            
        rated_items = similarity_n[:,np.newaxis].T.dot(matrix_n - matrix_n.mean(axis=1)[:,np.newaxis])/ similarity_n.sum()
        pred[i,:]  = rated_items
    return pred

In [17]:
#Use top 50 and then add average rating, which was used to normalize in predict_user_user
n_similar = 300
startTime = time.time()
predictions = predict_user_user(data,user_similarity, n_similar) + data.mean(axis=1)[:, np.newaxis]
print("Execution time is ", time.time()-startTime)

Execution time is  79.18859624862671


In [18]:
#This saves all predictions into a numpy matrix, each row is a user, and every column is the rating for that JokeID
np.save('user-user-predictions', predictions)

In [19]:
def get_highest_lowest_joke(predictions):
    m = predictions.shape[0]
    n = predictions.shape[1]
    matrix_joke = np.ones((1,n))
    for i in range(m):
        desc = np.reshape(np.flip(np.argsort(predictions[i,:])) , [1,-1])
        matrix_joke = np.concatenate((matrix_joke,desc))
    return matrix_joke


In [20]:
#Get test predictions
#Load jokeTextDataset and display top n jokes for the user
joke_text = pd.read_csv('JokeText.csv')
def get_test_preds(preds, testval):
    m = testval.shape[0]
    test_preds = np.empty((m,1))
    for i in range(m):
        user = int(testval[i,0]-1)
        jokeid = int(testval[i,1]-1)
        rating = preds[user,jokeid]
        test_preds[i,0] = rating
    return test_preds
        

In [21]:
# n_jokes has to be less than 100
def display_top_n(Userid,user_joke_matrix,joke_text,n_jokes):
    user_pref = user_joke_matrix[Userid-1,:]
    jokes = user_pref[0:n_jokes]
    jokes = jokes -1
    for i in list(joke_text.iloc[jokes,:].JokeText):
        print(i)
        print("-------------------------------------")


In [22]:
#Get test preds
test_preds = get_test_preds(predictions,testset)
#Find error in each predictions
e = np.abs(np.subtract(test_preds, np.reshape(testset[:,2], [-1,1])))
#Find average error
tot = e.shape[0] * e.shape[1]
print((e.sum().sum())/tot)

4.206234887321716


In [23]:
#Arrange joke from highest desirable to lowest desirable/user
rated_joke_user = get_highest_lowest_joke(predictions)

In [24]:
#Save user-user matrix with highest desirable to lowest desirable
np.save("user-user-cf-sorted", rated_joke_user)

In [32]:
#Display top n jokes
UserID = 10
n_jokes = 10
display_top_n(UserID,rated_joke_user,joke_text,n_jokes)

There once was a man and a woman that both  got in  a terrible car wreck. Both of their vehicles  
were completely destroyed, buy fortunately, no one  was   hurt.  In thankfulness, the woman said to the 
man, 'We are both okay, so we should celebrate. I have   a  bottle of wine in my car, let's open it.'         
So the woman got the bottleout of the car, and  handed it to the man. The man took a really big drink, 
and handed the woman the bottle. The  woman  closed the bottle and put it down. The man  asked,  
'Aren't you going to take a drink?' 

The woman cleverly replied, 
'No, I think I'll  just  wait for the cops to get here.'

-------------------------------------
Q: What's the difference between the government  and  the Mafia?

A: One of them is organized.

-------------------------------------
On the first day of college, the Dean addressed the students,
pointing out some of the rules:

"The female dormitory will be out-of-bounds for all male students
and the male dormitory to

### Run item-item collaborative filtering
- Distance metric = Pearson Correlation( Cosine is used, but predictor centers the values by subtracting mean, hence pearson correlation)
- n_similar is set to 100, this metric can be changed


In [26]:
#Item similarity
item_similarity = pairwise_distances(data.T, metric = 'cosine')
item_similarity.shape

(100, 100)

In [27]:
#Define function to prediction using item-item
@jit
def predict_item_item(train_matrix, item_similarity, n_similar=30):
    n_users_base = train_matrix.shape[0]
    n_items_base = train_matrix.shape[1]
    similar_n = item_similarity.argsort()[:,-n_similar:][:,::-1]
    print('similar_n shape: ', similar_n.shape)
    pred = np.zeros((n_users_base,n_items_base))
    for i,items in enumerate(similar_n):
        similar_items_indexes = items
        similarity_n = item_similarity[i,similar_items_indexes]
        matrix_n = train_matrix[:,similar_items_indexes]
        rated_items = matrix_n.dot(similarity_n)/similarity_n.sum()
        pred[:,i]  = rated_items
    return pred


In [28]:
#Find predictions
n_similar = 100
predictions = predict_item_item(data,item_similarity,n_similar)
print('predictions shape ',predictions.shape)
np.save('item-item-predictions', predictions)

similar_n shape:  (100, 100)
predictions shape  (28556, 100)


In [29]:
#Get test preds
test_preds = get_test_preds(predictions,testset)
#Find error in each predictions
e = np.abs(np.subtract(test_preds, np.reshape(testset[:,2], [-1,1])))
#Find average error
tot = e.shape[0] * e.shape[1]
print((e.sum().sum())/tot)

3.970462587174132


In [30]:
#Arrange joke from highest desirable to lowest desirable/user
rated_joke_user = get_highest_lowest_joke(predictions)
#Save user-user matrix with highest desirable to lowest desirable
np.save("item-item-cf-sorted", rated_joke_user)

In [33]:
#Display top n jokes
UserID = 10
n_jokes = 5
display_top_n(UserID,rated_joke_user,joke_text,n_jokes)

There once was a man and a woman that both  got in  a terrible car wreck. Both of their vehicles  
were completely destroyed, buy fortunately, no one  was   hurt.  In thankfulness, the woman said to the 
man, 'We are both okay, so we should celebrate. I have   a  bottle of wine in my car, let's open it.'         
So the woman got the bottleout of the car, and  handed it to the man. The man took a really big drink, 
and handed the woman the bottle. The  woman  closed the bottle and put it down. The man  asked,  
'Aren't you going to take a drink?' 

The woman cleverly replied, 
'No, I think I'll  just  wait for the cops to get here.'

-------------------------------------
Q: What's the difference between the government  and  the Mafia?

A: One of them is organized.

-------------------------------------
On the first day of college, the Dean addressed the students,
pointing out some of the rules:

"The female dormitory will be out-of-bounds for all male students
and the male dormitory to