# Import packages

In [30]:
from math import sqrt

import numpy as np
import pandas as pd
import plotly.graph_objs as go
import plotly.offline as py
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import pairwise_distances

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer its
# replacement and fall back to the old module only on legacy installs.
try:
    from sklearn import model_selection as cv
except ImportError:
    from sklearn import cross_validation as cv
# Enable inline rendering of plotly figures in this notebook.
py.init_notebook_mode()

# Recommendation System

#### A recommender system or a recommendation system (sometimes replacing "system" with a synonym such as platform or engine) is a subclass of information filtering system that seeks to predict the "rating" or "preference" that a user would give to an item.

In [31]:
 # Load the raw ratings; the file has no header row, so column names are given here.
 # NOTE(review): the absolute path is machine-specific -- confirm it before running elsewhere.
 df = pd.read_table(
     '/home/user/Downloads/ml-100k/u.data',
     header=None,
     names=['user_id', 'item_id', 'rating', 'timestamp'],
 )

In [32]:
# Number of distinct users and distinct items; used as the two dimensions of
# the user-item rating matrices.
n_user = len(df['user_id'].unique())
t_user = len(df['item_id'].unique())  # despite the name, this counts items

## train data and test data 


In [33]:
# Hold out 25% of the ratings for evaluation. A fixed random_state makes the
# split, and every number derived from it, reproducible across kernel restarts.
train_data, test_data = cv.train_test_split(df, test_size=0.25, random_state=42)


# Collaborative filtering
##### Collaborative filtering has two senses, a narrow one and a more general one. In the newer, narrower sense, collaborative filtering is a method of making automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating).

In [34]:
def _build_rating_matrix(ratings_frame, n_rows, n_cols):
    """Return an (n_rows, n_cols) array holding the ratings in ``ratings_frame``.

    Cell ``[user_id - 1, item_id - 1]`` receives the row's ``rating``; every
    other cell stays 0. If a (user, item) pair occurs more than once the last
    occurrence wins, as it did with the original row-by-row loop.

    ``ratings_frame`` must provide ``user_id``, ``item_id`` and ``rating``
    columns. The ids are used directly as 1-based positions, so an id larger
    than the matching dimension raises ``IndexError``.
    """
    matrix = np.zeros((n_rows, n_cols))
    row_idx = ratings_frame['user_id'].values - 1
    col_idx = ratings_frame['item_id'].values - 1
    # One vectorised assignment instead of a Python-level loop over rows.
    matrix[row_idx, col_idx] = ratings_frame['rating'].values
    return matrix


train_data_matrix = _build_rating_matrix(train_data, n_user, t_user)
test_data_matrix = _build_rating_matrix(test_data, n_user, t_user)
    

##### In collaborative filtering I use a memory-based model (user-user and item-item).

#### Cosine similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. The cosine of 0° is 1, and it is less than 1 for any other angle.

#### First I calculate the pairwise distance, then subtract it from one to obtain the similarity.
## pairwise_distances
#### Computes the distance matrix from a vector array X and an optional Y.
##### Now I calculate the user-user similarity and the item-item similarity.

In [35]:
# Cosine similarity is one minus cosine distance. The matrix as built gives
# the first (user-user) result; its transpose gives the second (item-item).
user_similarity, item_similarity = (
    1 - pairwise_distances(matrix, metric='cosine')
    for matrix in (train_data_matrix, train_data_matrix.T)
)


## I wrote a function that predicts ratings for both the item-item and user-user approaches

In [36]:
def predict(ratings, similarity, type='user'):
    """Predict a rating for every cell of ``ratings`` from a similarity matrix.

    Parameters
    ----------
    ratings : numpy.ndarray
        2-D rating matrix. In the ``'user'`` mode each row is mean-centred
        before weighting.
    similarity : numpy.ndarray
        Square similarity matrix: one row per row of ``ratings`` for
        ``type='user'``, one row per column of ``ratings`` for ``type='item'``.
    type : {'user', 'item'}
        ``'user'``: row mean plus the similarity-weighted average of the
        mean-centred ratings. ``'item'``: similarity-weighted average of the
        raw ratings.

    Returns
    -------
    numpy.ndarray
        Predictions with the same shape as ``ratings``.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'user'`` nor ``'item'``.

    Notes
    -----
    The row mean is taken over the whole row, zeros included, exactly as in
    the original implementation. Where the absolute similarities used as the
    divisor sum to zero, the weighted term is set to 0 rather than dividing
    by zero.
    """
    if type not in ('user', 'item'):
        raise ValueError("type must be 'user' or 'item', got %r" % (type,))

    # Normalising weights: total absolute similarity per row of `similarity`.
    weight_totals = np.abs(similarity).sum(axis=1)

    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = ratings - mean_user_rating[:, np.newaxis]
        weighted = similarity.dot(ratings_diff)
        norm = weight_totals[:, np.newaxis]      # column vector: one weight per row
        baseline = mean_user_rating[:, np.newaxis]
    else:
        weighted = ratings.dot(similarity)
        norm = weight_totals[np.newaxis, :]      # row vector: one weight per column
        baseline = 0.0

    # Divide only where the normaliser is non-zero; the remaining cells keep
    # the 0 they were initialised with instead of becoming NaN/inf.
    scaled = np.divide(
        weighted,
        norm,
        out=np.zeros_like(weighted, dtype=float),
        where=norm != 0,
    )
    return baseline + scaled



### DataFrames of user_predict and item_predict
##### Here we get the prediction of each movie rating
###### using item-item and user-user

In [37]:
# Predicted rating for every (user, item) cell under each of the two models.
user_predict = predict(ratings=train_data_matrix, similarity=user_similarity, type='user')
item_predict = predict(ratings=train_data_matrix, similarity=item_similarity, type='item')

# Preview the first five rows of the user-based predictions.
pd.DataFrame(data=user_predict).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,2.079049,0.765519,0.531105,1.18719,0.526093,0.29629,1.753795,1.229581,1.331042,0.560196,...,0.205595,0.202104,0.202104,0.204941,0.207722,0.20245,0.20314,0.202795,0.208363,0.209014
1,1.240993,0.165565,0.170567,0.41263,0.12179,0.038135,1.109801,0.484671,0.92581,0.226213,...,-0.048342,-0.051569,-0.051569,-0.048755,-0.047954,-0.04999,-0.046832,-0.048411,-0.048279,-0.050002
2,0.912973,0.133186,0.106289,0.351015,0.088083,-0.017842,0.898859,0.389065,0.66024,0.128473,...,-0.074946,-0.07716,-0.07716,-0.072582,-0.073546,-0.073628,-0.066566,-0.070097,-0.075385,-0.075754
3,1.06202,0.148181,0.097822,0.375527,0.08895,-0.038101,0.97208,0.413254,0.674375,0.089758,...,-0.09274,-0.094649,-0.094649,-0.09185,-0.090482,-0.091626,-0.085581,-0.088604,-0.090688,-0.09225
4,1.989035,0.619213,0.316228,0.994336,0.326951,0.057173,1.608163,1.053861,1.0246,0.2929,...,-0.023123,-0.026531,-0.026531,-0.02447,-0.023673,-0.026531,-0.026531,-0.026531,-0.019238,-0.020855


In [38]:
# Preview the first five rows of the item-based predictions.
pd.DataFrame(data=item_predict).head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,0.929182,0.886841,0.856573,0.89609,0.815712,0.632329,0.889777,0.899395,0.892507,0.883191,...,0.769779,0.0,0.0,0.614146,0.950289,0.131463,0.131463,0.131463,0.784921,1.020307
1,0.188809,0.115792,0.167056,0.139239,0.146724,0.185439,0.196125,0.158694,0.212254,0.189933,...,0.274169,0.0,0.0,0.254568,0.227542,0.230985,0.230985,0.230985,0.142975,0.079877
2,0.07068,0.055003,0.077765,0.066875,0.064131,0.06942,0.086178,0.065179,0.082194,0.07726,...,0.095265,0.0,0.0,0.217517,0.111499,0.323307,0.323307,0.323307,0.035947,0.024549
3,0.072346,0.053933,0.066758,0.060571,0.060381,0.060461,0.077893,0.062255,0.072598,0.057571,...,0.08453,0.0,0.0,0.094373,0.132891,0.319902,0.319902,0.319902,0.076778,0.049045
4,0.467682,0.488063,0.406266,0.434931,0.428338,0.232945,0.437811,0.439955,0.384563,0.351658,...,0.330459,0.0,0.0,0.20859,0.189819,0.0,0.0,0.0,0.454232,0.369566


# RMSE
#### Calculate the RMSE values to check how accurate our predictions are.

In [39]:
def rmse(prediction,ground_truth):
    """Root-mean-squared error over the non-zero entries of ``ground_truth``.

    Only positions where ``ground_truth`` is non-zero are compared; the
    values of ``prediction`` at those same positions are scored against them.
    """
    rated = ground_truth.nonzero()
    predicted_values = prediction[rated].flatten()
    actual_values = ground_truth[rated].flatten()
    return sqrt(mean_squared_error(predicted_values, actual_values))


In [40]:
# RMSE against test_data_matrix: user-based model first, then item-based.
print(rmse(prediction=user_predict, ground_truth=test_data_matrix))
print(rmse(prediction=item_predict, ground_truth=test_data_matrix))

2.962053729457448
3.1663008539251694


### histogram plot

In [43]:
# Residuals (actual - predicted) of the user-based model, taken at the
# non-zero entries of test_data_matrix.
held_out = test_data_matrix.nonzero()  # computed once, reused for both arrays
prediction = user_predict[held_out]
ground_truth = test_data_matrix[held_out]
trace0 = go.Histogram(
    x=(ground_truth - prediction)
)

data = [trace0]

# A title and axis labels let the figure stand on its own when skimmed.
layout = go.Layout(
    title='User-based predictions: residuals on the test ratings',
    xaxis=dict(title='actual rating - predicted rating'),
    yaxis=dict(title='number of ratings'),
)

py.iplot(go.Figure(data=data, layout=layout))
