In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [6]:
#Reading ratings file:
train = pd.read_csv('train.csv')

#Reading Movie Info File
article_info = pd.read_csv('article_info.csv')

test= pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,user_id,article_id,rating
0,1,456,1
1,1,2934,1
2,1,82,1
3,1,1365,1
4,1,221,1


In [4]:
article_info.head()

Unnamed: 0,article_id,website,title,content
0,1025,uxmovement,Comment concevoir une procédure pas à pas que ...,par anthony le 18/07/16 à 8h02 Si une nouvelle...
1,2328,endeavor,Ressources humaines? Seulement si vous optez p...,"«Ambassadeurs», «avocats», «porte-parole» d'un..."
2,2469,linkedin,Deux motions de vente différentes. . . .,J'ai passé pas mal de temps récemment avec des...
3,2590,googleblog,Apprentissage large et profond: mieux avec Ten...,"""Apprenez les règles comme un pro, afin de pou..."
4,697,infoq,Agile: manque de compétences en tests,"Fran O'Hara, directeur et consultant principal..."


In [5]:
#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [7]:
test

Unnamed: 0,user_id,article_id
0,1,2607
1,1,1445
2,1,911
3,1,857
4,1,2062
...,...,...
7238,1087,2089
7239,1087,504
7240,1087,1801
7241,1087,967


In [8]:
#Define the baseline model to always return average of all available ratings
def baseline(user_id, movie):
    return train['rating'].mean()

In [9]:
train['rating'].mean()

1.4539477616400693

In [12]:
train = train.merge(article_info[['article_id','website']], how='left', left_on = 'article_id', right_on = 'article_id')

In [13]:
train

Unnamed: 0,user_id,article_id,rating,website
0,1,456,1,medium
1,1,2934,1,thestreet
2,1,82,1,facebook
3,1,1365,1,techcrunch
4,1,221,1,geekwire
...,...,...,...,...
16726,1087,2242,1,web-engineering
16727,1087,419,1,em
16728,1087,784,1,geeksaresexy
16729,1087,1249,1,attps


In [53]:
test = test.merge(article_info[['article_id','website']], how='left', left_on = 'article_id', right_on = 'article_id')

In [14]:
train['article_id_with_website'] = train['article_id'].map(str) + str(': ') + train['website'].map(str)

In [54]:
test['article_id_with_website'] = test['article_id'].map(str) + str(': ') + test['website'].map(str)

In [15]:
train

Unnamed: 0,user_id,article_id,rating,website,article_id_with_website
0,1,456,1,medium,456: medium
1,1,2934,1,thestreet,2934: thestreet
2,1,82,1,facebook,82: facebook
3,1,1365,1,techcrunch,1365: techcrunch
4,1,221,1,geekwire,221: geekwire
...,...,...,...,...,...
16726,1087,2242,1,web-engineering,2242: web-engineering
16727,1087,419,1,em,419: em
16728,1087,784,1,geeksaresexy,784: geeksaresexy
16729,1087,1249,1,attps,1249: attps


In [55]:
test = test.drop(['article_id', 'website'], axis = 1)

In [18]:
train

Unnamed: 0,user_id,rating,article_id_with_website
0,1,1,456: medium
1,1,1,2934: thestreet
2,1,1,82: facebook
3,1,1,1365: techcrunch
4,1,1,221: geekwire
...,...,...,...
16726,1087,1,2242: web-engineering
16727,1087,1,419: em
16728,1087,1,784: geeksaresexy
16729,1087,1,1249: attps


In [19]:
#Assign X as the original ratings dataframe
X = train.copy()

#Split into training and test datasets
X_train, X_val = train_test_split(X, test_size = 0.25, random_state=42)

In [20]:
X_val

Unnamed: 0,user_id,rating,article_id_with_website
15564,1003,1,1368: businessinsider
14110,901,1,467: caroli
6827,460,2,930: computerworld
8156,525,2,1631: cio
2214,148,1,2361: kaczmarzyk
...,...,...,...
7578,503,1,2873: googleblog
921,48,1,583: uol
13348,837,1,322: grammarly
2205,148,5,1025: uxmovement


In [21]:
#Define the baseline model to always return average of all available ratings
def baseline(user_id, article_id_with_website):
    return X_train['rating'].mean()

In [22]:
X_train['rating'].mean()

1.4631016895122728

In [27]:
#Function to compute the RMSE score obtained on the test set by a model
def rmse_score(model):
    
    #Construct a list of user-movie tuples from the test dataset
    id_pairs = zip(X_val['user_id'], X_val['article_id_with_website'])
    
    #Predict the rating for every user-movie tuple
    y_pred = np.array([model(user, article_id_with_website) for (user, article_id_with_website) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_val['rating'])
    
    #Return the final RMSE score
    
    return rmse(y_true, y_pred)

In [28]:
rmse_score(baseline)

0.9683927490470934

In [30]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='user_id', columns='article_id_with_website')

r_matrix.head()

article_id_with_website,1000: wired,1002: b9,1003: blogspot,1004: bitcoin,1005: bbc,1006: wordpress,1007: portalnovidade,1008: ieee,1009: darpa,100: technologyreview,...,990: linkedin,991: startupi,992: newmediadenver,994: blog,995: businessinsider,996: googlediscovery,997: instructables,998: linkedin,99: diolinux,9: kinsta
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,1.0,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [31]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, article_id_with_website):
    
    #Check if movie exists in r_matrix
    if article_id_with_website in r_matrix:
        
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[article_id_with_website].mean()
    
    else:
        #Default to average rating from the train set
        mean_rating = X_train['rating'].mean()
    
    return mean_rating

In [32]:
#Compute RMSE for the Mean model
rmse_score(cf_user_mean)

1.0420497615166175

In [33]:
#Compute the Pearson Correlation using the ratings matrix with corr function from Pandas
pearson_corr = r_matrix.T.corr()

In [34]:
#Convert into pandas dataframe 
pearson_corr = pd.DataFrame(pearson_corr, index=r_matrix.index, columns=r_matrix.index)

pearson_corr.head(10)

user_id,1,2,3,5,7,8,9,10,11,12,...,1078,1079,1080,1081,1082,1083,1084,1085,1086,1087
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,1.0,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,1.0,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,1.0,...,,,,,,,,,,


In [35]:
#Fill all the missing correlations with 0
pearson_cor = pearson_corr.fillna(0)

In [36]:
r_matrix.head()

article_id_with_website,1000: wired,1002: b9,1003: blogspot,1004: bitcoin,1005: bbc,1006: wordpress,1007: portalnovidade,1008: ieee,1009: darpa,100: technologyreview,...,990: linkedin,991: startupi,992: newmediadenver,994: blog,995: businessinsider,996: googlediscovery,997: instructables,998: linkedin,99: diolinux,9: kinsta
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,1.0,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [39]:
def cf_user_wmean(user_id, article_id_with_website):
    
    #Check if movie_id exists in r_matrix
    if article_id_with_website in r_matrix:
        
        #Mean rating for active user
        ra = r_matrix.loc[user_id].mean()

        #Get the similarity scores for the user in question with every other user
        sim_scores = pearson_corr[user_id].sort_values(ascending = False)
        
        # Keep similarity scores for users with positive correlation with active user
        sim_scores_pos = sim_scores[sim_scores > 0]
        
        #Get the user ratings for the movie in question
        m_ratings = r_matrix[article_id_with_website][sim_scores_pos.index]
        
        
        
        
        #Extract the indices containing NaN in the m_ratings series (Users who have not rated the target movie)
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        # If there are no ratings from similar users we cannot use this method so we predict just 
        # the average rating of the movie else we use the prediction formula
        if len(m_ratings) == 0:
            #Default to average rating in the absence of ratings by similar users
            wmean_rating = r_matrix[article_id_with_website].mean()
        else:   
            #Drop the corresponding correlation scores from the sim_scores series
            sim_scores_pos = sim_scores_pos.drop(idx)
            
            #Subtract average rating of each user from the rating (rbp - mean(rb))
            m_ratings = m_ratings - r_matrix.loc[m_ratings.index].mean(axis = 1)
            
            #Compute the final weighted mean using np.dot which is nothing but the product divided by sum of weights
            wmean_rating = ra + (np.dot(sim_scores_pos, m_ratings)/ sim_scores_pos.sum())
   
    else:
        #Default to average rating in the absence of any information on the movie in train set
        wmean_rating = X_train['rating'].mean()
    
    return wmean_rating

In [40]:
rmse_score(cf_user_wmean)

1.0931141209445436

In [49]:
test.shape

(7243, 2)

In [56]:
id_pairs = zip(test['user_id'], test['article_id_with_website'])
    
    #Predict the rating for every user-movie tuple
y_pred = np.array([cf_user_wmean(user, article_id_with_website) for (user, article_id_with_website) in id_pairs])
    

In [65]:
id_pairs.head()

AttributeError: 'zip' object has no attribute 'head'

In [57]:
y_pred

array([1.375     , 1.2       , 1.30612245, ..., 1.        , 1.42424242,
       1.53846154])

In [58]:
df = pd.DataFrame(y_pred, 
             columns=['rating'])

In [59]:
df.shape

(7243, 1)

In [60]:
result = pd.concat([test, df], axis=1)

In [61]:
result.tail()

Unnamed: 0,user_id,article_id_with_website,rating
7238,1087,2089: blogspot,1.375
7239,1087,504: canaltech,1.571429
7240,1087,1801: convergecom,1.0
7241,1087,967: cnbc,1.424242
7242,1087,857: caelum,1.538462


In [62]:
csv_data = result.to_csv()

In [63]:
result.to_csv('lokesh_articles_assignment.csv')