In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import os

**Step 1**: Read the Dataset

In [3]:
# Folder that holds the project workbook.
file_path = '~/Google Drive/Group D/Recommendation Engines/Final_Project/'

# header=1: the second sheet row supplies the column labels;
# index_col=0: the first column supplies the row labels.
pot_luck = pd.read_excel(
    os.path.join(file_path, 'pot_luck.xlsx'),
    header=1,
    index_col=0,
)
pot_luck.head()

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance,...,Unnamed: 18,Agustin.1,Benjamin.1,Alice.1,Olivia.1,Unnamed: 23,Pred1,Pred2,Pred3,Pred4
Maria,1,0,1,0,1,1,0,0,0,1,...,,15.0,,,,,,,,
Elizabeth,0,1,1,1,0,0,0,1,0,0,...,,,,40.0,,,,,,
Beatriz,0,0,0,1,1,1,0,0,0,0,...,,,,,,,,,,
Ramona,0,0,1,1,0,0,1,1,0,0,...,,,,,,,,,,
Liz,0,1,0,0,0,0,0,0,1,1,...,,,2.0,,,,,,,


**Step 2**: Keep only the first twenty rows (the data about the questions) and replace missing values with 0

In [4]:
# Keep the first twenty rows (the question data) and turn every blank cell into 0.
pot_luck = pot_luck.head(20).fillna(0)

**Dataset 1**: This is the dataset with all the questions and their topics.

In [5]:
# Topic flag columns ('Sushi' through 'Finance'), flipped so that each row is a
# topic and each column is one of the twenty question rows.
pot_luck_users = pot_luck.loc[:, 'Sushi':'Finance'].transpose()
pot_luck_users

Unnamed: 0,Maria,Elizabeth,Beatriz,Ramona,Liz,Concepcion,Carolina,Mabel,Raquel,Noemi,Liam,Noah,William,James,Logan,Benjamin,Mason,Elijah,Oliver,Jacob
Sushi,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0
Football,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0
Hiking,1,1,0,1,0,0,0,1,0,0,1,0,1,1,0,0,1,0,1,1
Salsa,0,1,1,1,0,1,0,1,0,0,0,0,1,1,1,0,1,1,0,1
Spark,1,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,1,0
Beer,1,0,1,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0
Karaoke,0,0,0,1,0,0,0,1,0,1,0,1,0,0,1,0,0,0,1,1
Books,0,1,0,1,0,0,1,0,0,0,0,0,1,0,1,0,1,0,0,0
Travel,0,0,0,0,1,0,0,0,1,0,1,0,0,1,0,1,0,1,0,1
Finance,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0


**Dataset 2**: This is the dataset with all the users and their likes and dislikes for each question

In [6]:
# Feedback columns from 'Agustin' through 'Alice'; label slices include both ends.
pot_luck_feedback = pot_luck.loc[:, slice('Agustin', 'Alice')]
pot_luck_feedback

Unnamed: 0,Agustin,Benjamin,Alice
Maria,1.0,-1.0,0.0
Elizabeth,-1.0,1.0,0.0
Beatriz,0.0,0.0,0.0
Ramona,0.0,1.0,0.0
Liz,0.0,0.0,1.0
Concepcion,1.0,0.0,0.0
Carolina,0.0,0.0,-1.0
Mabel,0.0,0.0,1.0
Raquel,0.0,0.0,0.0
Noemi,0.0,0.0,0.0


**Dataset 3**: This is the dataset with all the users and their upvotes and downvotes for each question

In [7]:
# Vote-count columns from 'Agustin.1' through 'Alice.1'; label slices include both ends.
pot_luck_up_down = pot_luck.loc[:, slice('Agustin.1', 'Alice.1')]
pot_luck_up_down

Unnamed: 0,Agustin.1,Benjamin.1,Alice.1
Maria,15.0,0.0,0.0
Elizabeth,0.0,0.0,40.0
Beatriz,0.0,0.0,0.0
Ramona,0.0,0.0,0.0
Liz,0.0,2.0,0.0
Concepcion,25.0,0.0,0.0
Carolina,0.0,0.0,0.0
Mabel,0.0,-4.0,0.0
Raquel,0.0,0.0,0.0
Noemi,0.0,0.0,0.0


**Question 1**: The first part involves calculating the basic weight of each topic for each user. To do this, a simple matrix multiplication is carried out between the questions dataset (topics x questions) and the feedback dataset (questions x users). The result is labelled with the topic names as rows and the user names as columns for readability.

In [8]:
# (topics x questions) . (questions x users) -> topics x users.
# DataFrame.dot carries the topic row labels and user column labels through,
# so no relabelling is needed afterwards.
basic_profile = pot_luck_users.dot(pot_luck_feedback)
basic_profile

Unnamed: 0,Agustin,Benjamin,Alice
Sushi,3.0,-2.0,-2.0
Football,-2.0,2.0,1.0
Hiking,-1.0,2.0,1.0
Salsa,0.0,3.0,0.0
Spark,0.0,-1.0,0.0
Beer,2.0,-2.0,-3.0
Karaoke,-1.0,0.0,-1.0
Books,-1.0,3.0,-2.0
Travel,1.0,0.0,0.0
Finance,0.0,-1.0,1.0


**Predictions 1**: To predict the questions using the basic profile, we now need to calculate the cosine similarity between the questions dataset and the basic profile dataset. To do so, the columns in both datasets must be the same (in this case the topics), so we transpose both datasets from above.

In [9]:
# Build the users x topics profile straight from its inputs rather than
# re-transposing `basic_profile` in place. The in-place transpose flipped the
# orientation back on every second run of this cell, which broke the cosine
# similarity cell that expects topics as columns.
basic_profile = pot_luck_users.dot(pot_luck_feedback).T
# Questions x topics: the same column layout as the profile above.
questions_transposed = pot_luck_users.T

In [10]:
# Show the basic profile weights: one row per user, one column per topic
basic_profile

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance
Agustin,3.0,-2.0,-1.0,0.0,0.0,2.0,-1.0,-1.0,1.0,0.0
Benjamin,-2.0,2.0,2.0,3.0,-1.0,-2.0,0.0,3.0,0.0,-1.0
Alice,-2.0,1.0,1.0,0.0,0.0,-3.0,-1.0,-2.0,0.0,1.0


In [11]:
# Show the questions dataset: one row per question, one column per topic
questions_transposed

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance
Maria,1,0,1,0,1,1,0,0,0,1
Elizabeth,0,1,1,1,0,0,0,1,0,0
Beatriz,0,0,0,1,1,1,0,0,0,0
Ramona,0,0,1,1,0,0,1,1,0,0
Liz,0,1,0,0,0,0,0,0,1,1
Concepcion,1,0,0,1,0,0,0,0,0,0
Carolina,0,0,0,0,0,0,0,1,0,1
Mabel,0,0,1,1,0,0,1,0,0,1
Raquel,0,0,0,0,0,1,0,0,1,0
Noemi,0,1,0,0,1,0,1,0,0,0


**Based on the Basic Profile, the predictions for each user are as follows**

In [12]:
# Cosine similarity of every question (rows) against every user's basic
# profile (columns); labels are attached when the frame is built.
predictions_basic = pd.DataFrame(
    cosine_similarity(questions_transposed, basic_profile),
    index=pot_luck.index,
    columns=basic_profile.index,
)
predictions_basic

Unnamed: 0,Agustin,Benjamin,Alice
Maria,0.39036,-0.2981424,-0.29277
Elizabeth,-0.4364358,0.8333333,0.0
Beatriz,0.2519763,1.387779e-17,-0.377964
Ramona,-0.3273268,0.6666667,-0.218218
Liz,-0.1259882,0.09622504,0.251976
Concepcion,0.46291,0.1178511,-0.308607
Carolina,-0.1543033,0.2357023,-0.154303
Mabel,-0.2182179,0.3333333,0.109109
Raquel,0.46291,-0.2357023,-0.46291
Noemi,-0.3779645,0.09622504,0.0


**Question 2**: The second part involves calculating the unit weights for each topic and question. This is achieved by dividing each element (topic x question) by the total number of topics flagged for that question (the row total in the original sheet). A simple matrix multiplication is then carried out between the unit_weight and the feedback dataset. The column and row names are then assigned for readability.

In [13]:
# calculation of unit weights

# Each question column is divided by the number of topics flagged for that question.
unit_weight = pot_luck_users.div(pot_luck_users.sum(axis='index'), axis='columns')
unit_weight

Unnamed: 0,Maria,Elizabeth,Beatriz,Ramona,Liz,Concepcion,Carolina,Mabel,Raquel,Noemi,Liam,Noah,William,James,Logan,Benjamin,Mason,Elijah,Oliver,Jacob
Sushi,0.2,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0
Football,0.0,0.25,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.25,0.0,0.0,0.25,0.0,0.2,0.0
Hiking,0.2,0.25,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.333333,0.0,0.25,0.25,0.0,0.0,0.25,0.0,0.2,0.25
Salsa,0.0,0.25,0.333333,0.25,0.0,0.5,0.0,0.25,0.0,0.0,0.0,0.0,0.25,0.25,0.25,0.0,0.25,0.5,0.0,0.25
Spark,0.2,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.333333,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.2,0.0
Beer,0.2,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.333333,0.0,0.0,0.25,0.333333,0.0,0.0,0.0,0.0
Karaoke,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.25,0.0,0.333333,0.0,0.333333,0.0,0.0,0.25,0.0,0.0,0.0,0.2,0.25
Books,0.0,0.25,0.0,0.25,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.25,0.0,0.25,0.0,0.0,0.0
Travel,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.5,0.0,0.333333,0.0,0.0,0.25,0.0,0.333333,0.0,0.5,0.0,0.25
Finance,0.2,0.0,0.0,0.0,0.333333,0.0,0.5,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0


In [14]:
# unit weight matrix

# (topics x questions, normalised) . (questions x users) -> topics x users.
# DataFrame.dot keeps the topic row labels and the user column labels.
unit_weight_calc = unit_weight.dot(pot_luck_feedback)
unit_weight_calc


Unnamed: 0,Agustin,Benjamin,Alice
Sushi,1.033333,-0.533333,-0.666667
Football,-0.45,0.5,0.333333
Hiking,-0.25,0.55,0.25
Salsa,0.25,0.75,0.0
Spark,0.0,-0.2,0.0
Beer,0.533333,-0.533333,-0.916667
Karaoke,-0.2,-0.083333,-0.333333
Books,-0.25,0.75,-0.75
Travel,0.333333,0.0,0.0
Finance,0.0,-0.2,0.083333


**Predictions 2**: To predict the questions using the unit weights, we now need to calculate the cosine similarity between the questions dataset and the unit weights dataset. To do so, the columns in both datasets must be the same (in this case the topics), so we transpose our answers from above.

In [15]:
# Users x topics view of the unit-weight profile

unit_weight_trans = unit_weight_calc.transpose()
unit_weight_trans

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance
Agustin,1.033333,-0.45,-0.25,0.25,0.0,0.533333,-0.2,-0.25,0.333333,0.0
Benjamin,-0.533333,0.5,0.55,0.75,-0.2,-0.533333,-0.083333,0.75,0.0,-0.2
Alice,-0.666667,0.333333,0.25,0.0,0.0,-0.916667,-0.333333,-0.75,0.0,0.083333


In [16]:
# Show the questions dataset again: one row per question, one column per topic
questions_transposed

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance
Maria,1,0,1,0,1,1,0,0,0,1
Elizabeth,0,1,1,1,0,0,0,1,0,0
Beatriz,0,0,0,1,1,1,0,0,0,0
Ramona,0,0,1,1,0,0,1,1,0,0
Liz,0,1,0,0,0,0,0,0,1,1
Concepcion,1,0,0,1,0,0,0,0,0,0
Carolina,0,0,0,0,0,0,0,1,0,1
Mabel,0,0,1,1,0,0,1,0,0,1
Raquel,0,0,0,0,0,1,0,0,1,0
Noemi,0,1,0,0,1,0,1,0,0,0


**Based on the unit weight, the predictions for each user are as follows**

In [17]:
# Cosine similarity of every question (rows) against every user's
# unit-weight profile (columns); labels are attached when the frame is built.
predictions_unit_weight = pd.DataFrame(
    cosine_similarity(questions_transposed, unit_weight_trans),
    index=pot_luck.index,
    columns=unit_weight_trans.index,
)
predictions_unit_weight

Unnamed: 0,Agustin,Benjamin,Alice
Maria,0.427934,-0.268373,-0.3822354
Elizabeth,-0.254363,0.834683,-0.05698029
Beatriz,0.328679,0.006299,-0.3618734
Ramona,-0.163519,0.643743,-0.2849014
Liz,-0.048952,0.113389,0.1644879
Concepcion,0.659494,0.100297,-0.3223292
Carolina,-0.128473,0.254601,-0.3223292
Mabel,-0.072675,0.332782,1.0408340000000001e-17
Raquel,0.445373,-0.246885,-0.4432026
Noemi,-0.272734,0.081892,0.0


**Question 3**: The third part involves calculating the IDF for each topic. This is achieved by dividing the total number of questions (documents) by the number of questions in which the topic appears, and taking the base-10 logarithm of that ratio. Each element of the unit weights dataset (topic x user) is then multiplied by the IDF of its topic.

In [18]:
# Per-topic inverse document frequency:
# log10(number of question columns / number of questions flagged with the topic).
IDF = np.log10(
    pot_luck_users.count(axis='columns').div(pot_luck_users.sum(axis='columns'))
)
IDF

Sushi       0.698970
Football    0.522879
Hiking      0.301030
Salsa       0.259637
Spark       0.522879
Beer        0.522879
Karaoke     0.455932
Books       0.522879
Travel      0.455932
Finance     0.602060
dtype: float64

In [19]:
# Scale each topic row of the unit-weight profile by that topic's IDF
# (IDF is aligned on the topic index).
idf_weights = unit_weight_calc.mul(IDF, axis='index')
idf_weights

Unnamed: 0,Agustin,Benjamin,Alice
Sushi,0.722269,-0.372784,-0.46598
Football,-0.235295,0.261439,0.174293
Hiking,-0.075257,0.165566,0.075257
Salsa,0.064909,0.194728,0.0
Spark,0.0,-0.104576,0.0
Beer,0.278869,-0.278869,-0.479306
Karaoke,-0.091186,-0.037994,-0.151977
Books,-0.13072,0.392159,-0.392159
Travel,0.151977,0.0,0.0
Finance,0.0,-0.120412,0.050172


**Predictions 3**: To predict the questions using the IDF, we now need to calculate the cosine similarity between the questions dataset and the IDF dataset. To do so, the columns in both datasets must be the same (in this case the topics), so we transpose our answers from above.

In [20]:
# Users x topics view of the IDF-weighted profile.
idf_trans = idf_weights.transpose()

In [21]:
# Show the transposed IDF-weighted profile: one row per user, one column per topic
idf_trans

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance
Agustin,0.722269,-0.235295,-0.075257,0.064909,0.0,0.278869,-0.091186,-0.13072,0.151977,0.0
Benjamin,-0.372784,0.261439,0.165566,0.194728,-0.104576,-0.278869,-0.037994,0.392159,0.0,-0.120412
Alice,-0.46598,0.174293,0.075257,0.0,0.0,-0.479306,-0.151977,-0.392159,0.0,0.050172


In [22]:
# Show the questions dataset again: one row per question, one column per topic

questions_transposed

Unnamed: 0,Sushi,Football,Hiking,Salsa,Spark,Beer,Karaoke,Books,Travel,Finance
Maria,1,0,1,0,1,1,0,0,0,1
Elizabeth,0,1,1,1,0,0,0,1,0,0
Beatriz,0,0,0,1,1,1,0,0,0,0
Ramona,0,0,1,1,0,0,1,1,0,0
Liz,0,1,0,0,0,0,0,0,1,1
Concepcion,1,0,0,1,0,0,0,0,0,0
Carolina,0,0,0,0,0,0,0,1,0,1
Mabel,0,0,1,1,0,0,1,0,0,1
Raquel,0,0,0,0,0,1,0,0,1,0
Noemi,0,1,0,0,1,0,1,0,0,0


**Based on the IDF weights, the predictions for each user are as follows**

In [23]:
# Cosine similarity of every question (rows) against every user's
# IDF-weighted profile (columns); labels are attached when the frame is built.
predictions_idf = pd.DataFrame(
    cosine_similarity(questions_transposed, idf_trans),
    index=pot_luck.index,
    columns=idf_trans.index,
)
predictions_idf

Unnamed: 0,Agustin,Benjamin,Alice
Maria,0.490309,-0.436363,-0.450526
Elizabeth,-0.222832,0.695633,-0.087616
Beatriz,0.235027,-0.149509,-0.340032
Ramona,-0.13751,0.490191,-0.28807
Liz,-0.056961,0.111728,0.159241
Concepcion,0.659111,-0.172767,-0.404874
Carolina,-0.109453,0.263674,-0.297141
Mabel,-0.060115,0.138516,-0.016311
Raquel,0.360751,-0.270584,-0.416452
Noemi,-0.223202,0.094173,0.015831


In [24]:
# Top 5 user Agustin

# Only questions where Agustin's feedback is 0 (blank cells were filled with 0),
# ranked by IDF-based score, highest first.
(
    predictions_idf
    .loc[pot_luck_feedback.loc[pot_luck_feedback['Agustin'] == 0].index, 'Agustin']
    .sort_values(ascending=False)
    .head(5)
)

Noah       0.622096
Raquel     0.360751
Beatriz    0.235027
Elijah     0.181601
Logan      0.072156
Name: Agustin, dtype: float64

In [25]:
# Top 5 Benjamin

# Only questions where Benjamin's feedback is 0 (blank cells were filled with 0),
# ranked by IDF-based score, highest first.
(
    predictions_idf
    .loc[pot_luck_feedback.loc[pot_luck_feedback['Benjamin'] == 0].index, 'Benjamin']
    .sort_values(ascending=False)
    .head(5)
)

William     0.444510
James       0.426572
Carolina    0.263674
Jacob       0.221130
Elijah      0.188943
Name: Benjamin, dtype: float64

In [26]:
# Top 5 Alice

# Only questions where Alice's feedback is 0 (blank cells were filled with 0),
# ranked by IDF-based score, highest first.
(
    predictions_idf
    .loc[pot_luck_feedback.loc[pot_luck_feedback['Alice'] == 0].index, 'Alice']
    .sort_values(ascending=False)
    .head(5)
)

James     0.153319
Oliver    0.081188
Liam      0.053390
Noemi     0.015831
Elijah    0.000000
Name: Alice, dtype: float64