
<h2><font color='#3498db'>Daily Activity Recommender System</font></h2>

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np




# Collaborative Filtering

The most prominent approach to generate recommendations:
- used by large, commercial e-commerce sites
- well-understood, various algorithms and variations exist
- applicable in many domains (book, movies, DVDs, ..)

Approach
- use the "wisdom of the crowd" to recommend items
 
 Basic assumption and idea
- Users give ratings to catalog items (implicitly or explicitly)
- Customers who had similar tastes in the past, will have similar tastes in the
future

## User-based nearest-neighbor collaborative filtering (1)

The basic technique:
- Given an "active user" (Alice) and an item I not yet seen by Alice
- The goal is to estimate Alice's rating for this item, e.g., by 
   -  find a set of users (peers) who liked the same items as Alice in the past and
who have rated item I
   - use, e.g. the average of their ratings to predict, if Alice will like item I
   - do this for all items Alice has not seen and recommend the best-rated

(on this context: Identify the active user and the target activity: The active user is the person for whom we want to make recommendations, and the target activity is an activity that the active user has not yet rated.

Find the peers: The peers are users who have similar preferences to the active user. This similarity can be determined using various measures, such as Pearson correlation or cosine similarity. The peers are typically those who have rated many of the same activities as the active user and whose ratings strongly correlate with the active user’s ratings.

Estimate the active user’s rating for the target activity: This is typically done by taking a weighted average of the peers’ ratings for the target activity. The weights are usually the similarity scores between the active user and each peer. If a peer has a higher similarity score with the active user, their rating for the target activity will have a greater influence on the estimated rating.

Recommend activities: Repeat step 3 for all activities that the active user has not yet rated. Then, recommend the activities with the highest estimated ratings.

In this way, this algorithm leverages the “wisdom of the crowd” to make personalized recommendations for each user. It assumes that if two users agree on one issue (an activity), they are likely to agree on others as well.)

In [None]:
import pandas as pd
from scipy.spatial.distance import correlation
from sklearn.metrics.pairwise import pairwise_distances

df = pd.read_csv('dataset_activities.csv')
df = df[['Name', 'Activity']]

# Create a user-item matrix
user_item_matrix = df.pivot_table(index='Name', columns='Activity', aggfunc=len, fill_value=0)

# Calculate the similarity between users
user_similarity = 1 - pairwise_distances(user_item_matrix.values, metric='correlation')
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

def recommend_activities(user, n_recommendations):
    # Get the users sorted by similarity to the input user
    similar_users = user_similarity_df[user].sort_values(ascending=False)
    
    # Get the activities rated by the input user
    user_activities = set(df[df['Name'] == user]['Activity'])
    
    recommendations = {}
    
    for similar_user in similar_users.index:
        if similar_user == user: continue  # Skip the input user
        
        # Get the activities rated by the similar user
        similar_user_activities = set(df[df['Name'] == similar_user]['Activity'])
        
        # Get the activities not yet seen by the input user
        new_activities = similar_user_activities - user_activities
        
        for activity in new_activities:
            if activity not in recommendations:
                recommendations[activity] = similar_users[similar_user]
            else:
                recommendations[activity] += similar_users[similar_user]
    
    # Sort the recommendations by total similarity of users who rated each activity
    sorted_recommendations = sorted(recommendations.items(), key=lambda x: x[1], reverse=True)
    
    return sorted_recommendations[:n_recommendations]




### Evaluations with test

In [None]:
print('Alice, who likes outdoor_activities: ', recommend_activities('Alice', 5))
print('John, who likes physical_activities: ',recommend_activities('John', 5))
print('Jane, who likes artistic_activities: ',recommend_activities('Jane', 5))
print('Doe, who likes night_activities: ',recommend_activities('Doe', 5))
print('Bob, who likes water_activities: ',recommend_activities('Bob', 5))
print('Charlie, who likes sports_activities: ',recommend_activities('Charlie', 5))
print('Eve, who likes home_activities: ',recommend_activities('Eve', 5))
print('Mallory, who likes extreme_activities: ',recommend_activities('Mallory', 5))


Alice, who likes outdoor_activities:  [('Surfing', 1.7455437562764877), ('Rafting', 1.7455437562764877), ('Scuba Diving ', 1.7455437562764877), ('Massage ', 1.4092942817723075), ('Facial Treatment', 1.4092942817723075)]
John, who likes physical_activities:  [('Kite Flying', 1.9384828877177775), ('Skiing ', 1.9384828877177775), ('Kayaking', 1.78122851618871), ('Surfing', 1.750492342298556), ('Scuba Diving ', 1.750492342298556)]
Jane, who likes artistic_activities:  [('Beach Volleyball', 1.8857701003741063), ('Kite Flying', 1.6933393949814877), ('Skiing ', 1.6933393949814877), ('Kayaking', 1.608708024180279), ('Surfing', 1.5156538744049295)]
Doe, who likes night_activities:  [('Meditation Retreat ', 2.587913384404243), ('Beach Volleyball', 2.5160807726041), ('Kite Flying', 2.2882538064327904), ('Skiing ', 2.2882538064327904), ('Kayaking', 2.171845692026657)]
Bob, who likes water_activities:  [('Skiing ', 1.7306406263562555), ('Kite Flying', 1.7306406263562555), ('Beach Volleyball', 1.673

The numbers next to each activity (e.g., 1.7455437562764877 for ‘Scuba Diving’) are the scores for each recommended activity. These scores are calculated as the sum of similarities between the user and all other users who have liked each activity. The higher the score, the stronger the recommendation.

No knowledgeengineering effort, serendipity of results, learns market segments

# Item-based collaborative filtering

Basic idea:
- Use the similarity between items (and not users) to make predictions

Example:
- Look for items that are similar to Item5
- Take Alice's ratings for these items to predict the rating for Item5

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

df = pd.read_csv('dataset_activities.csv')

activity_matrix = pd.get_dummies(df['Activity'])
user_activity_matrix = pd.concat([df['Name'], activity_matrix], axis=1)


sparse_matrix = sparse.csr_matrix(user_activity_matrix.drop('Name', axis=1).values)

# Calculate cosine similarity
item_similarity = cosine_similarity(sparse_matrix.T)

# Wrap the similarities in a DataFrame
item_similarity_df = pd.DataFrame(item_similarity, index=user_activity_matrix.columns[1:], columns=user_activity_matrix.columns[1:])

def recommend_activities1(user, top_n=5):
    # Get the activities done by the user
    user_activities = df[df['Name'] == user]['Activity']

    # Calculate a similarity score for each activity
    similarity_score = item_similarity_df[user_activities].mean(axis=1)

    # Get the top n activities
    top_activities = similarity_score.nlargest(top_n)

    return top_activities.index.tolist()


In [None]:
print('Alice, who likes outdoor_activities: ', recommend_activities1('Alice', 5))
print('John, who likes physical_activities: ',recommend_activities1('John', 5))
print('Jane, who likes artistic_activities: ',recommend_activities1('Jane', 5))
print('Doe, who likes night_activities: ',recommend_activities1('Doe', 5))
print('Bob, who likes water_activities: ',recommend_activities1('Bob', 5))
print('Charlie, who likes sports_activities: ',recommend_activities1('Charlie', 5))
print('Eve, who likes home_activities: ',recommend_activities1('Eve', 5))
print('Mallory, who likes extreme_activities: ',recommend_activities1('Mallory', 5))


Alice, who likes outdoor_activities:  ['Stargazing', 'Hiking ', 'Walking the dog', 'Movie Watching', 'Potluck Dinner']
John, who likes physical_activities:  ['Walking the dog', 'Movie Watching', 'Potluck Dinner', 'Stargazing', 'Hiking ']
Jane, who likes artistic_activities:  ['Movie Watching', 'Potluck Dinner', 'Stargazing', 'Hiking ', 'Walking the dog']
Doe, who likes night_activities:  ['Movie Watching', 'Walking the dog', 'Potluck Dinner', 'Hiking ', 'Stargazing']
Bob, who likes water_activities:  ['Stargazing', 'Hiking ', 'Walking the dog', 'Movie Watching', 'Potluck Dinner']
Charlie, who likes sports_activities:  ['Hiking ', 'Walking the dog', 'Stargazing', 'Potluck Dinner', 'Movie Watching']
Eve, who likes home_activities:  ['Potluck Dinner', 'Hiking ', 'Movie Watching', 'Stargazing', 'Walking the dog']
Mallory, who likes extreme_activities:  ['Movie Watching', 'Scuba Diving ', 'Walking the dog', 'Hiking ', 'Stargazing']


This is not performing really well

Let's add some randomness into the recommendations, while still keeping the cosine similarity as the main factor

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

activity_matrix = pd.get_dummies(df['Activity'])
user_activity_matrix = pd.concat([df['Name'], activity_matrix], axis=1)


sparse_matrix = sparse.csr_matrix(user_activity_matrix.drop('Name', axis=1).values)

# Calculate cosine similarity
item_similarity = cosine_similarity(sparse_matrix.T)

# Wrap the similarities in a DataFrame
item_similarity_df = pd.DataFrame(item_similarity, index=user_activity_matrix.columns[1:], columns=user_activity_matrix.columns[1:])



def recommend_activities2(user, top_n=5):
    # Get the activities done by the user
    user_activities = df[df['Name'] == user]['Activity']

    # Calculate a similarity score for each activity
    similarity_score = item_similarity_df[user_activities].mean(axis=1)

    # Introduce some randomness
    random_factor = np.random.rand(len(similarity_score))
    adjusted_similarity_score = similarity_score * random_factor

    # Get the top n activities
    top_activities = adjusted_similarity_score.nlargest(top_n)

    return top_activities.index.tolist()


# Test the function
print(recommend_activities2('Alice'))

['Hiking ', 'Movie Watching', 'Walking the dog', 'Stargazing', 'Fruit Picking']


In [None]:
print('Alice, who likes outdoor_activities: ', recommend_activities2('Alice', 5))
print('John, who likes physical_activities: ',recommend_activities2('John', 5))
print('Jane, who likes artistic_activities: ',recommend_activities2('Jane', 5))
print('Doe, who likes night_activities: ',recommend_activities2('Doe', 5))
print('Bob, who likes water_activities: ',recommend_activities2('Bob', 5))
print('Charlie, who likes sports_activities: ',recommend_activities2('Charlie', 5))
print('Eve, who likes home_activities: ',recommend_activities2('Eve', 5))
print('Mallory, who likes extreme_activities: ',recommend_activities2('Mallory', 5))

Alice, who likes outdoor_activities:  ['Stargazing', 'Walking the dog', 'Movie Watching', 'Fencing ', 'Antique Shopping']
John, who likes physical_activities:  ['Hiking ', 'Walking the dog', 'Stargazing', 'Spa Day  ', 'Tennis']
Jane, who likes artistic_activities:  ['Movie Watching', 'Walking the dog', 'Hiking ', 'Magic Show Attendance', 'Potluck Dinner']
Doe, who likes night_activities:  ['Potluck Dinner', 'Movie Watching', 'Stargazing', 'Hiking ', 'Walking the dog']
Bob, who likes water_activities:  ['Potluck Dinner', 'Whale Watching', 'Stargazing', 'Canoeing', 'Swimming']
Charlie, who likes sports_activities:  ['Stargazing', 'Hiking ', 'Walking the dog', 'Potluck Dinner', 'Skiing ']
Eve, who likes home_activities:  ['Movie Watching', 'Walking the dog', 'Potluck Dinner', 'Hot Spring Bath', 'Painting ']
Mallory, who likes extreme_activities:  ['Walking the dog', 'Stargazing', 'Kite Flying', 'Ice Skating', 'Movie Watching']


Is it possible that our dataset is sparse? Let's see

In [None]:

total_possible_interactions = df['Name'].nunique() * df['Activity'].nunique()
actual_interactions = df.shape[0]
sparsity = 1 - actual_interactions / total_possible_interactions
print(f'Sparsity: {sparsity}')


Sparsity: -9.964912280701755


In [None]:
duplicates = df.duplicated(subset=['Name', 'Activity'])
print(f'Number of duplicate entries: {duplicates.sum()}')


Number of duplicate entries: 9797


The number of duplicate entries in the dataset is quite high. This means that many user-activity pairs appear more than once. This could be the reason why the sparsity was calculated as a negative value.

Instead of treating each occurrence of an activity as a separate interaction, we could aggregate them and treat each unique user-activity pair as a single interaction. (nstead of counting each time someone did something, combine all the times someone did the same thing. So, if a person practice an activity several times, instead of counting each time separately, i can count it as a single occurrence. )

In [None]:
df = pd.read_csv('dataset_activities.csv')
df_agg = df.groupby(['Name', 'Activity']).size().reset_index(name='Count')
# Assuming df_agg is your aggregated DataFrame and 'Activity' is the column you want to recommend
activity_matrix = pd.get_dummies(df_agg['Activity'])
user_activity_matrix = pd.concat([df_agg['Name'], activity_matrix], axis=1)

# Create a sparse matrix
sparse_matrix = sparse.csr_matrix(user_activity_matrix.drop('Name', axis=1).values)

# Calculate cosine similarity
item_similarity = cosine_similarity(sparse_matrix.T)

# Wrap the similarities in a DataFrame
item_similarity_df = pd.DataFrame(item_similarity, index=user_activity_matrix.columns[1:], columns=user_activity_matrix.columns[1:])

def recommend_activities3(user, top_n=5):
    # Get the activities done by the user
    user_activities = set(df_agg[df_agg['Name'] == user]['Activity'])

    # Calculate a similarity score for each activity
    similarity_score = item_similarity_df[list(user_activities)].mean(axis=1)

    # Get the top n activities
    top_activities = similarity_score.nlargest(top_n)

    return top_activities.index.tolist()







In [None]:
print('Alice, who likes outdoor_activities: ', recommend_activities3('Alice', 5))
print('John, who likes physical_activities: ',recommend_activities3('John', 5))
print('Jane, who likes artistic_activities: ',recommend_activities3('Jane', 5))
print('Doe, who likes night_activities: ',recommend_activities3('Doe', 5))
print('Bob, who likes water_activities: ',recommend_activities3('Bob', 5))
print('Charlie, who likes sports_activities: ',recommend_activities3('Charlie', 5))
print('Eve, who likes home_activities: ',recommend_activities3('Eve', 5))
print('Mallory, who likes extreme_activities: ',recommend_activities3('Mallory', 5))

Alice, who likes outdoor_activities:  ['Beach Volleyball', 'Kayaking', 'Kite Flying', 'Meditation Retreat ', 'Skiing ']
John, who likes physical_activities:  ['Beach Volleyball', 'Meditation Retreat ', 'Dancing', 'Gym Workout', 'Hair Salon Visit ']
Jane, who likes artistic_activities:  ['Meditation Retreat ', 'Art Class', 'Art Gallery Visit', 'Ballet Attendance', 'Book Club Participation']
Doe, who likes night_activities:  ['Hiking ', 'Movie Watching', 'Potluck Dinner', 'Stargazing', 'Walking the dog']
Bob, who likes water_activities:  ['Kayaking', 'Rafting', 'Scuba Diving ', 'Surfing', 'Canoeing']
Charlie, who likes sports_activities:  ['Beach Volleyball', 'Kite Flying', 'Rafting', 'Scuba Diving ', 'Skiing ']
Eve, who likes home_activities:  ['Baking ', 'Cooking ', 'Gardening ', 'Hiking ', 'Meditation ']
Mallory, who likes extreme_activities:  ['Kayaking', 'Kite Flying', 'Rafting', 'Scuba Diving ', 'Skiing ']


Wow, it was perfect

# Content-based recommendation

### Collaborative filtering does NOT require any information about the items

## Simple approach 
#### Compute the similarity of an unseen item with the user profile based on the keyword overlap (e.g. using the Dice coefficient)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
df = pd.read_csv('dataset_activities.csv')
# Combine all the relevant columns into a single 'content' column
df['content'] = df[['Likes', 'Activity', 'Weather']].apply(lambda x: ' '.join(x), axis=1)

# Create a CountVectorizer to create a matrix of token counts
count = CountVectorizer()
count_matrix = count.fit_transform(df['content'])

# Compute the Cosine Similarity matrix (which is equivalent to Dice coefficient for binary data)
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# Function that takes in a user name as input and outputs most similar users
def get_recommendations(user, cosine_sim=cosine_sim):
    # Get the index of the user that matches the name
    idx = df[df['Name'] == user].index[0]

    # Get the pairwsie similarity scores of all users with that user
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Exclude the input user itself from the similarity scores
    sim_scores = [sim_score for sim_score in sim_scores if df.iloc[sim_score[0]]['Name'] != user]

    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Print out the similarity scores
    

    # Get the scores of the 10 most similar users
    sim_scores = sim_scores[:10]

    # Get the user indices
    user_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar users
    return df['Name'].iloc[user_indices]




In [None]:
print('Alice, who likes outdoor_activities: ', get_recommendations('Alice'))
print('John, who likes physical_activities: ',get_recommendations('John'))
print('Jane, who likes artistic_activities: ',get_recommendations('Jane'))
print('Doe, who likes night_activities: ',get_recommendations('Doe'))
print('Bob, who likes water_activities: ',get_recommendations('Bob'))
print('Charlie, who likes sports_activities: ',get_recommendations('Charlie'))
print('Eve, who likes home_activities: ',get_recommendations('Eve'))
print('Mallory, who likes extreme_activities: ',get_recommendations('Mallory'))

Alice, who likes outdoor_activities:  6      Charlie
39         Doe
46         Doe
52         Eve
60        Jane
66         Eve
146    Charlie
341        Doe
356    Charlie
369        Doe
Name: Name, dtype: object
John, who likes physical_activities:  15         Doe
76        Jane
135    Mallory
141       Jane
165        Doe
265        Doe
354    Charlie
357      Alice
434        Bob
442        Doe
Name: Name, dtype: object
Jane, who likes artistic_activities:  24        John
72     Mallory
86         Doe
115        Doe
124    Mallory
136    Charlie
235       John
239        Bob
300    Charlie
577       John
Name: Name, dtype: object
Doe, who likes night_activities:  65        Jane
90     Mallory
122      Alice
450      Alice
467      Alice
475        Eve
532        Bob
560       John
684       Jane
708       John
Name: Name, dtype: object
Bob, who likes water_activities:  298     Eve
2258    Eve
2532    Eve
4142    Eve
6970    Eve
7996    Eve
8461    Eve
981     Eve
1672    Eve
1736  

In [None]:
def get_recommendations2(user, cosine_sim=cosine_sim):
    # Get the index of the user that matches the name
    idx = df[df['Name'] == user].index[0]

    # Get the pairwsie similarity scores of all users with that user
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Exclude the input user itself from the similarity scores
    sim_scores = [sim_score for sim_score in sim_scores if df.iloc[sim_score[0]]['Name'] != user]

    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar users
    sim_scores = sim_scores[:5]

    # Get the user indices
    user_indices = [i[0] for i in sim_scores]

    # Get the activities of the top 10 most similar users
    recommended_activities = df['Activity'].iloc[user_indices]

    return recommended_activities

print('Alice, who likes outdoor_activities: ', get_recommendations2('Alice'))
print('John, who likes physical_activities: ',get_recommendations2('John'))
print('Jane, who likes artistic_activities: ',get_recommendations2('Jane'))
print('Doe, who likes night_activities: ',get_recommendations2('Doe'))
print('Bob, who likes water_activities: ',get_recommendations2('Bob'))
print('Charlie, who likes sports_activities: ',get_recommendations2('Charlie'))
print('Eve, who likes home_activities: ',get_recommendations2('Eve'))
print('Mallory, who likes extreme_activities: ',get_recommendations2('Mallory'))


Alice, who likes outdoor_activities:  6     Movie Watching
39    Movie Watching
46    Movie Watching
52    Movie Watching
60    Movie Watching
Name: Activity, dtype: object
John, who likes physical_activities:  15     Hiking 
76     Hiking 
135    Hiking 
141    Hiking 
165    Hiking 
Name: Activity, dtype: object
Jane, who likes artistic_activities:  24     Movie Watching
72     Movie Watching
86     Movie Watching
115    Movie Watching
124    Movie Watching
Name: Activity, dtype: object
Doe, who likes night_activities:  65     Hiking 
90     Hiking 
122    Hiking 
450    Hiking 
467    Hiking 
Name: Activity, dtype: object
Bob, who likes water_activities:  298     Hot Spring Bath
2258    Hot Spring Bath
2532    Hot Spring Bath
4142    Hot Spring Bath
6970    Hot Spring Bath
Name: Activity, dtype: object
Charlie, who likes sports_activities:  530     Kite Flying
1120    Kite Flying
1630    Kite Flying
1706    Kite Flying
1789    Kite Flying
Name: Activity, dtype: object
Eve, who likes

## Term-Frequency - Inverse Document Frequency (TF-IDF)

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['Activity'])

# Compute the Cosine Similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_recommendations3(user, cosine_sim=cosine_sim):
    # Get the index of the user that matches the name
    idx = df[df['Name'] == user].index[0]

    # Get the pairwsie similarity scores of all users with that user
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Exclude the input user itself from the similarity scores
    sim_scores = [sim_score for sim_score in sim_scores if df.iloc[sim_score[0]]['Name'] != user]

    # Sort the users based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar users
    sim_scores = sim_scores[:5]

    # Get the user indices
    user_indices = [i[0] for i in sim_scores]

    # Get the activities of the top 10 most similar users
    recommended_activities = df['Activity'].iloc[user_indices]

    return recommended_activities


print('Alice, who likes outdoor_activities: ', get_recommendations3('Alice'))
print('John, who likes physical_activities: ',get_recommendations3('John'))
print('Jane, who likes artistic_activities: ',get_recommendations3('Jane'))
print('Doe, who likes night_activities: ',get_recommendations3('Doe'))
print('Bob, who likes water_activities: ',get_recommendations3('Bob'))
print('Charlie, who likes sports_activities: ',get_recommendations3('Charlie'))
print('Eve, who likes home_activities: ',get_recommendations3('Eve'))
print('Mallory, who likes extreme_activities: ',get_recommendations3('Mallory'))



Alice, who likes outdoor_activities:  6     Movie Watching
7     Movie Watching
19    Movie Watching
24    Movie Watching
39    Movie Watching
Name: Activity, dtype: object
John, who likes physical_activities:  5     Hiking 
15    Hiking 
65    Hiking 
71    Hiking 
76    Hiking 
Name: Activity, dtype: object
Jane, who likes artistic_activities:  0     Movie Watching
6     Movie Watching
19    Movie Watching
24    Movie Watching
39    Movie Watching
Name: Activity, dtype: object
Doe, who likes night_activities:  12     Hiking 
65     Hiking 
76     Hiking 
90     Hiking 
122    Hiking 
Name: Activity, dtype: object
Bob, who likes water_activities:  298     Hot Spring Bath
981     Hot Spring Bath
1672    Hot Spring Bath
1736    Hot Spring Bath
2004    Hot Spring Bath
Name: Activity, dtype: object
Charlie, who likes sports_activities:  92     Kite Flying
333    Kite Flying
367    Kite Flying
430    Kite Flying
451    Kite Flying
Name: Activity, dtype: object
Eve, who likes home_activitie

## Term-Frequency - Inverse Document Frequency (TF-IDF) with nearest neighbors

In [None]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['content'])

# Use NearestNeighbors from sklearn.neighbors to find nearest neighbors
nn = NearestNeighbors(n_neighbors=5, metric='euclidean')
nn.fit(tfidf_matrix)

# Function that takes in a user name as input and outputs most similar users
def get_recommendations4(user):
    # Get the index of the user that matches the name
    idx = df[df['Name'] == user].index[0]

    # Use kneighbors to get the n nearest neighbors
    distances, indices = nn.kneighbors(tfidf_matrix[idx], n_neighbors=6)

    # Exclude the input user itself from the recommendations
    indices = indices[indices != idx]

    # Return the top 5 most similar users
    return df['Activity'].iloc[indices]

print('Alice, who likes outdoor_activities: ', get_recommendations4('Alice'))
print('John, who likes physical_activities: ',get_recommendations4('John'))
print('Jane, who likes artistic_activities: ',get_recommendations4('Jane'))
print('Doe, who likes night_activities: ',get_recommendations4('Doe'))
print('Bob, who likes water_activities: ',get_recommendations4('Bob'))
print('Charlie, who likes sports_activities: ',get_recommendations4('Charlie'))
print('Eve, who likes home_activities: ',get_recommendations4('Eve'))
print('Mallory, who likes extreme_activities: ',get_recommendations4('Mallory'))


Alice, who likes outdoor_activities:  1536    Movie Watching
2151    Movie Watching
580     Movie Watching
2263    Movie Watching
513     Movie Watching
Name: Activity, dtype: object
John, who likes physical_activities:  566     Hiking 
1564    Hiking 
1295     Hiking
572     Hiking 
1224    Hiking 
Name: Activity, dtype: object
Jane, who likes artistic_activities:  1016    Movie Watching
3996    Movie Watching
4523    Movie Watching
3146    Movie Watching
4858    Movie Watching
Name: Activity, dtype: object
Doe, who likes night_activities:  149    Hiking 
215    Hiking 
306    Hiking 
501    Hiking 
178    Hiking 
Name: Activity, dtype: object
Bob, who likes water_activities:  2868    Hot Spring Bath
1523    Hot Spring Bath
2463    Hot Spring Bath
1873    Hot Spring Bath
2487    Hot Spring Bath
Name: Activity, dtype: object
Charlie, who likes sports_activities:  779     Kite Flying
6763    Kite Flying
211     Kite Flying
1278    Kite Flying
812     Kite Flying
Name: Activity, dtype: o

# Knowledge-based recommendation systems 
are a type of recommender system that suggest items based on explicit knowledge about the item or user preferences.

 They are particularly useful when there is little past interaction data available for a user.

Deterministic recommendations, assured quality, no coldstart, can resemble sales dialogue

In [None]:
class KnowledgeBasedRecommender:
    def __init__(self, df):
        self.df = df

    def recommend(self, preferences):
        # Filter the dataframe based on the user's preferences
        for key, value in preferences.items():
            self.df = self.df[self.df[key] == value]

        # Return the recommended activities
        return self.df['Activity']

df = pd.read_csv('dataset_activities.csv')
# Initialize the recommender with dataframe
recommender = KnowledgeBasedRecommender(df)

# Define the user's preferences
preferences = {
    'Likes': 'extreme_activities',
    'Weather': 'Snowy',
    
}

# Get the recommendations
recommendations = recommender.recommend(preferences)
print(recommendations.head(5))


87      Rock Climbing 
99     Walking the dog
152            Hiking 
227    Walking the dog
246     Potluck Dinner
Name: Activity, dtype: object


## Using Similarity functions (fuzzywuzzy)

In [None]:
from fuzzywuzzy import fuzz

class KnowledgeBasedRecommender:
    def __init__(self, df):
        self.df = df

    def recommend(self, preferences):
        scores = []

        # Calculate the similarity score for each item
        for _, row in self.df.iterrows():
            score = 0
            for key, value in preferences.items():
                score += fuzz.ratio(str(row[key]), value)
            scores.append(score / len(preferences))

        # Add the similarity scores to the dataframe
        self.df['Score'] = scores

        # Sort the dataframe by the similarity scores
        self.df = self.df.sort_values('Score', ascending=False)

        # Return the recommended activities
        return self.df['Activity']

# Initialize the recommender with dataframe
recommender = KnowledgeBasedRecommender(df)

# Define the user's preferences
preferences = {
    'Likes': 'extreme_activities',
    'Weather': 'Snowy',
}

# Get the recommendations
recommendations = recommender.recommend(preferences)
print(recommendations.head(5))




7594    Walking the dog
2144         Stargazing
6259            Skiing 
4533            Surfing
405      Potluck Dinner
Name: Activity, dtype: object


## Utility-based RS § E.g. MAUT – Multi-attribute utility theory

In [9]:
class MAUTRecommender:
    def __init__(self, df):
        self.df = df

    def recommend(self, preferences, weights):
        scores = []

        # Calculate the utility score for each item
        for _, row in self.df.iterrows():
            score = 0
            for key, value in preferences.items():
                if str(row[key]) == value:
                    score += weights[key]
            scores.append(score)

        # Add the utility scores to the dataframe
        self.df['Score'] = scores

        # Sort the dataframe by the utility scores
        self.df = self.df.sort_values('Score', ascending=False)

        # Return the recommended activities
        return self.df['Activity']


df = pd.read_csv('dataset_activities.csv')
# Initialize the recommender with dataframe
recommender = MAUTRecommender(df)

# Define the user's preferences
preferences = {
    'Likes': 'extreme_activities',
    'Weather': 'Snowy',
}

# Define the weights for each attribute
weights = {
    'Likes': 0.7,
    'Weather': 0.3,
}

# Get the recommendations
recommendations = recommender.recommend(preferences, weights)
print(recommendations.head(5))


1484           Archery
6298    Rock Climbing 
3354           Hiking 
284           Kayaking
7156       Ice Skating
Name: Activity, dtype: object


In [None]:
'''
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import OneHotEncoder

# Load dataset
df = pd.read_csv('dataset_activities.csv')

# New data
new_data = pd.DataFrame({
    'Name': ['Alice'],
    'Likes': ['outdoor_activities'],
    'Local': ['Campbellfort'],
    'Date': ['2023-09-28'],
    'Hour': ['10:00'],
    'Weather': ['Sunny']
})

# Define categorical features
categorical_features = ['Name', 'Likes', 'Local', 'Date', 'Hour', 'Weather']

# Apply OneHotEncoder to categorical features
one_hot = OneHotEncoder()
one_hot.fit(df[categorical_features])
df_encoded = one_hot.transform(df[categorical_features]).toarray()
new_data_encoded = one_hot.transform(new_data).toarray()

# Apply LabelEncoder to target variable 'Activity'
le = preprocessing.LabelEncoder()
df['Activity'] = le.fit_transform(df['Activity'])

# Split your data into features (X) and target (y)
X = df_encoded
y = df['Activity']

# Split your data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create your SVM
clf = svm.SVC()

# Train your SVM
clf.fit(X_train, y_train)


'''




"\nimport pandas as pd\nfrom sklearn import preprocessing\nfrom sklearn.model_selection import train_test_split\nfrom sklearn import svm\nfrom sklearn.preprocessing import OneHotEncoder\n\n# Load dataset\ndf = pd.read_csv('dataset_activities.csv')\n\n# New data\nnew_data = pd.DataFrame({\n    'Name': ['Alice'],\n    'Likes': ['outdoor_activities'],\n    'Local': ['Campbellfort'],\n    'Date': ['2023-09-28'],\n    'Hour': ['10:00'],\n    'Weather': ['Sunny']\n})\n\n# Define categorical features\ncategorical_features = ['Name', 'Likes', 'Local', 'Date', 'Hour', 'Weather']\n\n# Apply OneHotEncoder to categorical features\none_hot = OneHotEncoder()\none_hot.fit(df[categorical_features])\ndf_encoded = one_hot.transform(df[categorical_features]).toarray()\nnew_data_encoded = one_hot.transform(new_data).toarray()\n\n# Apply LabelEncoder to target variable 'Activity'\nle = preprocessing.LabelEncoder()\ndf['Activity'] = le.fit_transform(df['Activity'])\n\n# Split your data into features (X) and

In [None]:
'''from sklearn.metrics import accuracy_score, f1_score
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multi-class classification
print(f'F1 Score: {f1}')
# Print the predicted activity
predicted_activity = clf.predict(new_data_encoded)
print(predicted_activity)  # Inverse transform the predicted label
predicted_activity_name = le.inverse_transform(predicted_activity)

# Print the predicted activity
print(predicted_activity_name)
'''

"from sklearn.metrics import accuracy_score, f1_score\ny_pred = clf.predict(X_test)\n\naccuracy = accuracy_score(y_test, y_pred)\nprint(f'Accuracy: {accuracy}')\n\n# Calculate the F1 score\nf1 = f1_score(y_test, y_pred, average='weighted')  # Use 'weighted' for multi-class classification\nprint(f'F1 Score: {f1}')\n# Print the predicted activity\npredicted_activity = clf.predict(new_data_encoded)\nprint(predicted_activity)  # Inverse transform the predicted label\npredicted_activity_name = le.inverse_transform(predicted_activity)\n\n# Print the predicted activity\nprint(predicted_activity_name)\n"