In [1]:
!pip install pandas scikit-learn numpy matplotlib seaborn



In [2]:
import pandas as pd

In [3]:
# Load datasets
places_df = pd.read_excel("D:/Projects/rootcode_datathon/Places Dataset.xlsx")
visitors_df = pd.read_excel("D:/Projects/rootcode_datathon/Visitors Preference Dataset.xlsx")

In [4]:
# Display the first few rows of the places dataset
print("Places Dataset:")
print(places_df.head())

# Display the first few rows of the visitors dataset
print("Visitors Dataset:")
print(visitors_df.head())

Places Dataset:
                             name       lat        lng  \
0                Arugam Bay Beach  6.840408  81.836848   
1                   Mirissa Beach  5.944703  80.459161   
2  Weligama Beach (surf and stay)  5.972486  80.435714   
3                        Ahangama  5.973975  80.362159   
4                 Hikkaduwa Beach  6.137727  80.099060   

             formatted_address  rating  user_ratings_total  \
0  Arugam Bay Beach, Sri Lanka     4.8              1591.0   
1           Mirissa, Sri Lanka     4.6              1748.0   
2          Weligama, Sri Lanka     4.4               325.0   
3          Ahangama, Sri Lanka     NaN                 NaN   
4   Hikkaduwa Beach, Sri Lanka     4.7              1438.0   

                                      latest_reviews  
0  ['Arugam Bay Beach is a surfer's paradise! I s...  
1  ['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...  
2  ['Weligama Beach is a fantastic spot for both ...  
3  ['Ahangama was a bit disappointing for 

In [5]:
# Check for missing values in the places dataset
print("Places Dataset Missing Values:")
print(places_df.isnull().sum())

# Check for missing values in the visitors dataset
print("Visitors Dataset Missing Values:")
print(visitors_df.isnull().sum())

Places Dataset Missing Values:
name                   0
lat                    1
lng                    1
formatted_address      0
rating                56
user_ratings_total    56
latest_reviews         0
dtype: int64
Visitors Dataset Missing Values:
User ID                               0
Name                                  0
Email                                 0
Preferred Activities                  0
Bucket list destinations Sri Lanka    0
dtype: int64


In [6]:
# Check data types of columns
print("Places Dataset Data Types:")
print(places_df.dtypes)

print("Visitors Dataset Data Types:")
print(visitors_df.dtypes)

Places Dataset Data Types:
name                   object
lat                   float64
lng                   float64
formatted_address      object
rating                float64
user_ratings_total    float64
latest_reviews         object
dtype: object
Visitors Dataset Data Types:
User ID                                int64
Name                                  object
Email                                 object
Preferred Activities                  object
Bucket list destinations Sri Lanka    object
dtype: object


In [7]:
# Fill missing values in the rating column with the average rating
avg_rating = places_df['rating'].mean()
places_df['rating'].fillna(avg_rating, inplace=True)

# Fill missing values in the user_ratings_total column with 0
places_df['user_ratings_total'].fillna(0, inplace=True)

In [8]:
# Drop rows with any missing values in the visitors dataset
visitors_df.dropna(inplace=True)

In [9]:
import ast

# Convert string representations of lists to actual lists
visitors_df['Preferred Activities'] = visitors_df['Preferred Activities'].apply(lambda x: ast.literal_eval(x))
visitors_df['Bucket list destinations Sri Lanka'] = visitors_df['Bucket list destinations Sri Lanka'].apply(lambda x: ast.literal_eval(x))

In [10]:
# Select relevant features from places_df
places_features = places_df[['name', 'lat', 'lng', 'rating', 'user_ratings_total']]

In [11]:
# Select relevant features from visitors_df
visitors_features = visitors_df[['Preferred Activities', 'Bucket list destinations Sri Lanka']]

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize MultiLabelBinarizer
mlb_activities = MultiLabelBinarizer()
mlb_destinations = MultiLabelBinarizer()

# Fit and transform the preferred activities
activities_encoded = mlb_activities.fit_transform(visitors_features['Preferred Activities'])

# Fit and transform the bucket list destinations
destinations_encoded = mlb_destinations.fit_transform(visitors_features['Bucket list destinations Sri Lanka'])

# Convert encoded arrays to DataFrames
activities_df = pd.DataFrame(activities_encoded, columns=mlb_activities.classes_)
destinations_df = pd.DataFrame(destinations_encoded, columns=mlb_destinations.classes_)

In [13]:
# Concatenate original visitors features with encoded features
visitors_prepared_df = pd.concat([visitors_df[['User ID']], activities_df, destinations_df], axis=1)

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Normalize rating and user_ratings_total columns
places_normalized = places_features.copy()
places_normalized[['rating', 'user_ratings_total']] = scaler.fit_transform(places_normalized[['rating', 'user_ratings_total']])

In [15]:
import numpy as np

# Assuming visitors_prepared_df and places_normalized are already prepared

# Create a matrix where rows are users and columns are places
def create_user_item_matrix(visitors_df, places_df):
    # Initialize matrix with zeros
    matrix = np.zeros((len(visitors_df), len(places_df)))
    
    for i, user in visitors_df.iterrows():
        user_id = user['User ID']
        preferred_destinations = user['Bucket list destinations Sri Lanka']
        
        for j, place in places_df.iterrows():
            place_name = place['name']
            
            if place_name in preferred_destinations:
                matrix[i, j] = 1  # Mark as preferred
    
    return matrix

user_item_matrix = create_user_item_matrix(visitors_df, places_df)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute similarity matrix between places
similarity_matrix = cosine_similarity(places_normalized[['rating', 'user_ratings_total']])

# Define a function to recommend places for a given user
def recommend_places(user_index, user_item_matrix, similarity_matrix, top_n=5):
    # Get the user's ratings/preferences
    user_preferences = user_item_matrix[user_index]
    
    # Compute the weighted sum of similarities based on user preferences
    scores = similarity_matrix.dot(user_preferences)
    
    # Get indices of top_n places
    top_indices = np.argsort(scores)[::-1][:top_n]
    
    return top_indices

# Example: Recommend places for the first user
recommended_indices = recommend_places(0, user_item_matrix, similarity_matrix)
recommended_places = places_df.iloc[recommended_indices]

In [17]:
# Define a simple relevance metric (precision at k)
def evaluate_relevance(user_index, recommended_indices, top_n=5):
    user_preferences = set(visitors_df.loc[user_index, 'Bucket list destinations Sri Lanka'])
    recommended_places = set(places_df.loc[recommended_indices, 'name'])
    relevant_items = len(user_preferences.intersection(recommended_places))
    return relevant_items / top_n

# Evaluate relevance for the first user
relevance_score = evaluate_relevance(0, recommended_indices)

In [18]:
import joblib

# Save the similarity matrix and user-item matrix
joblib.dump(similarity_matrix, 'similarity_matrix.pkl')
joblib.dump(user_item_matrix, 'user_item_matrix.pkl')

['user_item_matrix.pkl']

In [19]:
mkdir TeamName_Datathon
cp similarity_matrix.pkl user_item_matrix.pkl TeamName_Datathon/
cp *.ipynb TeamName_Datathon/  # Copy all Jupyter notebooks

SyntaxError: invalid syntax (2678985230.py, line 1)

In [20]:
!mkdir TeamName_Datathon

In [21]:
!cp similarity_matrix.pkl user_item_matrix.pkl TeamName_Datathon/
!cp *.ipynb TeamName_Datathon/

'cp' is not recognized as an internal or external command,
operable program or batch file.
'cp' is not recognized as an internal or external command,
operable program or batch file.


In [22]:
import joblib

# Save the similarity matrix and user-item matrix
joblib.dump(similarity_matrix, 'similarity_matrix.pkl')
joblib.dump(user_item_matrix, 'user_item_matrix.pkl')

['user_item_matrix.pkl']

In [1]:
!mkdir TeamName_Datathon
!copy similarity_matrix.pkl user_item_matrix.pkl TeamName_Datathon/
!copy *.ipynb TeamName_Datathon/

A subdirectory or file TeamName_Datathon already exists.


The syntax of the command is incorrect.
The syntax of the command is incorrect.


In [2]:
!mkdir TeamName_Datathon
!cp similarity_matrix.pkl user_item_matrix.pkl TeamName_Datathon/
!cp *.ipynb TeamName_Datathon/

A subdirectory or file TeamName_Datathon already exists.
'cp' is not recognized as an internal or external command,
operable program or batch file.
'cp' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
pkl_files = ["similarity_matrix.pkl"]  # or ["user_item_matrix.pkl"]

In [4]:
from IPython.display import FileLink, display

# List of .pkl files to download
pkl_files = ["similarity_matrix.pkl", "user_item_matrix.pkl"]

# Create download links for each file
for file in pkl_files:
    display(FileLink(file, result_html_prefix=f"Click to download {file}: "))