In [3]:
# Imports

import pandas as pd
import numpy as np
import pandas.api.types as ptypes
import ast
import re

# EDA on Places

In [7]:
# Load places data from datasets

places_df = pd.read_excel("./data/places.xlsx")
places_df.head(10)

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,['Arugam Bay Beach is a surfer's paradise! I s...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,['Mirissa Beach is truly a gem on Sri LankaÃ¢Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,['Weligama Beach is a fantastic spot for both ...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,['Ahangama was a bit disappointing for me as a...
4,Hikkaduwa Beach,6.137727,80.09906,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,['Hikkaduwa Beach is a delightful escape for s...
5,Tangalle,6.024338,80.794073,"Tangalle, Sri Lanka",,,['Tangalle was a bit of a letdown for me. The ...
6,Unawatuna Beach,6.009686,80.248424,"Unawatuna Beach, Sri Lanka",4.8,1868.0,['Unawatuna Beach is a slice of paradise! The ...
7,Pigeon Island,8.721837,81.204071,"Pigeon Island, Sri Lanka",4.5,174.0,['Pigeon Island is a gem! Snorkeling here was ...
8,Galle Dutch Fort,6.030459,80.215021,"Galle 80000, Sri Lanka",4.6,16934.0,"[""Galle Dutch Fort is a stunning blend of hist..."
9,Polonnaruwa Ancient City,7.945942,81.000329,"Polonnaruwa, Sri Lanka",4.3,878.0,['Polonnaruwa Ancient City is a stunning place...


Let's select a random row in this dataframe to see what we're working with.

In [5]:
places_df.loc[119]["latest_reviews"]

"['Bakers Falls is a hidden gem! The sound of the water cascading down is incredibly soothing. We spent a lovely afternoon here, enjoying a picnic while soaking in the stunning views. ItÃ¢Â€Â™s a perfect spot for nature lovers and photographers alike. A must-visit if youÃ¢Â€Â™re in the area!', 'I visited Bakers Falls with friends and we had a great time! The scenery is lovely, but I was expecting a bit more from the actual falls. Still, itÃ¢Â€Â™s a nice place to relax and take some photos. Definitely worth a stop if youÃ¢Â€Â™re nearby.', 'As a couple, we found Bakers Falls to be a peaceful escape. The surrounding nature is beautiful, and itÃ¢Â€Â™s a great spot to unwind. While the falls arenÃ¢Â€Â™t the biggest, the overall atmosphere is serene and romantic. Bring a picnic and enjoy the day!', 'We brought our kids to Bakers Falls for a family outing, and they loved exploring the area! The falls are nice, but we found the picnic spots to be the highlight. It was a bit crowded, though, so

It appears that `latest_reviews` contains lists of strings, each containing the latest reviews for a place. However, this is still a string, and will have to be converted to a list.

In [32]:
# Check data type of reviews column is string

print(type(places_df["latest_reviews"][0]))

<class 'str'>


In [23]:
# Convert the 'latest_reviews' column from string representation to actual lists
places_df["latest_reviews"] = places_df["latest_reviews"].apply(ast.literal_eval)

places_df

SyntaxError: invalid character '¢' (U+00A2) (<unknown>, line 1)

We have to fix the encoding issue first.

In [34]:
# Function to find and print all unique words containing garbled sequences
def find_unique_words_with_garbled_text(df, column):
    unique_words = set()  # To store unique words

    # Define a pattern to match garbled sequences
    garbled_pattern = re.compile(r"[\w]*Ã¢[\w]*")  # Matches words containing 'Ã¢'

    for review in df[column]:
        # Extract words from the review
        words = re.findall(r"\b\w+\b", review)  # Matches individual words

        # Filter words containing garbled sequences
        words_with_garbled = [word for word in words if garbled_pattern.search(word)]

        # Add to the set of unique words
        unique_words.update(words_with_garbled)

    # Print the unique words containing garbled sequences
    if unique_words:
        print("Unique words containing garbled sequences:")
        for word in unique_words:
            print(word)
    else:
        print("No words with garbled sequences found.")


# Call the function with the DataFrame and the column name
find_unique_words_with_garbled_text(places_df, "latest_reviews")

No words with garbled sequences found.


In [8]:
# Inspect places dataset

places_df.describe()

Unnamed: 0,lat,lng,rating,user_ratings_total
count,410.0,410.0,355.0,355.0
mean,7.304668,80.583211,4.459437,1608.639437
std,0.994841,0.518571,0.433211,3607.678752
min,5.941381,79.694183,0.9,27.0
25%,6.629356,80.213274,4.3,147.0
50%,7.01278,80.536523,4.5,375.0
75%,7.94793,80.946773,4.75,1245.5
max,9.820859,81.859583,5.0,26736.0


In [9]:
places_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 411 entries, 0 to 410
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   name                411 non-null    object 
 1   lat                 410 non-null    float64
 2   lng                 410 non-null    float64
 3   formatted_address   411 non-null    object 
 4   rating              355 non-null    float64
 5   user_ratings_total  355 non-null    float64
 6   latest_reviews      411 non-null    object 
dtypes: float64(4), object(3)
memory usage: 22.6+ KB


Null data is available, so preprocessing needed to remove them or address them appropriately.

# EDA on Visitors

In [4]:
# Load places data from datasets

visitors_df = pd.read_excel("./data/visitors.xlsx")
visitors_df.head(10)

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,"['cycling', 'historical monuments', 'village h...","['Polonnaruwa', 'Hatton', 'Anuradhapura', 'Ell..."
1,2,Emily Perry,emily.perry@example.com,"['butterfly watching', 'hot springs', 'wildlif...","['Madunagala Hot Water Spring', 'Wilpattu Nati..."
2,3,Danielle Mcbride,danielle.mcbride@example.com,"['sea cruises', 'themed parks', 'craft worksho...","['Mirissa Beach', 'Negombo Lagoon', 'Batadomba..."
3,4,Angelica Wilson,angelica.wilson@example.com,"['fishing', 'hot springs', 'sailing']","['Maha Oya Hot Water Springs', 'Colombo Port C..."
4,5,Laurie Powers,laurie.powers@example.com,"['history tours', 'sailing', 'literary tours']","['Negombo Lagoon', 'Colombo Port City', 'Galle..."
5,6,Michelle Anderson,michelle.anderson@example.com,"['public art installations', 'temple pilgrimag...","['Colombo', 'Sigiriya', 'Mihintale', 'Galle Du..."
6,7,Louis Ramsey,louis.ramsey@example.com,"['fishing', 'golfing', 'historical monuments']","['Hikkaduwa', 'Kalpitiya', 'Polonnaruwa', 'Neg..."
7,8,Dominique Hammond,dominique.hammond@example.com,"['sailing', 'hot air ballooning', 'spiritual r...","['Trincomalee Harbour', 'Kandalama', ""Sri Pada..."
8,9,Tara Reilly,tara.reilly@example.com,"['cultural experiences', 'botanical gardens', ...","['Seethawaka Wet Zone Botanical Gardens', 'Sig..."
9,10,Stacy Anderson MD,stacy.md@example.com,"['boat safaris', 'sailing', 'caving']","['Batatotalena (Batadombalena) Cave', 'Colombo..."


In [38]:
print(type(visitors_df["Preferred Activities"][0]))
print(type(visitors_df["Bucket list destinations Sri Lanka"][0]))

<class 'str'>
<class 'str'>


In [14]:
visitors_df.describe()

Unnamed: 0,User ID
count,10000.0
mean,5000.5
std,2886.89568
min,1.0
25%,2500.75
50%,5000.5
75%,7500.25
max,10000.0


In [15]:
visitors_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 5 columns):
 #   Column                              Non-Null Count  Dtype 
---  ------                              --------------  ----- 
 0   User ID                             10000 non-null  int64 
 1   Name                                10000 non-null  object
 2   Email                               10000 non-null  object
 3   Preferred Activities                10000 non-null  object
 4   Bucket list destinations Sri Lanka  10000 non-null  object
dtypes: int64(1), object(4)
memory usage: 390.8+ KB


No null values present in this dataframe.

In [16]:
# Check data type of preferred activities column is string, not array

is_string = ptypes.is_string_dtype(visitors_df["Preferred Activities"])
print(is_string)

True


# Random Preprocessing

In [8]:
# Convert pseudolists to strings (cleaning step)
places_df["latest_reviews"] = places_df["latest_reviews"].apply(
    lambda x: x.strip("[]").replace("', '", " ").replace("'", "")
)

places_df

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,Arugam Bay Beach is a surfers paradise! I spen...
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,Mirissa Beach is truly a gem on Sri LankaÃ¢Â€Â...
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,Weligama Beach is a fantastic spot for both be...
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,Ahangama was a bit disappointing for me as a s...
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,Hikkaduwa Beach is a delightful escape for sol...
...,...,...,...,...,...,...,...
406,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.3,399.0,Uppuveli Beach is a stunning escape! The soft ...
407,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.3,353.0,Koggala Beach is a hidden gem! The soft sand a...
408,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.3,180.0,Marakolliya Beach is a hidden gem! The waves w...
409,Pasikuda Beach,7.929994,81.561185,"Pasikuda Beach, Sri Lanka",4.4,1142.0,Pasikuda Beach is a hidden gem! The pristine w...


In [9]:
# Concatenate relevant columns to create a single input text
places_df["text"] = places_df.apply(
    lambda x: f"{x['name']} {x['formatted_address']} {x['latest_reviews']}", axis=1
)

places_df

Unnamed: 0,name,lat,lng,formatted_address,rating,user_ratings_total,latest_reviews,text
0,Arugam Bay Beach,6.840408,81.836848,"Arugam Bay Beach, Sri Lanka",4.8,1591.0,Arugam Bay Beach is a surfers paradise! I spen...,"Arugam Bay Beach Arugam Bay Beach, Sri Lanka A..."
1,Mirissa Beach,5.944703,80.459161,"Mirissa, Sri Lanka",4.6,1748.0,Mirissa Beach is truly a gem on Sri LankaÃ¢Â€Â...,"Mirissa Beach Mirissa, Sri Lanka Mirissa Beach..."
2,Weligama Beach (surf and stay),5.972486,80.435714,"Weligama, Sri Lanka",4.4,325.0,Weligama Beach is a fantastic spot for both be...,"Weligama Beach (surf and stay) Weligama, Sri L..."
3,Ahangama,5.973975,80.362159,"Ahangama, Sri Lanka",,,Ahangama was a bit disappointing for me as a s...,"Ahangama Ahangama, Sri Lanka Ahangama was a bi..."
4,Hikkaduwa Beach,6.137727,80.099060,"Hikkaduwa Beach, Sri Lanka",4.7,1438.0,Hikkaduwa Beach is a delightful escape for sol...,"Hikkaduwa Beach Hikkaduwa Beach, Sri Lanka Hik..."
...,...,...,...,...,...,...,...,...
406,Uppuveli Beach,8.607956,81.220013,"Trincomalee, Sri Lanka",4.3,399.0,Uppuveli Beach is a stunning escape! The soft ...,"Uppuveli Beach Trincomalee, Sri Lanka Uppuveli..."
407,Koggala Beach,5.992272,80.310691,"Koggala Beach, Sri Lanka",4.3,353.0,Koggala Beach is a hidden gem! The soft sand a...,"Koggala Beach Koggala Beach, Sri Lanka Koggala..."
408,Marakolliya Beach,6.042222,80.823073,"Kapuhenwala Road, Sri Lanka",4.3,180.0,Marakolliya Beach is a hidden gem! The waves w...,"Marakolliya Beach Kapuhenwala Road, Sri Lanka ..."
409,Pasikuda Beach,7.929994,81.561185,"Pasikuda Beach, Sri Lanka",4.4,1142.0,Pasikuda Beach is a hidden gem! The pristine w...,"Pasikuda Beach Pasikuda Beach, Sri Lanka Pasik..."


In [10]:
places_df["text"][0]

'Arugam Bay Beach Arugam Bay Beach, Sri Lanka Arugam Bay Beach is a surfers paradise! I spent incredible days riding the waves, and the local surf schools were fantastic for beginners like me. The atmosphere is laid-back, with friendly locals and fellow travelers. After a long day of surfing, the sunsets were simply magical. The beach is a bit crowded, especially during peak season, but it adds to the lively vibe. I canÃ¢Â€Â™t wait to return! My friends and I had an unforgettable time at Arugam Bay Beach! The surfing conditions were excellent, and we all managed to catch some great waves. The beach is beautiful, with soft sand and clear waters perfect for swimming. However, we noticed some litter on the beach, which was a bit disappointing. Overall, the vibrant nightlife and delicious food made up for it. Definitely worth a visit! As a couple looking for relaxation, Arugam Bay Beach offered a perfect blend of tranquility and excitement. We enjoyed lazy days lounging on the beach and in

In [5]:
visitors_df["Preferred Activities"] = visitors_df["Preferred Activities"].apply(
    lambda x: x.strip("[]").replace("', '", " ").replace("'", "")
)
visitors_df["Bucket list destinations Sri Lanka"] = visitors_df[
    "Bucket list destinations Sri Lanka"
].apply(lambda x: x.strip("[]").replace("', '", " ").replace("'", ""))

visitors_df

Unnamed: 0,User ID,Name,Email,Preferred Activities,Bucket list destinations Sri Lanka
0,1,Jennifer Quinn,jennifer.quinn@example.com,cycling historical monuments village homestays,Polonnaruwa Hatton Anuradhapura Ella Haputale
1,2,Emily Perry,emily.perry@example.com,butterfly watching hot springs wildlife viewing,Madunagala Hot Water Spring Wilpattu National ...
2,3,Danielle Mcbride,danielle.mcbride@example.com,sea cruises themed parks craft workshops,Mirissa Beach Negombo Lagoon Batadombalena Cra...
3,4,Angelica Wilson,angelica.wilson@example.com,fishing hot springs sailing,Maha Oya Hot Water Springs Colombo Port City N...
4,5,Laurie Powers,laurie.powers@example.com,history tours sailing literary tours,Negombo Lagoon Colombo Port City Galle Dutch F...
...,...,...,...,...,...
9995,9996,Jonathan Hernandez,jonathan.hernandez@example.com,paddleboarding river cruises kayaking,Ahungalla Bolgoda Lake Unawatuna Beach Colombo...
9996,9997,Cody Gallegos,cody.gallegos@example.com,theater scuba diving yoga retreats,Kalpitiya Hikkaduwa Coral Sanctuary Trincomale...
9997,9998,Amy House,amy.house@example.com,sea cruises zip-lining outdoor adventures,Hikkaduwa Coral Sanctuary Ella Pigeon Island N...
9998,9999,Leslie Aguilar,leslie.aguilar@example.com,cycling amusement parks paddleboarding,Ella Hatton Negambo Colombo Port City Leisure ...
