In [1]:
import pandas as pd

Imagine you work for an e-commerce company that has collected customer reviews
for its products. The data is stored in a list of dictionaries, where each dictionary
represents a customer review with the following structure:

customer_reviews = [
{'product_id':101, 'review': 'This product is amazing! I love it.'},
{'product_id':102, 'review': 'The quality is not as expected. Disappointed.'},
{'product_id':103, 'review': 'Great value for the price. Highly recommended.'},
{'product_id':104, 'review': 'Not happy with the purchase. Will return it.'},
{'product_id':105, 'review': 'Excellent service. Fast delivery.'}
]


1. Identify and count the number of reviews that express a positive sentiment.
2. Identify and list the product IDs and reviews for the products with negative
sentiments.
3. Calculate the average length (number of words) of the reviews.
4. Determine the product that is mentioned the most in the reviews.
5. Create a distribution of sentiments (positive, negative, neutral) for the
reviews.
6. Extract key words or phrases from the reviews that frequently appear.
7. Find and display the longest review along with its product ID.
8. Identify products that have both positive and negative reviews.
9. Change the sentiment of a selected review from positive to negative and vice versa.
10. Compare the similarity between two selected reviews without using any
similarity metrics directly.

In [2]:
# Sample customer reviews: one record per review, keyed by the reviewed product.
customer_reviews = [
    dict(product_id=101, review='This product is amazing! I love it.'),
    dict(product_id=102, review='The quality is not as expected. Disappointed.'),
    dict(product_id=103, review='Great value for the price. Highly recommended.'),
    dict(product_id=104, review='Not happy with the purchase. Will return it.'),
    dict(product_id=105, review='Excellent service. Fast delivery.'),
]

# One row per review, with the columns `product_id` and `review`.
df = pd.DataFrame(data=customer_reviews)


In [3]:
# Display the freshly built frame to check that the reviews loaded as expected.
df

Unnamed: 0,product_id,review
0,101,This product is amazing! I love it.
1,102,The quality is not as expected. Disappointed.
2,103,Great value for the price. Highly recommended.
3,104,Not happy with the purchase. Will return it.
4,105,Excellent service. Fast delivery.


In [4]:
# Lowercase every review so that lowercase keyword patterns can match it.
df = df.assign(review=df['review'].str.lower())

In [5]:
# Re-display the frame to confirm that the review text is now lowercase.
df

Unnamed: 0,product_id,review
0,101,this product is amazing! i love it.
1,102,the quality is not as expected. disappointed.
2,103,great value for the price. highly recommended.
3,104,not happy with the purchase. will return it.
4,105,excellent service. fast delivery.


In [6]:
# 1. Identify and count the number of reviews that express a positive sentiment.

# A review counts as positive when it contains at least one of these keywords
# (regex alternation, matched as substrings of the review text).
is_positive = df['review'].str.contains('amazing|love|great|highly recommended|excellent')
pos_rev_count = len(df[is_positive])
print("Number of positive reviews:", pos_rev_count)

Number of positive reviews: 3


In [7]:
# 2. Identify and list the product IDs and reviews for the products with negative sentiments.

# Negative keywords, written in lowercase. `\bnot\b` matches "not" only as a
# whole word, so words that merely contain it ("another", "nothing") are not
# flagged. case=False keeps the match correct whether or not the reviews have
# been lowercased, and na=False treats a missing review as "no match" so the
# boolean mask never contains NaN.
negative_keywords = r'\bnot\b|disappointed|unhappy|will return'
is_negative = df['review'].str.contains(negative_keywords, case=False, na=False)
neg_rev = df.loc[is_negative, ['product_id', 'review']]
neg_rev

Unnamed: 0,product_id,review
1,102,the quality is not as expected. disappointed.
3,104,not happy with the purchase. will return it.


In [8]:
# 3. Calculate the average length (number of words) of the reviews

# Vectorised word count: split each review on whitespace, then count the tokens.
# Unlike a Python-level lambda, the .str accessor propagates a missing review as
# NaN (which mean() then skips) instead of raising.
df['review_length'] = df['review'].str.split().str.len()
avg_rev_len = df['review_length'].mean()
avg_rev_len


6.6

In [9]:
# 4. Determine the product that is mentioned the most in the reviews.

# mode() yields every product_id tied for the highest row count; keep the first.
# NOTE(review): each product_id occurs exactly once in this sample, so all ids
# tie and the value shown is just the first entry of that tied list.
product_id_modes = df['product_id'].mode()
most_mentioned_pro = product_id_modes.iloc[0]
most_mentioned_pro

101

In [10]:
# 5. Create a distribution of sentiments (positive, negative, neutral) for the reviews.

# Lowercase substrings whose presence marks a review as positive / negative.
positive_words = ['amazing', 'love', 'recommended', 'excellent']
negative_words = ['disappointed', 'not happy', 'return']


def _contains_any(review, phrases):
    """Return True if `review` is a string containing any of `phrases`.

    Matching is a case-insensitive substring test; `phrases` are expected to be
    lowercase. Non-string input (e.g. None or NaN for a missing review) cannot
    contain a phrase, so it yields False instead of raising.
    """
    if not isinstance(review, str):
        return False
    review_lower = review.lower()
    return any(phrase in review_lower for phrase in phrases)


def contains_positive_words(review):
    """Return True if `review` contains at least one entry of `positive_words`."""
    return _contains_any(review, positive_words)


def contains_negative_words(review):
    """Return True if `review` contains at least one entry of `negative_words`."""
    return _contains_any(review, negative_words)


def categorize_sentiment(review):
    """Label a review as 'positive', 'negative' or 'neutral'.

    Positive keywords take precedence: a review containing both positive and
    negative keywords is labelled 'positive'. A review with neither (including
    a missing review) is labelled 'neutral'.
    """
    if contains_positive_words(review):
        return 'positive'
    elif contains_negative_words(review):
        return 'negative'
    else:
        return 'neutral'


# Label every review with its sentiment category.
df['sentiment'] = df['review'].map(categorize_sentiment)

# Tally the labels. value_counts() only lists labels that actually occur,
# so a category with no reviews does not appear in the result.
sentiment_distribution = df['sentiment'].value_counts()

sentiment_distribution

positive    3
negative    2
Name: sentiment, dtype: int64

In [11]:
# 6. Extract key words or phrases from the reviews that frequently appear.
import re
from collections import Counter

# Concatenate all reviews into one string and pull out the word tokens.
all_reviews = ' '.join(df['review'])
words = re.compile(r'\w+').findall(all_reviews.lower())

# Count every token and keep the three most frequent.
# NOTE(review): no stop-word filtering is applied, so common function words
# can dominate the result.
word_counts = Counter()
word_counts.update(words)

freq_words = word_counts.most_common(3)
freq_words

[('the', 3), ('is', 2), ('it', 2)]

In [12]:
# 7. Find and display the longest review along with its product ID.

# idxmax() gives the index label of the first row with the largest word count;
# the `review_length` column must already exist on the frame.
longest_idx = df['review_length'].idxmax()
longest_review = df.loc[longest_idx]
print("Longest review:\n", longest_review['review'])
print("Product ID:", longest_review['product_id'])


Longest review:
 not happy with the purchase. will return it.
Product ID: 104


In [13]:
# 8. Identify products that have both positive and negative reviews.
_POSITIVE_REGEX = 'amazing|love|great|highly recommended|excellent'
_NEGATIVE_REGEX = 'not|disappointed|unhappy|will return'


def _has_mixed_sentiment(group):
    """Return True when a product's reviews hit both a positive and a negative keyword."""
    reviews = group['review']
    if not any(reviews.str.contains(_POSITIVE_REGEX)):
        return False
    return any(reviews.str.contains(_NEGATIVE_REGEX))


# Keep only the rows of products whose review group passes the mixed-sentiment check.
both_sent = df.groupby('product_id').filter(_has_mixed_sentiment)
print(both_sent['product_id'].unique())

[]


Let's consider a scenario related to real estate data. Suppose you have a dataset containing information about real estate properties, and the data is stored in a list of dictionaries. Each dictionary represents a property with details such as the property type, size, location, number of bedrooms, and price. Here's the scenario and some
questions:

real_estate_data = [
{'product_id': 1, 'property_type': 'Apartment', 'size_sqft': 1200, 'location':'Downtown', 'bedrooms': 2, 'price_usd': 250000},
{'product_id': 2, 'property_type': 'House', 'size_sqft': 2000, 'location':'Suburb', 'bedrooms': 3, 'price_usd': 350000},
{'product_id': 3, 'property_type': 'Apartment', 'size_sqft': 800, 'location':'Uptown', 'bedrooms': 1, 'price_usd': 150000},
{'product_id': 4, 'property_type': 'Condo', 'size_sqft': 1500, 'location':'Downtown', 'bedrooms': 2, 'price_usd': 300000},
{'product_id': 5, 'property_type': 'House', 'size_sqft': 1800, 'location':'Suburb', 'bedrooms': 4, 'price_usd': 400000}
]


Questions:

1. Calculate the average size of properties in the dataset.
2. Identify and list properties located in the downtown area.
3. Find properties with a price higher than $300,000.
4. Determine the distribution of property types in the dataset.
5. Identify and list apartments with a price less than $200,000.
6. Calculate the average price per square foot for all properties.
7. Identify and list houses with a size greater than 1,800 sqft.
8. Determine the distribution of the number of bedrooms in the dataset.
9. Find properties with 3 bedrooms and a price less than $300,000.
10. Categorize properties into size ranges (e.g., Small, Medium, Large) based on their square footage.

In [14]:
# Sample property listings, written as rows against a shared column header and
# expanded into one dict per property.
_property_fields = ('product_id', 'property_type', 'size_sqft', 'location', 'bedrooms', 'price_usd')
_property_rows = [
    (1, 'Apartment', 1200, 'Downtown', 2, 250000),
    (2, 'House', 2000, 'Suburb', 3, 350000),
    (3, 'Apartment', 800, 'Uptown', 1, 150000),
    (4, 'Condo', 1500, 'Downtown', 2, 300000),
    (5, 'House', 1800, 'Suburb', 4, 400000),
]
real_estate_data = [dict(zip(_property_fields, row)) for row in _property_rows]

# One row per property.
re_df = pd.DataFrame(data=real_estate_data)


In [15]:
# Display the property frame to check that the listings loaded as expected.
re_df

Unnamed: 0,product_id,property_type,size_sqft,location,bedrooms,price_usd
0,1,Apartment,1200,Downtown,2,250000
1,2,House,2000,Suburb,3,350000
2,3,Apartment,800,Uptown,1,150000
3,4,Condo,1500,Downtown,2,300000
4,5,House,1800,Suburb,4,400000


In [16]:
# 1. Calculate the average size of properties in the dataset

# Arithmetic mean of the square-footage column.
avg_size = re_df.loc[:, 'size_sqft'].mean()
avg_size

1460.0

In [17]:
# 2. Identify and list properties located in the downtown area.

# Exact, case-sensitive match on the location label.
downtown_area = re_df.loc[re_df['location'].eq('Downtown')]
downtown_area

Unnamed: 0,product_id,property_type,size_sqft,location,bedrooms,price_usd
0,1,Apartment,1200,Downtown,2,250000
3,4,Condo,1500,Downtown,2,300000


In [18]:
# 3. Find properties with a price higher than $300,000.

# Strictly greater than: a property priced at exactly $300,000 is excluded.
high_price_properties = re_df.loc[re_df['price_usd'].gt(300000)]
high_price_properties

Unnamed: 0,product_id,property_type,size_sqft,location,bedrooms,price_usd
1,2,House,2000,Suburb,3,350000
4,5,House,1800,Suburb,4,400000


In [19]:
# 4. Determine the distribution of property types in the dataset.

# Number of listings per property type, most common type first.
property_type_distribution = re_df.loc[:, 'property_type'].value_counts()
property_type_distribution

Apartment    2
House        2
Condo        1
Name: property_type, dtype: int64

In [20]:
# 5. Identify and list apartments with a price less than $200,000.

# Both conditions must hold: the type is 'Apartment' and the price is strictly below $200,000.
cheap_apartments = re_df.loc[
    re_df['property_type'].eq('Apartment') & re_df['price_usd'].lt(200000)
]
cheap_apartments


Unnamed: 0,product_id,property_type,size_sqft,location,bedrooms,price_usd
2,3,Apartment,800,Uptown,1,150000


In [21]:
# 6. Calculate the average price per square foot for all properties.

# Price per square foot for each property, then the unweighted mean of those
# ratios (every property counts equally, regardless of its size).
re_df['price_per_sqft'] = re_df['price_usd'].div(re_df['size_sqft'])
avg_price_sqft = re_df['price_per_sqft'].mean()
avg_price_sqft


198.61111111111114

In [22]:
# 7. Identify and list houses with a size greater than 1,800 sqft.

# Strictly greater than: a house of exactly 1,800 sqft is excluded.
large_size = re_df.loc[
    re_df['property_type'].eq('House') & re_df['size_sqft'].gt(1800)
]
large_size

Unnamed: 0,product_id,property_type,size_sqft,location,bedrooms,price_usd,price_per_sqft
1,2,House,2000,Suburb,3,350000,175.0


In [23]:
# 8. Determine the distribution of the number of bedrooms in the dataset.

# Number of listings per bedroom count, most common count first.
bedroom_dist = re_df.loc[:, 'bedrooms'].value_counts()
bedroom_dist

2    2
3    1
1    1
4    1
Name: bedrooms, dtype: int64

In [24]:
# 9. Find properties with 3 bedrooms and a price less than $300,000.

# Both conditions must hold; an empty frame means no listing satisfies them.
bed_3 = re_df.loc[
    re_df['bedrooms'].eq(3) & re_df['price_usd'].lt(300000)
]
bed_3

Unnamed: 0,product_id,property_type,size_sqft,location,bedrooms,price_usd,price_per_sqft


In [25]:
# 10. Categorize properties into size ranges (e.g., Small, Medium, Large) based on their square footage.
def categorize_size(size, small_max=1000, large_min=2000):
    """Bucket a property's square footage into 'Small', 'Medium' or 'Large'.

    Parameters
    ----------
    size : int or float
        Floor area in square feet.
    small_max : int or float, default 1000
        Sizes strictly below this value are 'Small'.
    large_min : int or float, default 2000
        Sizes at or above this value are 'Large'; anything from `small_max`
        up to (but excluding) `large_min` is 'Medium'.

    Returns
    -------
    str or None
        The category label, or None when `size` is missing (None / NaN).

    Raises
    ------
    ValueError
        If `small_max` is greater than `large_min`.
    """
    if small_max > large_min:
        raise ValueError("small_max must not exceed large_min")
    # Every comparison against NaN is False, so without this guard a missing
    # size would fall through to the last branch and be labelled 'Large'.
    if pd.isna(size):
        return None
    if size < small_max:
        return 'Small'
    if size < large_min:
        return 'Medium'
    return 'Large'

# Tag each property with its size bucket, then show the bucket beside the raw area.
re_df['size_category'] = re_df['size_sqft'].map(categorize_size)
re_df.loc[:, ['product_id', 'size_sqft', 'size_category']]


Unnamed: 0,product_id,size_sqft,size_category
0,1,1200,Medium
1,2,2000,Large
2,3,800,Small
3,4,1500,Medium
4,5,1800,Medium
