In [1]:
import numpy as np
import pandas as pd
from skimpy import skim
from scipy.stats import mode

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"C:\Users\bbhav\Level 1.ipynb.csv")
df.head()

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


In [3]:
skim(df)

# Task 1 : Restaurant Reviews

> ### Analyze the text reviews to identify the most common positive and negative keywords.

In [4]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from collections import Counter

In [5]:
# Assuming 'text_reviews' is the column containing text reviews
reviews = df['Rating text'].dropna()

# Tokenize the words
all_words = [word.lower() for review in reviews for word in word_tokenize(str(review))]

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in all_words if word.isalnum() and word not in stop_words]

# Analyze sentiment to identify positive and negative words
sia = SentimentIntensityAnalyzer()
positive_words = [word for word in filtered_words if sia.polarity_scores(word)['compound'] > 0.5]
negative_words = [word for word in filtered_words if sia.polarity_scores(word)['compound'] < -0.5]

# Find the most common positive and negative words
most_common_positive = Counter(positive_words).most_common(10)
most_common_negative = Counter(negative_words).most_common(10)

# Display the results
print("Most Common Positive Words:", most_common_positive)
print("Most Common Negative Words:", most_common_negative)


Most Common Positive Words: [('excellent', 300)]
Most Common Negative Words: []


> ### Calculate the average length of reviews and explore if there is a relationship between review length and rating.

In [6]:
# Assuming 'text_reviews' is the column containing text reviews and 'rating' is the rating column
reviews = df['Rating text'].dropna()
ratings = df['Aggregate rating']

# Calculate the length of each review
review_lengths = reviews.apply(lambda x: len(str(x)))

# Add the review lengths as a new column in the DataFrame
df['review_length'] = review_lengths

# Calculate the average review length
average_review_length = review_lengths.mean()
print(f"Average Review Length: {average_review_length:.2f} characters")

# Explore the relationship between review length and rating
relationship_df = df[['Aggregate rating', 'review_length']]
correlation = relationship_df.corr().iloc[0, 1]

print(f"Correlation between Rating and Review Length: {correlation:.2f}")


Average Review Length: 7.02 characters
Correlation between Rating and Review Length: -0.48


# Task 2 : Votes Analysis

>### Identify the restaurants with the highest and lowest number of votes.

In [7]:
# Find the restaurant with the highest number of votes
restaurant_highest_votes = df.loc[df['Votes'].idxmax()]

# Find the restaurant with the lowest number of votes
restaurant_lowest_votes = df.loc[df['Votes'].idxmin()]

# Display the results
print("Restaurant with the Highest Number of Votes:")
print(restaurant_highest_votes[['Restaurant Name', 'Votes']])

print("\nRestaurant with the Lowest Number of Votes:")
print(restaurant_lowest_votes[['Restaurant Name', 'Votes']])


Restaurant with the Highest Number of Votes:
Restaurant Name     Toit
Votes              10934
Name: 719, dtype: object

Restaurant with the Lowest Number of Votes:
Restaurant Name    Cantinho da Gula
Votes                             0
Name: 69, dtype: object


> ### Analyze if there is a correlation between the number of votes and the rating of a restaurant.

In [8]:
correlation = df['Votes'].corr(df['Aggregate rating'])

# Display the result
print(f"Correlation between Votes and Rating: {correlation:.2f}")

Correlation between Votes and Rating: 0.31


# Task 3 : Price Range vs. Online Delivery and Table Booking

>### Analyze if there is a relationship between the price range and the availability of online delivery and table booking.

In [9]:
cross_tab = pd.crosstab(index=df['Price range'], columns=[df['Has Online delivery'], df['Has Table booking']])

# Display the cross-tabulation table
print("Cross-Tabulation: Price Range vs. Online Delivery and Table Booking")
cross_tab

Cross-Tabulation: Price Range vs. Online Delivery and Table Booking


Has Online delivery,No,No,Yes,Yes
Has Table booking,No,Yes,No,Yes
Price range,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3737,0,700,1
2,1711,116,1163,123
3,621,373,140,271
4,299,234,13,40


>### Determine if higher-priced restaurants are more likely to offer these services.

In [11]:
# Convert 'Online Delivery' and 'Table Booking' columns to numeric
df['Has Online delivery'] = pd.to_numeric(df['Has Online delivery'], errors='coerce')
df['Has Table booking'] = pd.to_numeric(df['Has Table booking'], errors='coerce')

# Create a DataFrame with the counts of online delivery and table booking for each price range
service_counts = pd.DataFrame({
    'Total Restaurants': df.groupby('Price range').size(),
    'Online Delivery': df.groupby('Price range')['Has Online delivery'].sum(),
    'Table Booking': df.groupby('Price range')['Has Table booking'].sum()
})

# Calculate the proportions
service_counts['Online Delivery Proportion'] = service_counts['Online Delivery'] / service_counts['Total Restaurants']
service_counts['Table Booking Proportion'] = service_counts['Table Booking'] / service_counts['Total Restaurants']

# Display the results
print("Proportion of Restaurants Offering Online Delivery and Table Booking by Price Range:")
service_counts

Proportion of Restaurants Offering Online Delivery and Table Booking by Price Range:


Unnamed: 0_level_0,Total Restaurants,Online Delivery,Table Booking,Online Delivery Proportion,Table Booking Proportion
Price range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,4438,0.0,0.0,0.0,0.0
2,3113,0.0,0.0,0.0,0.0
3,1405,0.0,0.0,0.0,0.0
4,586,0.0,0.0,0.0,0.0
