In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, entry) for entry in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id='1'></a>
# 1. Important Libraries and Config

In [None]:
%%capture
!pip install langdetect # Language Detection
!pip install bnlp_toolkit # For Bangla Word Cloud
!wget https://www.omicronlab.com/download/fonts/kalpurush.ttf # Bangla Font For the Word Cloud

In [None]:
from IPython.display import Markdown, display

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
from wordcloud import WordCloud
import re

def printmd(string):
    """Render ``string`` as Markdown in the notebook output area."""
    rendered = Markdown(string)
    display(rendered)

from langdetect import detect
import unicodedata
import html

import folium
# Import folium MarkerCluster plugin
from folium.plugins import MarkerCluster
# Import folium MousePosition plugin
from folium.plugins import MousePosition
# Import folium DivIcon plugin
from folium.features import DivIcon

import warnings
warnings. filterwarnings('ignore')

<a id='2.1'></a>
## 2.1 Preview of Data

In [None]:
# Load the restaurant listing from the Kaggle input directory and show it
# as the cell output.
restaurant_df = pd.read_csv(
    "../input/restaurants-around-bangladesh/restaurants.csv",
    encoding='utf-8',
)
restaurant_df

<a id='2.2'></a>
## 2.2 Summary of Data

In [None]:
# Print pandas' built-in summary of the loaded frame.
restaurant_df.info()

In [None]:
# Descriptive statistics, transposed so that each described column is a row.
restaurant_df.describe().transpose()

<a id='2.3'></a>
## 2.3 Check for Duplicate Values

In [None]:
# Flag rows that repeat an earlier row, then count the flags.
duplicate_flags = restaurant_df.duplicated()
dup = duplicate_flags.sum()
printmd(f"### There are {dup} duplicated rows present")

In [None]:
printmd("### Removed the Duplicated Rows ")
# Keep the first occurrence of every duplicated row; rebinding the name
# instead of mutating in place yields the same de-duplicated frame.
restaurant_df = restaurant_df.drop_duplicates(keep="first")
restaurant_df.head(10)

<a id='2.4'></a>
## 2.4 Check for Missing Values

In [None]:
def missing_value_describe(data):
    """Summarise and plot the missing values of a DataFrame.

    Displays the number of rows that contain at least one missing value and
    the number of columns that contain missing values. When there are any,
    it also displays a table of per-column missing counts and percentages
    (largest first) and draws a bar chart of the counts (smallest first).

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to inspect.

    Returns
    -------
    None
        All results are rendered through ``printmd``/``display``/``print``.
    """
    # Count once and keep only the columns that actually have missing values.
    missing_counts = data.isna().sum()
    missing_counts = missing_counts[missing_counts > 0]

    rows = data.isna().any(axis=1).sum()
    cols = len(missing_counts)
    printmd(f"#### Number of rows with at least 1 missing values: {rows}")
    printmd(f"#### Number of columns with missing values: {cols}")

    if cols == 0:
        print("No missing data!!!")
        return

    # Sort explicitly instead of relying on how concat orders two
    # differently-ordered indexes.
    missing_data = pd.DataFrame({
        'Total': missing_counts,
        'Percent': missing_counts / len(data) * 100,
    }).sort_values('Total', ascending=False)

    printmd("##### Missing percentage (descending):")
    display(missing_data)

    # Bar chart of the missing counts, smallest first.
    ax = missing_counts.sort_values().plot.bar()
    ax.set(title="Missing values per column", ylabel="Missing values")

# pass a dataframe to the function
missing_value_describe(restaurant_df)

<a id='3'></a>
# 3. Data Preparation
---

<a id='3.1'></a>
## 3.1 Check Location

### Let's look into the addresses of the restaurants to check whether they are inside Bangladesh or not.

In [None]:
# Rows whose address was checked and does not mention "Bangladesh".
mentions_bangladesh = restaurant_df['address'].str.contains('Bangladesh')
restaurant_df[mentions_bangladesh.eq(False)]

### Looks like there are some restaurants that are in India instead of Bangladesh. We shall remove such restaurants and prepare a new dataframe for further investigations

In [None]:
# Keep only the restaurants whose address mentions "Bangladesh". na=False
# treats a missing address as "no match", which is what `== True` did.
in_bangladesh = restaurant_df['address'].str.contains('Bangladesh', na=False)

# Take an explicit copy: later cells add and overwrite columns on bd_rest_df,
# and doing that on a slice of restaurant_df is the chained-assignment
# pattern behind SettingWithCopyWarning.
bd_rest_df = (
    restaurant_df.loc[in_bangladesh]
    .copy()
    .reset_index(drop=True)
)
bd_rest_df

<a id='3.2'></a>
## 3.2 Convert Affluence Level

### For a better visualization, we will convert the affluence levels from 1.0, 2.0, 3.0... to ``$, $$, $$$``

In [None]:
# Translate the numeric affluence levels into dollar-sign labels.
affluence_labels = {1.0: '$', 2.0: '$$', 3.0: '$$$', 4.0: '$$$$'}
bd_rest_df['affluence'] = bd_rest_df['affluence'].replace(affluence_labels)

# Show only the restaurants that have an affluence level.
bd_rest_df[bd_rest_df['affluence'].notna()]

<a id='4'></a>
# 4. Visualizations
---

<a id='4.1'></a>
## 4.1 Popular Restaurant Names
### Looks like some of the names are in Bangla. Let's separate the restaurants that have their names in Bangla.

In [None]:
# Any character from the Unicode Bengali block (U+0980-U+09FF) marks a name
# as Bangla. The previous check looked only at the first character for a
# Latin letter, so names starting with a digit, quote or space were labelled
# "Bangla" and a non-string name raised a TypeError.
bangla_script = re.compile(r'[\u0980-\u09FF]')


def _name_language(name):
    """Return "Bangla" if ``name`` contains Bengali script, else "English"."""
    # Non-string values (e.g. a missing name) contain no Bengali script.
    if isinstance(name, str) and bangla_script.search(name):
        return "Bangla"
    return "English"


bd_rest_df["name_type"] = bd_rest_df["name"].apply(_name_language)

en_bd_restaurant = bd_rest_df[bd_rest_df['name_type'] == "English"]
non_en_bd_restaurant = bd_rest_df[bd_rest_df['name_type'] == "Bangla"]

printmd("### Restaurants With English Name")
display(en_bd_restaurant)
printmd("### Restaurants With Bangla Name")
display(non_en_bd_restaurant)

In [None]:
# How often each English restaurant name occurs, keyed by the name.
data = en_bd_restaurant['name'].value_counts().to_dict()

wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
    max_font_size=300,
)
wc = wc.generate_from_frequencies(data)

# Show the cloud without axes, then save it next to the notebook.
plt.figure(figsize=(14, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()
result = wc.to_file("English_word_cloud.png")
printmd("### These are the Most Frequently Used Restaurant Names in English")

In [None]:
from bnlp.corpus import stopwords, punctuations

# Runs of characters from the Unicode range U+0980-U+09FF.
regex = r"[\u0980-\u09FF]+" 
# How often each Bangla restaurant name occurs, keyed by the name.
data = non_en_bd_restaurant['name'].value_counts().to_dict()

# The downloaded kalpurush font is passed as font_path for the Bangla text.
# NOTE(review): confirm that `regexp` has any effect when the cloud is built
# from precomputed frequencies rather than from raw text.
wc = WordCloud(
    width=800,
    height=400,
    background_color="white",
    max_font_size=300,
    font_path="./kalpurush.ttf",
    regexp=regex,
)
wc = wc.generate_from_frequencies(data)

# Show the cloud without axes, then save it next to the notebook.
plt.figure(figsize=(14, 10))
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()
result = wc.to_file("Bangla_word_cloud.png")
printmd("### These are the Most Frequently Used Restaurant Names in Bangla")

<a id='4.2'></a>
## 4.2 Heat Map 

In [None]:
import geopandas
import folium
from folium.plugins import MarkerCluster, HeatMap

# One point per restaurant, built from (longitude, latitude).
geometry = geopandas.points_from_xy(bd_rest_df.longitude, bd_rest_df.latitude)
geo_df = geopandas.GeoDataFrame(bd_rest_df[['longitude', 'latitude']], geometry=geometry)

# Dark base map positioned at bd_coordinate.
bd_coordinate = [23.6850, 90.3563]
site_map = folium.Map(location=bd_coordinate, tiles='Cartodb dark_matter', zoom_start=8)

# Each heat-map entry is [y, x] of a point, i.e. [latitude, longitude]
# given how the points were built above.
heat_data = []
for point in geo_df.geometry:
    xs, ys = point.xy
    heat_data.append([ys[0], xs[0]])

HeatMap(heat_data).add_to(site_map)

site_map

In [None]:
bd_coordinate = [23.6850, 90.3563]
site_map = folium.Map(location=bd_coordinate, zoom_start=7)

# Only restaurants that have an affluence level get a marker.
data = bd_rest_df[bd_rest_df['affluence'].notna()]

# Iterate the needed columns side by side instead of indexing row by row.
marker_columns = zip(data['latitude'], data['longitude'], data['name'], data['affluence'])
for lat, lng, name, level in marker_columns:
    folium.Marker(
        location=[lat, lng],
        popup=name,
        tooltip=str(name) + ',' + str(level),
    ).add_to(site_map)
site_map

In [None]:
bd_coordinate = [23.6850, 90.3563]
circle_map = folium.Map(location=bd_coordinate, zoom_start=8, prefer_canvas=True)

# Work on an independent copy: filling a column in place on a filtered slice
# is chained assignment and is not guaranteed to update `data`.
data = bd_rest_df[bd_rest_df['affluence'].notna()].copy()

# Missing or non-numeric review counts become 0, so the integer conversion
# always succeeds instead of being silently skipped.
data['number_of_reviews'] = (
    pd.to_numeric(data['number_of_reviews'], errors='coerce')
    .fillna(0)
    .astype(int)
)

occurences = folium.map.FeatureGroup()

n_mean = data['number_of_reviews'].mean()
# Radius is the review count relative to a third of the mean count. Fall
# back to 1 when the mean is 0 (or NaN for an empty frame) so the division
# stays defined.
radius_scale = n_mean / 3 if n_mean > 0 else 1

for lat, lng, number, name in zip(data['latitude'],
                                  data['longitude'],
                                  data['number_of_reviews'],
                                  data['name']):
    occurences.add_child(
        folium.vector_layers.CircleMarker(
            [lat, lng],
            radius=number / radius_scale,
            color='yellow',
            fill=True,
            fill_color='blue',
            fill_opacity=0.4,
            tooltip=str(number) + ',' + str(name),
            # get more from tooltip https://github.com/python-visualization/folium/issues/1010#issuecomment-435968337
        )
    )

# add_child's return value is the cell output.
circle_map.add_child(occurences)



In [None]:
# Restaurants with a known affluence level, narrowed to the "$$$" tier.
data = bd_rest_df[bd_rest_df['affluence'].notna()]
data_expensive = data.loc[data['affluence'].eq("$$$")]

expensive_columns = ['name', 'latitude', 'longitude', 'rating',
                     'number_of_reviews', 'affluence', 'address']
data_expensive[expensive_columns]

In [None]:
# Restaurants with a known affluence level, narrowed to the "$$$" tier.
data = bd_rest_df[bd_rest_df['affluence'].notna()]
data_expensive = data.loc[data['affluence'].eq("$$$")]

bd_coordinate = [23.6850, 90.3563]
expensive_map = folium.Map(location=bd_coordinate, zoom_start=10, prefer_canvas=True)

# One marker per restaurant; the tooltip reads "<name>,<rating>".
expensive_points = zip(
    data_expensive['latitude'],
    data_expensive['longitude'],
    data_expensive['name'],
    data_expensive['rating'],
)
for lat, lng, name, rating in expensive_points:
    folium.Marker(
        location=[lat, lng],
        tooltip=str(name) + ',' + str(rating),
    ).add_to(expensive_map)

expensive_map

In [None]:
# Restaurants with a known affluence level, narrowed to the "$$$$" tier.
data = bd_rest_df[bd_rest_df['affluence'].notna()]
data_very_expensive = data.loc[data['affluence'].eq("$$$$")]

very_expensive_columns = ['name', 'rating', 'number_of_reviews']
data_very_expensive[very_expensive_columns]

In [None]:
# Restaurants with a known affluence level, narrowed to the "$$$$" tier.
data = bd_rest_df[bd_rest_df['affluence'].notna()]
data_very_expensive = data.loc[data['affluence'].eq("$$$$")]

bd_coordinate = [23.6850, 90.3563]
very_expensive_map = folium.Map(location=bd_coordinate, zoom_start=10, prefer_canvas=True)

# One marker per restaurant; the tooltip reads "<name>,<rating>".
very_expensive_points = zip(
    data_very_expensive['latitude'],
    data_very_expensive['longitude'],
    data_very_expensive['name'],
    data_very_expensive['rating'],
)
for lat, lng, name, rating in very_expensive_points:
    folium.Marker(
        location=[lat, lng],
        tooltip=str(name) + ',' + str(rating),
    ).add_to(very_expensive_map)

very_expensive_map