In [68]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

def read_data_from_csv(path='zomato.csv'):
    """Load the raw dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to ``'zomato.csv'`` (resolved
        against the current working directory), so existing zero-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pandas.read_csv`` with default
        options; no cleaning is applied here.
    """
    hotels = pd.read_csv(path)
    return hotels

In [69]:
def remove_unwanted_columns():
    """Return the raw dataset without its ``address`` and ``phone`` columns.

    Returns
    -------
    pandas.DataFrame
        Every row of the loaded data, with the two columns removed.
    """
    #DO NOT REMOVE FOLLOWING LINE
    #call read_data_from_csv() function to get dataframe
    hotels=read_data_from_csv()
    # drop by column label and hand back the resulting frame
    return hotels.drop(columns=["address", "phone"])

In [70]:
def rename_columns():
    """Shorten the verbose column headers and label the index ``Id``.

    Returns
    -------
    pandas.DataFrame
        Output of ``remove_unwanted_columns()`` with ``rate`` ->
        ``rating``, ``approx_cost(for two people)`` -> ``approx_cost`` and
        ``listed_in(type)`` -> ``type``; the index is named ``Id``.
    """
    new_names = {
        "name": "name",  # identity entry, kept from the original mapping
        "rate": "rating",
        "approx_cost(for two people)": "approx_cost",
        "listed_in(type)": "type",
    }

    #call remove_unwanted_columns() function to get dataframe
    renamed = remove_unwanted_columns().rename(columns=new_names)
    renamed.index.name = "Id"
    return renamed

In [71]:
def null_value_check():
    """Remove rows with no name and fill the remaining gaps with defaults.

    Rows whose ``name`` is missing are dropped. Missing values in the other
    listed columns are replaced: ``0`` for ``votes``, ``rating`` and
    ``approx_cost``; the string ``"NA"`` for the descriptive columns.

    Returns
    -------
    pandas.DataFrame
        Output of ``rename_columns()`` after the drop and fill.
    """
    #call rename_columns() function to get dataframe
    hotels = rename_columns()

    #deleting null values of name column
    # Pass the label in a list: a bare string for `subset` is only accepted
    # by newer pandas releases, while a list works on every version.
    hotels = hotels.dropna(subset=["name"])

    fill_values = {
        "votes": 0,
        "rating": 0,
        "approx_cost": 0,
        "online_order": "NA",
        "book_table": "NA",
        "location": "NA",
        "rest_type": "NA",
        "dish_liked": "NA",
        "cuisines": "NA",
        "type": "NA",
    }
    hotels = hotels.fillna(fill_values)

    return hotels

In [72]:
def find_duplicates():
    """Discard rows that repeat an earlier row in every column.

    Returns
    -------
    pandas.DataFrame
        Output of ``null_value_check()`` with only the first occurrence of
        each fully identical row retained; index labels are left as they
        were.
    """
    #call null_value_check() function to get dataframe
    filled = null_value_check()

    # True for every row that is a copy of some earlier row
    repeated = filled.duplicated(keep="first")
    return filled[~repeated]

In [73]:
def removing_irrelevant_text():
    """Drop every row in which a checked column contains the text 'rated'.

    The match is case-insensitive and the columns are filtered one after
    another, in the order listed below.

    Returns
    -------
    pandas.DataFrame
        Output of ``find_duplicates()`` minus the offending rows.
    """
    # (column, negate): negate=True filters with `~mask`, negate=False with
    # `mask == False`. The two forms only differ where `.str.contains`
    # returns NaN instead of a boolean (pandas does that for cells that are
    # not strings): `NaN == False` is False, so with the `== False` form
    # such rows are dropped as well.
    # NOTE(review): confirm that dropping those rows is intended.
    column_checks = (
        ("name", True),
        ("online_order", True),
        ("book_table", True),
        ("rating", False),
        ("votes", False),
        ("location", True),
        ("rest_type", True),
        ("dish_liked", True),
        ("cuisines", True),
        ("approx_cost", False),
        ("type", True),
    )

    #call find_duplicates() function to get dataframe
    hotels = find_duplicates()
    for column, negate in column_checks:
        has_rated = hotels[column].str.contains('rated', case = False)
        keep = (~has_rated) if negate else (has_rated == False)
        hotels = hotels[keep]
    return hotels

In [74]:
def check_for_unique_values():
    """Keep valid ``online_order`` rows and normalise the ``rating`` text.

    Steps:
      1. keep only rows whose ``online_order`` is exactly "Yes" or "No";
      2. strip the literal "/5" suffix from ``rating``;
      3. replace the placeholder ratings "NEW" and "-" with ``0``.

    Returns
    -------
    pandas.DataFrame
        Output of ``removing_irrelevant_text()`` after the steps above.
        ``rating`` is not converted to a numeric dtype here.
    """
    #call removing_irrelevant_text() function to get dataframe
    hotels = removing_irrelevant_text()

    # .copy() makes the filtered result an independent frame, so the column
    # writes below are not writes into a slice of the upstream frame.
    hotels = hotels[hotels["online_order"].isin(["Yes", "No"])].copy()

    # "/5" is a literal suffix, not a pattern
    hotels["rating"] = hotels["rating"].str.replace("/5", "", regex=False)

    # One .loc write instead of chained `hotels.rating[mask] = 0`: chained
    # assignment goes through an intermediate object and, with pandas
    # Copy-on-Write, does not update `hotels` at all.
    placeholder = hotels["rating"].isin(["NEW", "-"])
    hotels.loc[placeholder, "rating"] = 0
    return hotels

In [75]:
def remove_the_unknown_character():
    """Clean mis-encoded characters from names and convert numeric columns.

    * ``name``: "©" is replaced with "e", then every character listed in
      the bracketed character class below is removed.
    * ``rating`` is cast to float, ``votes`` to int.
    * ``approx_cost`` has its "," separators removed and is cast to float.

    This function only returns the frame; it does not write any file.

    Returns
    -------
    pandas.DataFrame
        Output of ``check_for_unique_values()`` with the cleaned and
        converted columns.

    Raises
    ------
    ValueError
        Propagated from ``astype`` if a value cannot be converted to the
        requested numeric type.
    """
    #call check_for_unique_values() function to get dataframe
    hotels = check_for_unique_values()

    #remove unknown character from dataset
    names = hotels['name'].str.replace("©", "e", regex=False)
    # The pattern is a regex character class, so `regex=True` must be given
    # explicitly: since pandas 2.0 `str.replace` defaults to a literal
    # match, which would search for the whole bracketed text and remove
    # nothing.
    names = names.str.replace("[ÃƒÆ’Ã‚Æ’Ãƒâ€šÃ‚Æ’ÃƒÆ’Ã‚â€šÃƒâ€šÃ‚Æ’ÃƒÆ’Ã‚Æ’Ãƒâ€šÃ‚â€šÃƒÆ’Ã‚â€šÃƒâ€šÃ‚Æ’ÃƒÆ’Ã‚Æ’Ãƒâ€šÃ‚Æ’ÃƒÆ’Ã‚â€šÃƒâ€šÃ‚â€šÃƒÆ’Ã‚Æ’Ãƒâ€šÃ‚â€šÃƒÆ’Ã‚â€šÃƒâ€šÃ‚Â]", "", regex=True)

    # assign() builds a new frame instead of writing columns into one that
    # may be a slice of the upstream frame.
    hotels = hotels.assign(
        name=names,
        rating=hotels["rating"].astype("float"),
        votes=hotels["votes"].astype("int"),
        approx_cost=hotels["approx_cost"].str.replace(",", "", regex=False).astype("float"),
    )

    return hotels

In [82]:
def remove_duplicates():
    """Deduplicate the cleaned data and renumber the rows.

    Fully identical rows are removed first, then rows sharing the same
    ``name``, ``location`` and ``cuisines`` (the first of each group is
    kept). The index is rebuilt from 0 and named ``Id``.

    Returns
    -------
    pandas.DataFrame
        Deduplicated output of ``remove_the_unknown_character()``.
    """
    converted = remove_the_unknown_character()

    deduplicated = (
        converted
        .drop_duplicates(keep="first")
        .drop_duplicates(subset=["name", "location", "cuisines"])
        .reset_index(drop=True)
    )
    deduplicated.index.name = "Id"

    return deduplicated

In [85]:
def start():
    """Run the full cleaning chain, save the result, and return it.

    The frame from ``remove_duplicates()`` is written to
    'zomatocleanedBI_2.csv' (index included) before being returned.

    Returns
    -------
    pandas.DataFrame
        The cleaned, deduplicated dataset.
    """
    cleaned = remove_duplicates()
    cleaned.to_csv('zomatocleanedBI_2.csv')

    return cleaned

In [86]:
# Run the whole pipeline: this writes 'zomatocleanedBI_2.csv' and, as the
# cell's last expression, displays the returned DataFrame below.
start()

Unnamed: 0_level_0,name,online_order,book_table,rating,votes,location,rest_type,dish_liked,cuisines,approx_cost,type
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Jalsa,Yes,Yes,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800.0,Buffet
1,Spice Elephant,Yes,No,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800.0,Buffet
2,San Churro Cafe,Yes,No,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800.0,Buffet
3,Addhuri Udupi Bhojana,No,No,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300.0,Buffet
4,Grand Village,No,No,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600.0,Buffet
...,...,...,...,...,...,...,...,...,...,...,...
10448,Chung Wah,No,No,2.5,73,"ITPL Main Road, Whitefield",Casual Dining,Manchow Soup,"Chinese, Momos",800.0,Dine-out
10449,Nawabs Empire,No,No,3.2,5,"ITPL Main Road, Whitefield",Quick Bites,,"North Indian, Chinese, Arabian, Momos",300.0,Dine-out
10450,Fujian Express,Yes,No,3.8,182,"ITPL Main Road, Whitefield",Food Court,"Noodles, Chicken Noodle, Momos, American Chops...","Thai, Chinese, Momos",600.0,Dine-out
10451,SeeYa Restaurant,No,No,3.3,4,KR Puram,Quick Bites,,"North Indian, Kerala, Chinese",350.0,Dine-out
