In [None]:
#  Copyright (C) <2022>  <MontyPython Group>
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <https://www.gnu.org/licenses/>.

# Preprocessing for business_task
Here we are going to preprocess business, checkin and tip datasets in order to create one dataset to use in `business_task.ipynb`.


### Used libraries
[Pandas](https://pandas.pydata.org/): most common, open source data analysis and manipulation tool.

[NumPy](https://numpy.org/): used for mathematical support for some processes.

In [534]:
import pandas as pd
import ast

## Process df_business

### Get business dataset
Business dataset consists of the following features:
   - `business_id`: [string, 22 character unique string business id]
   - `name`: [string, the business's name]
   - `address`: [string, the full address of the business]
   - `city`: [string, the city]
   - `state`: [string, 2 character state code, if applicable]
   - `postal code`: [string, the postal code]
   - `latitude`: [float, latitude]
   - `longitude`: [float, longitude]
   - `stars`: [float, star rating, rounded to half-stars]
   - `review_count`: [integer, number of reviews]
   - `is_open`: [integer, 0 or 1 for closed or open, respectively]
   - `attributes`: [object, business attributes to values. note: some attribute values might be objects]
   - `categories`: [an array of strings of business categories]
   - `hours`: [an object of key day to value hours, hours are using a 24hr clock]

`attributes` is a complex object (a nested JSON in this case); because of this, we extract all of its nested elements and bring them to the main level.

In [535]:
# Load the raw business table; the path is relative to the notebook's working directory.
df_business = pd.read_csv('../dataset/df_business.csv')

### Expand features
We found some items that can themselves be complex objects:
- hours
- attributes
- BusinessParking
- GoodForMeal
- Ambience
- Music
- BestNights
- DietaryRestrictions

For each of them, we extract all the items it contains and bring them to the main level. In this way we'll be able to use them as features.


In [536]:
import ast

def expand(row, dict):
    """Copy every key/value pair of a mapping onto ``row`` and return ``row``.

    Parameters
    ----------
    row : mutable mapping
        Record that receives the new entries (a pandas Series when called
        through ``DataFrame.apply(..., axis=1)``). It is modified in place.
    dict : mapping
        Entries to add; keys already present in ``row`` are overwritten.
        The name shadows the builtin and is kept only so existing callers
        keep working.

    Returns
    -------
    The same ``row`` object, with the extra entries set.
    """
    for key, value in dict.items():
        row[key] = value
    return row


def expand_row(row, col_name):
    """Expand the stringified dict stored in ``row[col_name]`` into ``row``.

    The cell is parsed with ``ast.literal_eval``. When the cell is missing,
    is not a string (e.g. NaN), cannot be parsed, or does not evaluate to a
    dict (e.g. the text ``'None'``), the row is returned unchanged. This keeps
    the best-effort behaviour needed for sparse columns.

    Only the failures expected from the lookup and from ``literal_eval`` are
    caught. A bare ``except`` would also swallow ``KeyboardInterrupt`` and
    ``SystemExit``, making a long row-wise ``apply`` impossible to interrupt.

    Parameters
    ----------
    row : mutable mapping
        Record to expand (a pandas Series under ``DataFrame.apply``).
    col_name : hashable
        Key of the cell holding the textual dict.

    Returns
    -------
    ``row``, extended in place with the parsed entries when parsing succeeded.
    """
    try:
        parsed = ast.literal_eval(row[col_name])
    except (KeyError, IndexError, ValueError, TypeError, SyntaxError,
            MemoryError, RecursionError):
        # Missing column, NaN / non-string cell, or malformed literal.
        return row
    if not isinstance(parsed, dict):
        # A valid literal that is not a mapping: nothing to expand.
        return row
    return expand(row, parsed)

In [537]:
# Columns whose cells hold a stringified dict, processed in this order.
# NOTE(review): the entries after 'attributes' are not part of the raw schema
# listed above, so they presumably only exist once 'attributes' has been
# expanded. Keep 'attributes' ahead of them.
items_to_expand = [
    'hours',
    'attributes',
    'BusinessParking',
    'GoodForMeal',
    'Ambience',
    'Music',
    'BestNights',
    'DietaryRestrictions',
]

for item_to_expand in items_to_expand:
    # Spread each row's parsed dict into columns, then discard the source column.
    df_business = df_business.apply(expand_row, axis=1, col_name=item_to_expand)
    df_business = df_business.drop(columns=[item_to_expand])

In [538]:
df_business.head(5)

Unnamed: 0,AcceptsInsurance,AgesAllowed,Alcohol,BYOB,BYOBCorkage,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,ByAppointmentOnly,Caters,...,touristy,trendy,tuesday,upscale,valet,validated,vegan,vegetarian,video,wednesday
0,,,,,,,,,True,,...,,,,,,,,,,
1,,,,,,,,True,,,...,,,,,,,,,,
2,,,,,,True,,True,False,False,...,,,,,False,False,,,,
3,,,u'none',,,True,,False,False,True,...,,,,,False,False,,,,
4,,,,,,True,,True,,False,...,,,,,False,,,,,


### Drop non essential features
We identified some features that were not necessary for our purpose (e.g. NoiseLevel, RestaurantsAttire) and some others that were not well formatted (e.g. WiFi, the weekday columns). All of these have been dropped from the dataframe.

In [539]:
# Features that are not needed downstream or are badly formatted.
to_drop = [
    'Alcohol', 'NoiseLevel', 'RestaurantsAttire', 'Smoking', 'WiFi',
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'BYOB', 'BYOBCorkage', 'Caters', 'HairSpecializesIn',
    'divey', 'lot', 'validated', 'AgesAllowed',
]

# Raises KeyError if any listed column is absent (e.g. when the cell is run twice).
df_business = df_business.drop(columns=to_drop)

In [540]:
df_business.head(5)

Unnamed: 0,AcceptsInsurance,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,ByAppointmentOnly,CoatCheck,Corkage,DogsAllowed,DriveThru,GoodForDancing,...,stars,state,street,touristy,trendy,upscale,valet,vegan,vegetarian,video
0,,,,,True,,,,,,...,5.0,CA,,,,,,,,
1,,,,True,,,,,,,...,3.0,MO,,,,,,,,
2,,True,,True,False,False,,False,,,...,3.5,AZ,False,,,,False,,,
3,,True,,False,False,,,,,,...,4.0,PA,True,,,,False,,,
4,,True,,True,,,,,,,...,4.5,PA,,,,,False,,,


## Build df_business_final

### Merge with other dataset
Based on the `business_id` column, we merged the business, tip and checkin datasets in order to add the `compliment_count` and `date` features to the main dataframe.

#### Merge with checkin

In [541]:
# Load the check-in table. Judging by the preview below, `date` packs all the
# check-in timestamps of a business into one comma-separated string.
df_checkin = pd.read_csv('../dataset/df_checkin.csv')

# Preview the first five rows.
df_checkin.head(5)

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


In [542]:
# Inner join (the pandas default): businesses with no matching check-in row are dropped.
df_business_merged = df_business.merge(df_checkin, how='inner', on='business_id')
df_business_merged.shape

(131930, 67)

#### Merge with tip

In [543]:
# Only the join key and the compliment counter are kept from the tip table.
tip_columns = ['business_id', 'compliment_count']
df_tip = pd.read_csv('../dataset/df_tip.csv').loc[:, tip_columns]

df_tip.head(5)

Unnamed: 0,business_id,compliment_count
0,3uLgwr0qeCNMjKenHJwPGQ,0
1,QoezRbYQncpRqyrLH6Iqjg,0
2,MYoRNLb5chwjQe3c_k37Gg,0
3,hV-bABTK-glh5wj31ps_Jw,0
4,_uN0OudeJ3Zl_tf6nxg5ww,0


In [544]:
# NOTE(review): the recorded outputs show the row count growing from 131930 to
# 903105 at this step, so `df_tip` evidently holds several rows per business and
# the join repeats each business once per tip. Consider aggregating
# `compliment_count` per `business_id` before merging.
# NOTE(review): `df_business_merged` is not referenced by the later cells visible
# in this notebook (they keep processing and exporting `df_business`). Confirm
# which frame is meant to be exported.
df_business_merged = df_business_merged.merge(df_tip, how='inner', on='business_id')
df_business_merged.shape

(903105, 68)

## Dataset processing
We processed the dataset as follows, in order to make it compatible with the K-Means and DBSCAN algorithms:
- Each NaN and 'None' value has been converted to 0.0
- Each True and 'True' value has been converted to 1.0
- Each False and 'False' value has been converted to 0.0
- Each integer value (e.g. 1) has been converted to the corresponding float value (e.g. 1.0)

In [545]:
# df_business = df_business.applymap(lambda x: bool(x) if type(x) == str else x)

### From NaN/None to 0.0

In [546]:
# Missing values first, then the literal string 'None': both become 0.0.
df_business = df_business.fillna(0.0).replace('None', 0.0)

### From True/False to float

In [547]:
# Map the string and the boolean spelling of each truth value to a float.
# The substitutions are applied one at a time, in exactly this order.
bool_substitutions = (
    ('True', 1.0),
    (True, 1.0),
    ('False', 0.0),
    (False, 0.0),
)
for old_value, new_value in bool_substitutions:
    df_business = df_business.replace(old_value, new_value)

### From Int to float

In [548]:
# Turn the textual and integer spellings of 1 and 0 into floats, one
# substitution at a time, in exactly this order.
int_substitutions = (
    ('1', 1.0),
    (1, 1.0),
    ('0', 0.0),
    (0, 0.0),
)
for old_value, new_value in int_substitutions:
    df_business = df_business.replace(old_value, new_value)

# Parse these columns as numbers and downcast them to the smallest float dtype.
for numeric_column in ('is_open', 'review_count', 'RestaurantsPriceRange2'):
    df_business[numeric_column] = pd.to_numeric(df_business[numeric_column], downcast='float')

## Export
We have exported the processed dataset in order to use it in `business_task.ipynb`

In [1]:
print(df_business.shape)
df_business.head(5)

NameError: name 'df_business' is not defined

In [550]:
# Export next to the input CSVs, using the same notebook-relative directory as the
# read_csv calls instead of a machine-specific absolute path.
# NOTE(review): assumes '../dataset/' is the project's `dataset` folder that the
# previous absolute path pointed to — confirm before relying on the new location.
df_business.to_csv('../dataset/df_business_final.csv', index=False)

In [551]:
##### OLD
# df_business = pd.read_csv('../dataset/df_business.csv')

# df_business = df_business.dropna() # da togliere

# re.sub("u"no"", "u-no", txt)
# x = re.findall('"BusinessParking": "{.*}"', txt)
# print("x", x)
#
# par = re.findall('"{.*}"', x[0])
# print("par", par)
#
# y = re.sub('{"', '{-"', par[0])
# print("y", y)
#
# z = re.sub('":', '"-:', y)
# print("z", z)
#
# p = re.sub('\s"', ' -"', z)
# print("p", p)
#
# test = txt.replace(par[0], p)
# print("test", test)