In [484]:
import pandas as pd
import ast

In [486]:
# Load the raw business table. The path is relative, so it resolves against
# the directory the notebook kernel was started in.
df_business = pd.read_csv('../dataset/df_business.csv')

# Process df_business

## Expand features

In [485]:
import ast

def expand(row, dict):
    """Copy every key/value pair of a mapping onto ``row``.

    Parameters
    ----------
    row : pandas.Series or any object supporting ``row[key] = value``
        Record to extend. It is modified in place.
    dict : mapping
        Key/value pairs to write onto ``row``. The parameter name shadows the
        builtin; it is kept so that existing keyword callers keep working.

    Returns
    -------
    The same ``row`` object that was passed in.
    """
    for key, values in dict.items():
        row[key] = values
    return row


# Called with: hours, attributes, BusinessParking, GoodForMeal, Ambience, Music, BestNights
def expand_row(row, col_name):
    """Unpack the stringified dict stored in ``row[col_name]`` into ``row``.

    The cell is parsed with ``ast.literal_eval``. When it holds a dict literal,
    each key becomes an entry of ``row`` (see ``expand``). The row is returned
    unchanged when the column is missing, the cell is not a parsable literal
    (for example NaN), or the literal is not a dict (for example ``'None'``).

    Parameters
    ----------
    row : pandas.Series or any object supporting item get/set
        One record, typically supplied by ``DataFrame.apply(..., axis=1)``.
    col_name : hashable
        Label of the entry holding the stringified dict.

    Returns
    -------
    ``row`` itself, extended in place when the cell could be expanded.
    """
    try:
        parsed = ast.literal_eval(row[col_name])
    except (LookupError, TypeError, ValueError, SyntaxError, MemoryError, RecursionError):
        # Only "column absent" and "cell is not a literal" are expected here.
        # Anything else (KeyboardInterrupt, SystemExit, programming errors)
        # must propagate instead of being silently swallowed.
        return row

    # literal_eval also accepts None, numbers, lists, ...; only dicts expand.
    if not isinstance(parsed, dict):
        return row
    return expand(row, parsed)

In [488]:
# Columns whose cells hold stringified dicts. Each one is unpacked into one
# column per key and the original column is then removed.
# NOTE(review): the names after 'attributes' are presumably keys produced by
# expanding 'attributes', which would make the order significant - confirm.
items_to_expand = [
    'hours',
    'attributes',
    'BusinessParking',
    'GoodForMeal',
    'Ambience',
    'Music',
    'BestNights',
]

for item_to_expand in items_to_expand:
    df_business = (
        df_business
        .apply(expand_row, axis=1, col_name=item_to_expand)
        .drop(columns=item_to_expand)
    )

In [None]:
# Preview the first rows of the frame after the expansion loop.
df_business.head(5)

## Drop non-essential features

In [501]:
# Columns removed before modelling. The grouping below is only for layout;
# the list content and order are unchanged.
to_drop = [
    # free-text / categorical attributes
    'Alcohol', 'NoiseLevel', 'RestaurantsAttire', 'Smoking', 'WiFi',
    # identity and location
    'address', 'categories', 'city', 'latitude', 'longitude', 'name', 'state', 'postal_code',
    # per-day columns (capitalised)
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday',
    # per-day columns (lower case)
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    # remaining attributes
    'BYOB', 'BYOBCorkage', 'Caters', 'HairSpecializesIn', 'divey', 'lot', 'validated',
]

# Raises KeyError if any listed column is absent from the frame.
df_business = df_business.drop(columns=to_drop)

In [503]:
# Preview the columns that remain after the drop.
df_business.head(5)

Unnamed: 0,AcceptsInsurance,AgesAllowed,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,ByAppointmentOnly,CoatCheck,Corkage,DietaryRestrictions,DogsAllowed,...,no_music,review_count,romantic,stars,street,touristy,trendy,upscale,valet,video
0,,,,,,True,,,,,...,,7,,5.0,,,,,,
1,,,,,True,,,,,,...,,15,,3.0,,,,,,
2,,,True,,True,False,False,,,False,...,,22,,3.5,False,,,,False,
3,,,True,,False,False,,,,,...,,80,,4.0,True,,,,False,
4,,,True,,True,,,,,,...,,13,,4.5,,,,,False,
5,,,False,,True,False,False,,,False,...,,6,,2.0,,,,,,
6,,,True,,True,,,,,,...,,13,,2.5,False,,,,False,
7,,,,,,,,,,,...,,5,,3.5,,,,,,
8,,,,,True,,,,,,...,,19,False,3.0,False,False,False,False,False,
9,,,,,True,False,False,,,False,...,,10,,1.5,False,,,,False,


## Export

In [None]:
# Write the expanded/pruned frame next to the input data. The path is relative,
# like the read_csv calls in this notebook, so the notebook does not depend on
# one user's home directory.
# NOTE(review): assumes '../dataset' is the same folder as the previous
# absolute '.../IAproject/dataset' target - confirm.
df_business.to_csv('../dataset/df_business_processed.csv', index=False)

# Build df_business_final

## Merge with other dataset

### Merge with checkin

In [504]:
# Load the check-in table. The preview below shows one row per business_id,
# with 'date' holding a comma-separated string of timestamps.
df_checkin = pd.read_csv('../dataset/df_checkin.csv')

df_checkin.head(5)

Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."
2,--30_8IhuyMHbSOcNWd6DQ,"2013-06-14 23:29:17, 2014-08-13 23:20:22"
3,--7PUidqRWpRSpXebiyxTg,"2011-02-15 17:12:00, 2011-07-28 02:46:10, 2012..."
4,--7jw19RH9JKXgFohspgQw,"2014-04-21 20:42:11, 2014-04-28 21:04:46, 2014..."


In [505]:
# Join the check-in data onto the business table by business_id.
# NOTE(review): the default join is 'inner', so businesses with no check-in
# row are dropped from df_business_merged - confirm this is intended.
df_business_merged = df_business.merge(df_checkin, on='business_id')
df_business_merged.shape

(131930, 54)

### Merge with tip

In [506]:
# Load the tips and keep only the join key and the compliment counter.
df_tip = pd.read_csv('../dataset/df_tip.csv')[['business_id', 'compliment_count']]

df_tip.head()

Unnamed: 0,business_id,compliment_count
0,3uLgwr0qeCNMjKenHJwPGQ,0
1,QoezRbYQncpRqyrLH6Iqjg,0
2,MYoRNLb5chwjQe3c_k37Gg,0
3,hV-bABTK-glh5wj31ps_Jw,0
4,_uN0OudeJ3Zl_tf6nxg5ww,0


In [507]:
# Join the tip data onto the merged table by business_id (default inner join).
# NOTE(review): the recorded shapes go from 131930 to 903105 rows, so business
# rows are repeated here (presumably once per tip). Consider aggregating
# compliment_count per business_id before merging - confirm the intent.
df_business_merged = df_business_merged.merge(df_tip, on='business_id')
df_business_merged.shape

(903105, 55)

## Text processing

In [508]:
# df_business = df_business.applymap(lambda x: bool(x) if type(x) == str else x)

### From NaN/None to 0.0

In [509]:
# Treat missing values and the literal string 'None' as 0.0.
# NOTE(review): this and the following cells transform df_business, not
# df_business_merged, so the check-in/tip columns never reach the frame that
# is exported as df_business_final.csv - confirm this is intended.
df_business = df_business.fillna(0.0).replace('None', 0.0)

### From True/False to float

In [510]:
# Map booleans, in both their string and native forms, to 1.0 / 0.0.
df_business = (
    df_business
    .replace('True', 1.0)
    .replace(True, 1.0)
    .replace('False', 0.0)
    .replace(False, 0.0)
)

### From Int to float

In [511]:
# Map the string and integer forms of 1 and 0 to floats.
df_business = (
    df_business
    .replace('1', 1.0)
    .replace(1, 1.0)
    .replace('0', 0.0)
    .replace(0, 0.0)
)

# Convert the remaining count/level columns to the smallest float dtype that
# can hold their values.
for numeric_col in ('is_open', 'review_count', 'RestaurantsPriceRange2'):
    df_business[numeric_col] = pd.to_numeric(df_business[numeric_col], downcast='float')

## Export

In [512]:
# Preview the normalised frame before it is written to disk.
df_business.head(5)

Unnamed: 0,AcceptsInsurance,AgesAllowed,BikeParking,BusinessAcceptsBitcoin,BusinessAcceptsCreditCards,ByAppointmentOnly,CoatCheck,Corkage,DietaryRestrictions,DogsAllowed,...,no_music,review_count,romantic,stars,street,touristy,trendy,upscale,valet,video
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,15.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,22.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,80.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,13.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0


In [513]:
# Write the normalised frame next to the input data. The path is relative,
# like the read_csv calls in this notebook, so the notebook does not depend on
# one user's home directory.
# NOTE(review): assumes '../dataset' is the same folder as the previous
# absolute '.../IAproject/dataset' target - confirm.
df_business.to_csv('../dataset/df_business_final.csv', index=False)

In [None]:
##### OLD
# df_business = pd.read_csv('../dataset/df_business.csv')

# df_business = df_business.dropna() # to be removed

# re.sub("u"no"", "u-no", txt)
# x = re.findall('"BusinessParking": "{.*}"', txt)
# print("x", x)
#
# par = re.findall('"{.*}"', x[0])
# print("par", par)
#
# y = re.sub('{"', '{-"', par[0])
# print("y", y)
#
# z = re.sub('":', '"-:', y)
# print("z", z)
#
# p = re.sub('\s"', ' -"', z)
# print("p", p)
#
# test = txt.replace(par[0], p)
# print("test", test)