In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [34]:
# Load the filtered review, user and business tables (one feather file each).
df_reviews, df_users, df_buz = map(
    pd.read_feather,
    (
        "../FilteredData/review_philly.feather",
        "../FilteredData/user_philly.feather",
        "../FilteredData/business_philly.feather",
    ),
)

In [35]:
# Show the column labels of the reviews frame.
df_reviews.columns

Index(['_id', 'review_id', 'user_id', 'business_id', 'stars', 'useful',
       'funny', 'cool', 'text', 'date', 'compliment_count'],
      dtype='object')

In [36]:
# Show the column labels of the users frame.
df_users.columns

Index(['index', '_id', 'user_id', 'name', 'review_count', 'yelping_since',
       'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars',
       'compliment_hot', 'compliment_more', 'compliment_profile',
       'compliment_cute', 'compliment_list', 'compliment_note',
       'compliment_plain', 'compliment_cool', 'compliment_funny',
       'compliment_writer', 'compliment_photos', 'gender_score',
       'philly_reviews', 'philly_share_of_reviews',
       'bucketed_philly_share_of_reviews', 'bucketed_gender_scores',
       'average_stars_given', 'bucketed_average_stars_given',
       'bucketed_philly_reviews'],
      dtype='object')

In [37]:
# Show the column labels of the business frame.
df_buz.columns

Index(['index', '_id', 'business_id', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
       'is_open', 'attributes', 'categories', 'hours', 'positive_%',
       'bucketed_average_stars_received', 'bucketed_sentiment_scores_received',
       'bucketed_review_count'],
      dtype='object')

In [38]:
# Give the users' review_count a distinct name so it cannot be confused with the
# business frame's review_count after the frames are merged. Reassigning the
# result (instead of inplace=True) keeps this cell a plain "new frame from old
# frame" step.
df_users = df_users.rename(columns={"review_count": "user_review_count"})

In [39]:
# Prefix the business-level stars and review_count so they stay distinguishable
# from the review's own stars and the users' review count after merging.
# Reassigning the result replaces the inplace=True mutation.
df_buz = df_buz.rename(
    columns={"stars": "business_stars", "review_count": "business_review_count"}
)

In [40]:
# Count the missing values in the positive_% column of the business frame.
df_buz["positive_%"].isna().sum()

541

In [41]:
# Replace missing postal codes with 0 so the column contains no NaN.
# NOTE(review): the fill value is the integer 0; if postal codes are stored as
# strings this leaves mixed types in the column — confirm that is acceptable.
df_buz["postal_code"] = df_buz["postal_code"].fillna(0)

In [42]:
# Display the review rows that have no star rating.
df_reviews.loc[df_reviews["stars"].isna()]

Unnamed: 0,_id,review_id,user_id,business_id,stars,useful,funny,cool,text,date,compliment_count
852117,6327b024504265271679c187,,-copOvldyKh1qr-vzkDEvw,MYoRNLb5chwjQe3c_k37Gg,,,,,It's open even when you think it isn't,2013-08-18 00:56:08,0.0
852118,6327b024504265271679c199,,FQ-zmWPEG_pjSQx6pt3Efw,3ZynJ94VpIdDlaArmEp2Rg,,,,,"Yes, I'm eating here again. Breakfast!",2012-10-12 15:16:13,0.0
852119,6327b024504265271679c1a7,,YnlCpuaBa3qWBp4te8pGmA,XIKYdKWq72zUYsq8NBxcCQ,,,,,The honey glazed salmon is amazing!,2018-01-14 15:00:01,0.0
852120,6327b024504265271679c1b5,,Rr4cLb6Go91FT134o6RsKg,eMiN8nm70jjKg8izikVWDA,,,,,Mmm Yummy Crab Fries!,2011-10-16 23:43:10,0.0
852121,6327b024504265271679c1ba,,fJhr0G2JBNkfqpbIwkEQHg,eJ77e9lGxY3ArzaoDbHhYw,,,,,"Good specials, nice menu",2013-02-16 21:34:37,0.0
...,...,...,...,...,...,...,...,...,...,...,...
855075,6327b03050426527167a1746,,uChvFHK4uSifUXBPJPp9Zw,lCetcbxQr52TTGmGmXzQcw,,,,,Racist!!! Wont serve you if you are ethnic,2013-12-15 04:51:26,1.0
855076,6327b03050426527167a1749,,lP_cx94U8stOuQ8HrGecdQ,03jQGGJ2ch0uHTtW-UUUqg,,,,,Stuff face time.,2012-05-27 14:06:04,0.0
855077,6327b03050426527167a174c,,Rr4cLb6Go91FT134o6RsKg,UEGZ1nHUVb2ltGtd7C7X9w,,,,,Blizzard,2012-06-05 20:34:27,0.0
855078,6327b03050426527167a175b,,pflneuKFKQKQvZGDxebGFw,zsaDvomuMLe_-Ibtk9sA-A,,,,,Stopped by and it was closed on wed. 10/3/12!,2012-10-03 16:33:53,0.0


In [43]:
# Keep only the reviews that carry a star rating.
df_reviews = df_reviews[df_reviews["stars"].notna()]

In [44]:
# Replace every +inf / -inf value in the users frame with 1. Reassigning the
# result replaces the inplace=True mutation.
# NOTE(review): -inf is mapped to 1 as well, and the replacement applies to all
# columns; presumably the infinities come from a ratio column — confirm.
df_users = df_users.replace([np.inf, -np.inf], 1)

In [45]:
# Columns kept in the modelling table, grouped by the frame they come from.
features = [
    # review keys and rating, plus the binary label
    "user_id",
    "business_id",
    "stars",
    "target",
    # user-level columns
    "user_review_count",
    "average_stars",
    "philly_reviews",
    "philly_share_of_reviews",
    "gender_score",
    # business-level columns
    "business_stars",
    "business_review_count",
    "bucketed_sentiment_scores_received",
    "postal_code",
]

In [46]:
# Attach business attributes, then user attributes, to every review.
# Left joins keep each review row even when no matching business/user exists.
df_merged = (
    df_reviews
    .merge(df_buz, how="left", on="business_id")
    .merge(df_users, how="left", on="user_id")
)

In [47]:
def create_target(row):
    """Return the binary label for one review row.

    Rows whose ``stars`` value is at most 3 map to 0; every other row maps to 1
    (this includes a NaN rating, because ``NaN <= 3`` is false).
    """
    return 0 if row["stars"] <= 3 else 1

In [48]:
# Binary label: 0 when stars <= 3, otherwise 1. Computed on the whole column at
# once instead of calling a Python function once per row, which is far slower
# on a frame of this size.
# Negating the <= test (rather than using >) keeps the row-wise rule's result
# for a missing rating, which also falls into the "otherwise" branch.
df_merged["target"] = (~(df_merged["stars"] <= 3)).astype("int64")

In [49]:
# Keep only the modelling columns, in the order given by `features`.
df_merged = df_merged.loc[:, features]

In [50]:
# Both merges are left joins on the reviews frame, so the row count must be
# unchanged; more rows would mean a join key is duplicated in a right-hand frame.
# Assert it so a "Run All" stops here instead of silently continuing.
assert len(df_merged) == len(df_reviews), "merge changed the number of review rows"
len(df_merged) == len(df_reviews)

True

In [51]:
# Hold out 20% of the merged rows for validation; shuffling with a fixed seed
# keeps the split reproducible.
df_train, df_val = train_test_split(df_merged, test_size=0.2, shuffle=True, random_state=3)

In [52]:
# Write both splits to CSV without the index column.
# NOTE(review): assumes the ./data directory already exists — confirm.
df_train.to_csv("./data/train.csv", index=False)
df_val.to_csv("./data/valid.csv", index=False)

In [53]:
# Read the training split back from disk and show its (rows, columns) shape.
# NOTE(review): this rebinds df_train to the CSV round-trip, so column dtypes
# may differ from the in-memory split — confirm that is intended.
df_train = pd.read_csv("./data/train.csv")
df_train.shape

(681693, 13)

In [54]:
# Display the bucketed_sentiment_scores_received column of the merged frame.
df_merged["bucketed_sentiment_scores_received"]

0         more_than_60_up_to_80_percent
1         more_than_60_up_to_80_percent
2                  more_than_80_percent
3                  more_than_80_percent
4         more_than_60_up_to_80_percent
                      ...              
852112    more_than_60_up_to_80_percent
852113             more_than_80_percent
852114    more_than_60_up_to_80_percent
852115    more_than_60_up_to_80_percent
852116    more_than_60_up_to_80_percent
Name: bucketed_sentiment_scores_received, Length: 852117, dtype: object