In [2]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load the reviews, users and businesses tables from the `*_philly` feather files
# (read in that order).
df_reviews, df_users, df_buz = (
    pd.read_feather(feather_path)
    for feather_path in (
        "../FilteredData/review_philly.feather",
        "../FilteredData/user_philly.feather",
        "../FilteredData/business_philly.feather",
    )
)

In [4]:
# Show which columns the reviews table provides (`stars` here is the per-review rating).
df_reviews.columns

Index(['_id', 'review_id', 'user_id', 'business_id', 'stars', 'useful',
       'funny', 'cool', 'text', 'date', 'compliment_count'],
      dtype='object')

In [5]:
# Show which columns the users table provides; note that `review_count` is a
# column name the business table uses as well.
df_users.columns

Index(['index', '_id', 'user_id', 'name', 'review_count', 'yelping_since',
       'useful', 'funny', 'cool', 'elite', 'friends', 'fans', 'average_stars',
       'compliment_hot', 'compliment_more', 'compliment_profile',
       'compliment_cute', 'compliment_list', 'compliment_note',
       'compliment_plain', 'compliment_cool', 'compliment_funny',
       'compliment_writer', 'compliment_photos', 'gender_score',
       'philly_reviews', 'philly_share_of_reviews'],
      dtype='object')

In [6]:
# Show which columns the business table provides; `stars` and `review_count`
# share their names with columns of the reviews and users tables.
df_buz.columns

Index(['index', '_id', 'business_id', 'name', 'address', 'city', 'state',
       'postal_code', 'latitude', 'longitude', 'stars', 'review_count',
       'is_open', 'attributes', 'categories', 'hours', 'positive_%'],
      dtype='object')

In [7]:
# Give the user-level review count an explicit name so it cannot be confused with
# the business table's `review_count` once the tables are joined.
df_users = df_users.rename(columns={"review_count": "user_review_count"})

In [8]:
# Prefix the business-level rating and review count so they stay distinguishable
# from the per-review `stars` and the user-level review count after joining.
df_buz = df_buz.rename(
    columns={
        "stars": "business_stars",
        "review_count": "business_review_count",
    }
)

__user_features:__
* user_review_count (numerical)
* average_stars (numerical)
* gender_score (numerical)
* philly_reviews (numerical)
* philly_share_of_reviews (numerical)
* years_on_yelp? (candidate; not an existing column — would have to be derived from `yelping_since`)
    
    
__item_features:__
* postal_code (categorical)
* business_stars (numerical)
* business_review_count (numerical)
* <s>positive_% (numerical)</s>
* attributes? (candidate; not used yet)
* categories? (candidate; not used yet)

In [13]:
# Replace missing postal codes with the sentinel value 0 so the column has no nulls.
# NOTE(review): if postal_code is stored as strings, the integer 0 makes the column
# mixed-type in memory — confirm this is intended.
df_buz = df_buz.assign(postal_code=df_buz["postal_code"].fillna(0))

In [14]:
# Columns kept in the final modelling table, grouped by where they come from.
features = (
    # identifiers, the raw review rating, and the binary label derived from it
    ["user_id", "business_id", "stars", "target"]
    # user-side features
    + ["user_review_count", "average_stars", "gender_score", "philly_reviews", "philly_share_of_reviews"]
    # business-side features
    + ["postal_code", "business_stars", "business_review_count"]
)

In [15]:
# Attach business and user attributes to every review.
#
# Both joins are left joins, so each review row is kept even when its business or
# user has no match in the lookup table (the joined columns are then NaN).
# `validate="many_to_one"` makes pandas raise a MergeError if `business_id` /
# `user_id` is duplicated in the lookup table, instead of silently duplicating
# review rows.
df_merged = (
    df_reviews
    .merge(df_buz, how="left", on="business_id", validate="many_to_one")
    .merge(df_users, how="left", on="user_id", validate="many_to_one")
)

In [16]:
def create_target(row):
    """Return the binary label for a single review row.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``"stars"`` (for example a DataFrame row).

    Returns
    -------
    int
        0 when ``row["stars"] <= 3``, otherwise 1.
    """
    return 0 if row["stars"] <= 3 else 1

In [17]:
# Binary label: 0 for ratings of 3 stars or fewer, 1 otherwise — the same rule as
# `create_target`, but evaluated on the whole `stars` column at once instead of
# calling a Python function for every row of the wide merged frame.
# Written as "not (stars <= 3)" so that a missing rating still maps to 1, exactly
# as the row-wise version did.
df_merged["target"] = (~df_merged["stars"].le(3)).astype("int64")

In [18]:
# Keep only the modelling columns, in the order given by `features`.
df_merged = df_merged.loc[:, features]

In [19]:
# Sanity check: the left joins must not have added or dropped any review rows.
len(df_merged) == len(df_reviews)

True

In [20]:
# Hold out 20% of the rows for validation; rows are shuffled with a fixed seed
# so the split is repeatable.
df_train, df_val = train_test_split(
    df_merged, test_size=0.2, random_state=3, shuffle=True
)

In [21]:
# pandas does not create missing parent directories, so make sure ./data exists
# before writing; otherwise this cell fails with an OSError on a fresh checkout.
os.makedirs("./data", exist_ok=True)

# Persist both splits without the DataFrame index, so the files contain only the
# columns listed in `features`.
df_train.to_csv("./data/train.csv", index=False)
df_val.to_csv("./data/valid.csv", index=False)

In [22]:
# Read the training split back from disk and display its shape — presumably as a
# check that the CSV was written as expected.
# Note that this rebinds `df_train` to the frame parsed from the CSV.
df_train = pd.read_csv("./data/train.csv")
df_train.shape

(684064, 12)