In [5]:
import io
import os

import numpy as np
import pandas as pd
import seaborn as sns
from langdetect import detect
#from s3_credentials import *

In [6]:
import boto3

In [7]:
# boto session
# Credentials are read from the environment rather than typed into the notebook,
# so the cell runs on a fresh kernel and no secret is saved with the file.
# When a variable is unset, None is passed and boto3 falls back to its default
# credential chain (shared credentials file, instance role, ...).
YOUR_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
YOUR_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

session = boto3.Session(aws_access_key_id=YOUR_ACCESS_KEY,
                        aws_secret_access_key=YOUR_SECRET_KEY)

# High-level resource API (used for Object/Bucket access) and low-level client.
s3 = session.resource("s3")
client = session.client("s3")

### Pull datasets

In [8]:
# Download the two raw sub-datasets from the project bucket.
# Each object body is read fully into memory and parsed as CSV, using the
# first CSV column as the index.

# real reviews
obj = s3.Object('jedha-fake-reviews-project', "datasets/real_reviews_raw.csv")
with io.BytesIO(obj.get()['Body'].read()) as real_buffer:
    real = pd.read_csv(real_buffer, index_col=0, low_memory=False)

# fake reviews
obj = s3.Object('jedha-fake-reviews-project', "datasets/fake_reviews_raw.csv")
with io.BytesIO(obj.get()['Body'].read()) as fake_buffer:
    fake = pd.read_csv(fake_buffer, index_col=0, low_memory=False)


### Merge both datasets

In [9]:
# Target column order: every column of `real`, followed by 'url_not_recommended'.
columns_list_ordered = real.columns.values.tolist()
columns_list_ordered.append('url_not_recommended')

# Restaurant-level mapping table: one row per restaurant url, taken from `real`.
restaurant_columns = ['url', 'restaurant_average_rating', 'restaurant_reviews_count',
                      'restaurant_expensiveness', 'restaurant_name']
pivot_table = real.loc[:, restaurant_columns].drop_duplicates('url')

# Attach the restaurant information to the fake reviews (left join on the url).
fake_merged = pd.merge(fake, pivot_table, left_on='url', right_on='url', how='left')

# Reorder the merged columns so they line up with `real`.
fake_dataset_reworked = fake_merged.loc[:, columns_list_ordered]
# second method --> fake_dataset_reworked = fake_merged.reindex(columns = columns_list_ordered)
# this method requires adding suffixes=("_Drop", None) to the previous merge


# Stack both datasets into full_dataset.
full_dataset = pd.concat([real, fake_dataset_reworked])
# A restaurant_expensiveness of '14' or '10' is a mis-scrape: the information is
# not available on the page, so the value is set to 'N/C'.
miss_scraped = (full_dataset['restaurant_expensiveness'] == '14') | (full_dataset['restaurant_expensiveness'] == '10')
full_dataset.loc[miss_scraped, 'restaurant_expensiveness'] = 'N/C'
full_dataset = full_dataset.drop_duplicates()
full_dataset = full_dataset.reset_index(drop=True)
# Derive the "not recommended reviews" url from the restaurant url for every row.
# The vectorised, literal (regex=False) replacement leaves a missing url as NaN
# instead of raising AttributeError like a per-row str.replace would.
full_dataset['url_not_recommended'] = full_dataset['url'].str.replace(
    "biz/", "not_recommended_reviews/", regex=False
)

In [10]:
# number of rows in the merged dataset
full_dataset.shape[0]

94592

### (Optional) Function to return the language of each review

In [None]:
from langdetect import detect
def get_lang(x):
    """Return the language detected for ``x``, or 'not_found' on failure.

    The value is converted with ``str`` before being passed to ``detect``.

    Parameters
    ----------
    x : object
        Value whose text should be analysed.

    Returns
    -------
    The value returned by ``detect``, or the string 'not_found' when
    ``detect`` raises an ordinary exception.
    """
    try:
        return detect(str(x))
    except Exception:
        # Best-effort on purpose: any detection error maps to a placeholder.
        # `Exception` (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate so a long-running apply can still be interrupted.
        return 'not_found'

In [None]:
# Detect the language of every review text and store it in a new 'language' column.
full_dataset['language'] = full_dataset['text_review'].apply(get_lang)

### Push to bucket

In [None]:
# Upload the merged dataset to the project bucket.
# Destination key inside the bucket, and the bucket itself.
PATH = "datasets/full_dataset.csv"
bucket = s3.Bucket(name="jedha-fake-reviews-project")
# Serialise the frame to CSV text (the index is written as the first column).
data = full_dataset.to_csv()
put_object = bucket.put_object(Key=PATH, Body=data, ACL='private')
# Sanity check: print every key currently stored in the bucket.
for obj in bucket.objects.all():
    print(obj.key)

In [11]:
# List every object key stored in the project bucket.
bucket = s3.Bucket(name="jedha-fake-reviews-project")
for obj in bucket.objects.all():
    print(obj.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/real_reviews_raw.csv


In [19]:
# Reload the merged dataset that was pushed to the bucket.
obj = s3.Object('jedha-fake-reviews-project', "datasets/full_dataset.csv")
with io.BytesIO(obj.get()['Body'].read()) as full_buffer:
    full_dataset = pd.read_csv(full_buffer, index_col=0, low_memory=False)

### Clean_dataset


In [None]:
#_____________________________________________________________________
######### Cleaning the dataset and adding new columns #########
#_____________________________________________________________________

# Start from the merged frame loaded as `full_dataset`; building `dataset`
# from it (instead of from itself) keeps this cell re-runnable on a fresh kernel.
# NOTE(review): assumes `full_dataset` is the intended input — confirm.
# Rows whose restaurant information is missing (mis-scraped) are dropped.
dataset = full_dataset.dropna(subset=['restaurant_average_rating', 'restaurant_reviews_count',
                                      'restaurant_expensiveness', 'restaurant_name'])

# New column: number of characters in the review text.
dataset['text_length'] = dataset['text_review'].str.len()

#_____________________________________________________________________
######### Fixing existing columns values and types #########
#_____________________________________________________________________

# user_total_image_posted: a missing value means the user posted no image,
# so it is set to 0.
dataset.loc[dataset['user_total_image_posted'].isna(), 'user_total_image_posted'] = 0

# date: some values are mis-scraped.
#   A correct date is at least 10 characters long. Shorter values are the
#   user's image count scraped by mistake (those rows may need re-scraping).
#   Only rows with a usable date are kept; a missing date counts as unusable.
mask_not_date = ~(dataset['date'].str.len() >= 10)
# .copy() so the assignments below write to an independent frame
# (avoids SettingWithCopyWarning).
dataset = dataset.loc[~mask_not_date, :].copy()
#   Values longer than 10 characters hold the date plus extra words
#   ('Avis mis à jour') after a line break; keep only the first line.
mask_date_to_fix = dataset['date'].str.len() > 10
dataset.loc[mask_date_to_fix, 'date'] = dataset.loc[mask_date_to_fix, 'date'].str.split('\n').str[0]
#   Finally convert the column to datetime.
#   NOTE(review): confirm the day/month order of the scraped dates; pass
#   dayfirst=True here if they are written dd/mm/yyyy.
dataset['date'] = pd.to_datetime(dataset['date'])

# photos_for_review:
#   'L' is in fact 0 (no photo found; the scraper picked up the first letter of
#   "L'avis du jour", which appears when the review was updated by the user).
dataset.loc[dataset['photos_for_review'] == 'L', 'photos_for_review'] = '0'
#   Parse as numbers first: values may be float-formatted strings such as
#   '-1.0', which a direct astype('int') cannot convert.
photos_count = pd.to_numeric(dataset['photos_for_review'])
#   -1 is in fact 0 (no photos found by the scraper); then store as int.
dataset['photos_for_review'] = photos_count.where(photos_count != -1, 0).astype('int')

# restaurant_expensiveness:
#   when there is no information ('N/C') the value is set to -1,
dataset.loc[dataset['restaurant_expensiveness'] == 'N/C', 'restaurant_expensiveness'] = -1
#   then the column is converted to int.
dataset['restaurant_expensiveness'] = pd.to_numeric(dataset['restaurant_expensiveness']).astype('int')

# Replace is_real_review by its opposite, is_fake_review (1 = fake, 0 = real),
# as it is better suited for sklearn.
dataset['is_fake_review'] = dataset['is_real_review'].eq(0).astype(int)
dataset = dataset.drop(columns='is_real_review')

# reset index
dataset = dataset.reset_index(drop=True)
