In [1]:
import io
import os

import numpy as np
import pandas as pd
import seaborn as sns
from langdetect import detect
#from s3_credentials import *

In [2]:
import boto3

In [3]:
# boto session
# The credentials are read from the environment instead of being written in the
# notebook, so no secret ends up in a shared or committed copy of this file.
# When a variable is not set the value is None, and boto3 then falls back to its
# own default credential lookup.
YOUR_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
YOUR_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

session = boto3.Session(aws_access_key_id=YOUR_ACCESS_KEY,
                        aws_secret_access_key=YOUR_SECRET_KEY)

# Resource (object-oriented) and low-level client handles on S3, both bound to
# the session created above.
s3 = session.resource("s3")
client = session.client("s3")

### Pull datasets

In [4]:
# Download the two raw review tables from the project bucket and parse them.
# In both files the first CSV column is used as the DataFrame index.
RAW_BUCKET_NAME = 'jedha-fake-reviews-project'

# real reviews
obj = s3.Object(RAW_BUCKET_NAME, "datasets/real_reviews_raw.csv")
real_csv_bytes = obj.get()['Body'].read()
real = pd.read_csv(io.BytesIO(real_csv_bytes), index_col=0, low_memory=False)

# fake reviews
obj = s3.Object(RAW_BUCKET_NAME, "datasets/fake_reviews_raw.csv")
fake_csv_bytes = obj.get()['Body'].read()
fake = pd.read_csv(io.BytesIO(fake_csv_bytes), index_col=0, low_memory=False)


### Merge both datasets

In [5]:
# Column order of the final dataset: the columns of `real`, followed by
# 'url_not_recommended'.
columns_list_ordered = real.columns.tolist() + ['url_not_recommended']

# Restaurant-level mapping table built from the real reviews: one row per url.
restaurant_columns = ['url', 'restaurant_average_rating', 'restaurant_reviews_count',
                      'restaurant_expensiveness', 'restaurant_name']
pivot_table = real.loc[:, restaurant_columns].drop_duplicates('url')

# Attach the restaurant information to the fake reviews. The left join keeps
# every fake row; urls absent from the mapping table get NaN restaurant columns.
fake_merged = pd.merge(fake, pivot_table, on='url', how='left')

# Put the fake reviews in the same column order as the real ones
# (raises KeyError if one of the expected columns is missing).
fake_dataset_reworked = fake_merged.loc[:, columns_list_ordered]
# second method --> fake_dataset_reworked = fake_merged.reindex(columns = columns_list_ordered)
# this method requires adding suffixes=("_Drop", None) to the previous merge

# Stack both sets of reviews into full_dataset.
full_dataset = pd.concat([real, fake_dataset_reworked])

# If restaurant_expensiveness is '14' or '10' it is a miss-scrap: the information
# is not available on the page, so the value is set to 'N/C'.
miss_scraped_expensiveness = full_dataset['restaurant_expensiveness'].isin(['14', '10'])
full_dataset.loc[miss_scraped_expensiveness, 'restaurant_expensiveness'] = 'N/C'

full_dataset = full_dataset.drop_duplicates().reset_index(drop=True)

# Rebuild the "not recommended reviews" url for every row. The vectorised,
# literal (non-regex) replace leaves a missing url as NaN instead of raising.
full_dataset['url_not_recommended'] = full_dataset['url'].str.replace(
    "biz/", "not_recommended_reviews/", regex=False
)

In [6]:
# Number of rows in the merged dataset
full_dataset.shape[0]

102828

### (Optional) Function to return the language of each review

In [7]:
from langdetect import detect
# def function
def get_lang(x):
    """Return the language detected for ``x``, or ``'not_found'`` on failure.

    ``x`` is converted with ``str()`` before being handed to ``detect``, so
    non-string values (numbers, NaN, None) are accepted.

    Any ordinary exception raised by ``detect`` is turned into the
    ``'not_found'`` marker so that one bad row does not abort a whole
    ``.apply`` run. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
    not caught, so a long run can still be interrupted.
    """
    # NOTE(review): langdetect documents its detection as non-deterministic unless
    # DetectorFactory.seed is set; consider seeding it for reproducible results.
    try:
        lang = detect(str(x))
    except Exception:
        lang = 'not_found'

    return lang

In [8]:
# Detect the language of every review text and store it in a new 'language' column
full_dataset['language'] = full_dataset["text_review"].map(get_lang)

In [9]:
# Display the merged dataset, now including the 'language' column
# (the rendered output is truncated to the first and last rows).
full_dataset

Unnamed: 0,date,username,photos_for_review,rating,text_review,user_location,user_friends_count,user_reviews_count,user_total_image_posted,restaurant_average_rating,restaurant_reviews_count,restaurant_expensiveness,restaurant_name,url,is_real_review,url_not_recommended,language
0,21/08/2007,Not_Yelp_User,0,5.0,Bon retour !\nJe suis revenue dans ce resto ap...,"Levallois-Perret, Hauts-de-Seine",0.0,4.0,0.0,3.0,19.0,N/C,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,1.0,https://www.yelp.fr/not_recommended_reviews/re...,fr
1,140,Not_Yelp_User,0,1.0,"Très déçue!\n\nDes erreurs dans les commandes,...",Paris,11.0,250.0,140.0,3.0,19.0,N/C,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,1.0,https://www.yelp.fr/not_recommended_reviews/re...,fr
2,26/07/2006,Benjamin D.,0,3.0,A optimiser...\nCuisine très traditionnelle da...,"Bron, Rhône",0.0,22.0,0.0,3.0,19.0,N/C,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,1.0,https://www.yelp.fr/not_recommended_reviews/re...,fr
3,14/10/2004,Not_Yelp_User,0,3.0,Brasserie chic\nUne brasserie authentiquement ...,Marseille,11.0,155.0,0.0,3.0,19.0,N/C,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,1.0,https://www.yelp.fr/not_recommended_reviews/re...,fr
4,11/02/2007,Not_Yelp_User,0,4.0,Tres bien\nPetit diner entre amis. Les plats e...,"Boulogne-Billancourt, Hauts-de-Seine",0.0,10.0,0.0,3.0,19.0,N/C,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,1.0,https://www.yelp.fr/not_recommended_reviews/re...,fr
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102823,03/04/2014,Biz N.,-1,5.0,"Du choix, un service extrêmement rapide, le re...","Franconville, Val-d'Oise",0.0,2.0,,5.0,6.0,1,Good Time,https://www.yelp.fr/biz/good-time-montigny-l%C...,0.0,https://www.yelp.fr/not_recommended_reviews/go...,fr
102824,23/08/2010,Not Yelp User,-1,5.0,"Vraiment un des meilleur kebab du coin, servic...","Bezons, Val-d'Oise",1.0,4.0,,5.0,6.0,1,Good Time,https://www.yelp.fr/biz/good-time-montigny-l%C...,0.0,https://www.yelp.fr/not_recommended_reviews/go...,fr
102825,03/11/2016,Marissa S.,-1,1.0,Très déçu!!!\nCe soir j'ai eu envie de manger ...,"Sannois, Val-d'Oise",0.0,1.0,,3.0,1.0,N/C,Restaurant Istanbul,https://www.yelp.fr/biz/restaurant-istanbul-fr...,0.0,https://www.yelp.fr/not_recommended_reviews/re...,fr
102826,18/04/2017,Costanovic G.,-1,5.0,J'y vais depuis le début mais j'avoue qu'avec ...,"Franconville, Val-d'Oise",0.0,1.0,,3.0,1.0,N/C,Restaurant Istanbul,https://www.yelp.fr/biz/restaurant-istanbul-fr...,0.0,https://www.yelp.fr/not_recommended_reviews/re...,fr


### Push to bucket

In [10]:
# Upload the merged dataset to the project bucket
bucket = s3.Bucket(name="jedha-fake-reviews-project")
# Object key under which the CSV is stored
PATH = "datasets/full_dataset.csv"

# Serialise the DataFrame to CSV text in memory (no local file is written)
data = full_dataset.to_csv()
put_object = bucket.put_object(Key=PATH, Body=data, ACL='private')

# Check: print the key of every object in the bucket
for obj in bucket.objects.all():
    print(obj.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/prediction_meta_data.csv
datasets/predictions_svm_nlp.csv
datasets/real_reviews_raw.csv
datasets/svc_predictions_meta_data.csv


In [11]:
# Print the key of every object stored in the project bucket
bucket = s3.Bucket("jedha-fake-reviews-project")
for obj in bucket.objects.all():
    print(obj.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/real_reviews_raw.csv


In [19]:
# Reload the merged dataset from the bucket (first CSV column is the index)
obj = s3.Object('jedha-fake-reviews-project', "datasets/full_dataset.csv")
full_dataset_bytes = obj.get()['Body'].read()
full_dataset = pd.read_csv(io.BytesIO(full_dataset_bytes), index_col=0, low_memory=False)

### Clean dataset


In [None]:
#_____________________________________________________________________
######### Cleaning the dataset and adding new columns #########
#_____________________________________________________________________

# The cleaning starts from `full_dataset` (the merged dataset loaded from the
# bucket) and works on a copy, so `full_dataset` is left untouched and this cell
# gives the same result every time it is re-run.
# We drop the rows in which the restaurant infos are not available (miss-scraped).
restaurant_info_columns = ['restaurant_average_rating', 'restaurant_reviews_count',
                           'restaurant_expensiveness', 'restaurant_name']
dataset = full_dataset.dropna(subset=restaurant_info_columns).copy()

# Adding a column with the length of the text review.
# .str.len() yields NaN for a missing review, where len() on NaN would raise.
dataset['text_length'] = dataset['text_review'].str.len()

#_____________________________________________________________________
######### Fixing existing columns values and types #########
#_____________________________________________________________________

# For the user_total_image_posted column: if the value is NA it means there is
# no image, so we set the value to 0.
dataset['user_total_image_posted'] = dataset['user_total_image_posted'].fillna(0)

# For the date column, there are some miss-scraps that we want to fix.
    # A correct date has a length of 10; if it is shorter than 10 it is because we
    # scraped the number of images of the user instead.
    # We may have to scrape those lines again to fix it.
    # We keep only the rows where the date is long enough (a missing date has no
    # length, fails the comparison and is dropped as well).
has_plausible_date = dataset['date'].str.len() >= 10
dataset = dataset.loc[has_plausible_date, :].copy()
    # If the length is greater than 10 it is because we scraped the date plus some
    # additional words ('Avis mis à jour'), so we keep only the first line, which
    # holds the date.
mask_date_to_fix = dataset['date'].str.len() > 10
dataset.loc[mask_date_to_fix, 'date'] = dataset.loc[mask_date_to_fix, 'date'].str.split('\n').str[0]
    # Finally we can convert the date column to a datetime format. The scraped
    # dates are written day first (e.g. 21/08/2007), so dayfirst=True is required:
    # otherwise an ambiguous date such as 03/04/2014 is read as March 4th.
dataset['date'] = pd.to_datetime(dataset['date'], dayfirst=True)

# For the language column:
    # we set all languages other than fr and en to 'other' to avoid having lots of
    # categories (causing trouble with OneHotEncoder).
mask_is_not_fr_en = ~dataset['language'].isin(['fr', 'en'])
dataset.loc[mask_is_not_fr_en, 'language'] = 'other'

# For the username column:
    # setting 'Not Yelp User' to 'Not_Yelp_User' so that a single spelling is used.
dataset.loc[dataset['username'] == 'Not Yelp User', 'username'] = 'Not_Yelp_User'

# For the photos_for_review column:
    # value -1 is in fact 0 (no photos found by the scraper); it can be written
    # '-1' or '-1.0' depending on how the column was serialised, so both are handled.
    # Value L is in fact 0 too (no photos found by the scraper, which scraped the
    # first letter of "L'avis du jour"; this happens when the review was updated
    # by the user).
photos_as_text = dataset['photos_for_review'].astype(str)
no_photo_markers = ['-1', '-1.0', 'L']
photos_as_text = photos_as_text.mask(photos_as_text.isin(no_photo_markers), '0')
    # Finally we can convert the photos_for_review column to an int format.
dataset['photos_for_review'] = pd.to_numeric(photos_as_text).astype('int')

# For the restaurant_expensiveness column:
    # when there is no info about the expensiveness ('N/C') we set it to -1.
dataset.loc[dataset['restaurant_expensiveness'] == 'N/C', 'restaurant_expensiveness'] = -1
    # We can then convert the restaurant_expensiveness column to an int format.
dataset['restaurant_expensiveness'] = dataset['restaurant_expensiveness'].astype('int')

# For the is_real_review column:
    # we reverse the values 0 and 1 and rename the column is_fake_review; it makes
    # our work easier with sklearn features. Any value other than 0 (including a
    # missing one) becomes 0.
dataset['is_real_review'] = (dataset['is_real_review'] == 0).astype(int)
dataset = dataset.rename(columns={'is_real_review': 'is_fake_review'})

# reset index
dataset = dataset.reset_index(drop=True)
