# Import and Clean DS

In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
import fr_core_news_sm

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


In [4]:
# Load the fr_core_news_sm spaCy pipeline; `nlp` is called on each review further down
# to obtain token lemmas.
nlp = fr_core_news_sm.load()

In [1]:
# Change the kernel's working directory (IPython `cd` automagic).
# NOTE(review): hard-coded absolute path to one machine -- presumably needed so that the
# local `s3_credentials` module and relative file names resolve; make it configurable.
cd '/Users/admin/Jedha/'

/Users/admin/Jedha


In [2]:
# Import only the two credential names this notebook reads (both are passed to
# boto3.Session). A wildcard import hides where names come from and can silently
# shadow notebook variables.
from s3_credentials import YOUR_ACCESS_KEY, YOUR_SECRET_KEY

In [5]:
# Open an authenticated AWS session and derive both S3 handles from it.
# YOUR_ACCESS_KEY / YOUR_SECRET_KEY must already be in scope: they are not assigned
# in this cell.
import boto3

session = boto3.Session(
    aws_access_key_id=YOUR_ACCESS_KEY,
    aws_secret_access_key=YOUR_SECRET_KEY,
)

# `s3` is the resource-style handle, `client` the client-style handle.
s3, client = session.resource("s3"), session.client("s3")

In [6]:
import pandas as pd
import glob
import io

In [9]:
# Full dataset: fetch the CSV object from the project bucket.
obj = s3.Object('jedha-fake-reviews-project', "datasets/full_dataset.csv")
# Read the object's bytes into an in-memory buffer and parse them as CSV.
# The first CSV column becomes the index; low_memory=False disables chunked parsing.
dataset = pd.read_csv(io.BytesIO(obj.get()['Body'].read()), low_memory = False, index_col=0)

In [10]:
#_____________________________________________________________________
######### Cleaning the dataset and adding new columns #########
#_____________________________________________________________________

# Drop rows whose restaurant information is missing (scraping failures).
dataset = dataset.dropna(subset = ['restaurant_average_rating', 'restaurant_reviews_count', 'restaurant_expensiveness', 'restaurant_name'])

# New column: length of the review text in characters. The vectorised .str accessor
# yields NaN for a missing review instead of raising inside a per-row len() call.
dataset['text_length'] = dataset['text_review'].str.len()

#_____________________________________________________________________
######### Fixing existing columns values and types #########
#_____________________________________________________________________

# user_total_image_posted: a missing value means the user posted no image, so use 0.
dataset.loc[dataset['user_total_image_posted'].isna(), 'user_total_image_posted'] = 0

# date: repair scraping errors.
#   * A valid date is at least 10 characters long. Shorter values are the user's image
#     count scraped by mistake, so those rows are dropped (they would need re-scraping).
#     Rows with a missing date are dropped as well: NaN fails the ">= 10" test.
#   * Longer values hold the date followed by extra words ('Avis mis à jour') after a
#     line break, so only the first line is kept.
mask_not_date = ~(dataset['date'].str.len() >= 10)
# .copy() makes the filtered frame independent, so the assignments below never write
# through a slice of the previous frame (no SettingWithCopyWarning).
dataset = dataset.loc[~mask_not_date, :].copy()
mask_date_to_fix = dataset['date'].str.len() > 10
dataset.loc[mask_date_to_fix, 'date'] = dataset.loc[mask_date_to_fix, 'date'].str.split('\n').str[0]
# Convert the repaired strings to datetime.
# NOTE(review): no explicit format is given; if the scraped dates are day-first, ambiguous
# values may be parsed month-first -- confirm and pass format=/dayfirst= if needed.
dataset['date'] = pd.to_datetime(dataset['date'])

# photos_for_review: two scraper artefacts both mean "no photo" and become '0':
#   * '-1.0' -> no photos found by the scraper
#   * 'L'    -> first letter of "L'avis du jour", scraped when the review was updated
no_photo_markers = dataset['photos_for_review'].isin(['-1.0', 'L'])
dataset.loc[no_photo_markers, 'photos_for_review'] = '0'
dataset['photos_for_review'] = dataset['photos_for_review'].astype('int')

# restaurant_expensiveness: 'N/C' (no information) is encoded as -1, then cast to int.
dataset.loc[dataset['restaurant_expensiveness'] == 'N/C', 'restaurant_expensiveness'] = -1
dataset['restaurant_expensiveness'] = dataset['restaurant_expensiveness'].astype('int')

# Flip the label so that the positive class (1) is "fake": 1 where is_real_review == 0,
# otherwise 0. The comparison is vectorised; no string round-trip is needed.
dataset["is_fake_review"] = (dataset["is_real_review"] == 0).astype(int)
dataset = dataset.drop(columns="is_real_review")

# Rows were dropped above, so rebuild a contiguous 0..n-1 index.
dataset = dataset.reset_index(drop = True)


In [11]:
# Keep only the reviews whose language is 'fr', with just the text and the label,
# and renumber the rows from 0.
french_reviews = dataset.loc[dataset['language'] =='fr',['text_review', 'is_fake_review']].reset_index(drop=True)

# Preprocessing for NLP

In [14]:
# Work on a copy so that the NLP preprocessing does not modify `french_reviews`.
data = french_reviews.copy()

In [27]:
from unidecode import unidecode

In [28]:
# Clean and lemmatize the review text.

# 1. Replace every run of characters that are not letters (plain or accented), digits,
#    apostrophes or spaces -- line breaks included -- with one space, squeeze repeated
#    spaces, then lowercase.
#    * regex=True is explicit: pandas >= 2.0 treats the pattern as a literal string by
#      default, which would silently skip this cleaning.
#    * The letter range is A-Za-z; an A-z range would also match [ \ ] ^ _ and `.
data["text_review_clean"] = (
    data["text_review"]
    .str.replace(r"[^A-Za-zÀ-ÿ0-9' ]+", " ", regex=True)
    .str.replace(r" +", " ", regex=True)
    .str.lower()
)

# 2. Transliterate to plain ASCII; str() guards against missing values.
data['text_review_clean'] = data['text_review_clean'].apply(lambda x : unidecode(str(x)))

# 3. Lemmatize with spaCy and drop every token whose lemma is in STOP_WORDS.
# NOTE(review): the text is already transliterated at this point, so any accented entry
# of STOP_WORDS can presumably no longer match -- confirm whether that is intended.
data["text_review_clean"] = data["text_review_clean"].apply(
    lambda x: " ".join([token.lemma_ for token in nlp(x) if token.lemma_ not in STOP_WORDS])
)



In [13]:
# Load reviews from a local CSV file, using its first column as the index.
# NOTE(review): this rebinds `data`, replacing the frame produced by the cleaning cell;
# the file is presumably a cached result of that cleaning -- confirm which version it holds.
data = pd.read_csv('dataset_clean_v2.csv', index_col= 0)

# Tokenizing, lemmatizing and deleting stop words from documents with spaCy


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [16]:
# Cast the cleaned text column to str before it is passed to the vectorizer.
# NOTE(review): missing values presumably become the literal text 'nan' -- confirm that
# this is acceptable for the vocabulary.
data['text_review_clean'] = data['text_review_clean'].astype(str)

In [158]:
# Build a TF-IDF vectorizer for the cleaned reviews; min_df=200 ignores terms that
# appear in fewer than 200 reviews.
vectorizer = TfidfVectorizer(smooth_idf=True, min_df=200)
# Learn the vocabulary and produce the TF-IDF matrix X (one row per review).
X = vectorizer.fit_transform(data['text_review_clean'])

In [159]:
# Number of terms kept in the fitted vocabulary.
len(vectorizer.vocabulary_)

2102

# Topic Extraction

In [19]:
# import from sklearn
from sklearn.decomposition import TruncatedSVD

In [160]:
# Latent semantic analysis: reduce the TF-IDF matrix to 500 SVD components.
# random_state pins the randomized solver, so the components -- and every model trained
# on them -- are reproducible across kernel restarts.
svd = TruncatedSVD(n_components=500, random_state=0)
# Fit on the TF-IDF matrix; `lsa` has one row per review and one column per component.
lsa = svd.fit_transform(X)

In [165]:
# Persist the LSA matrix to a local CSV file.
pd.DataFrame(lsa).to_csv('lsa.csv')

In [21]:
# Total share of variance explained by the retained SVD components.
print(svd.explained_variance_ratio_.sum())

0.5976252366728456


# Clean Data For Classifier

In [217]:
# Character length of each raw review (values are passed through str() first), then
# split at the median into two bins. The resulting Series keeps the name 'text_review',
# which the feature-selection cell relies on.
review_chars = data["text_review"].map(lambda text: len(str(text)))
len_review = pd.qcut(review_chars, q=2, labels=['low', 'high'])


In [218]:
# Number of whitespace-separated words written entirely in upper case.
data['upper_word_count'] = data['text_review'].apply(
    lambda review: sum(1 for word in review.split() if word.isupper())
)
# Rank first (ties broken by order of appearance) so that the three bins can always be
# formed, even when many reviews share the same count.
upper_word_count = pd.qcut(
    data['upper_word_count'].rank(method='first'),
    q=3,
    labels=['low', 'mid', 'high'],
)

In [219]:
# Number of exclamation marks in each review.
data['exclam_count'] = data['text_review'].apply(lambda review: review.count('!'))
# Rank first (ties broken by order of appearance), then cut into three bins.
exclam_count = pd.qcut(
    data['exclam_count'].rank(method='first'),
    q=3,
    labels=['low', 'high', 'very_high'],
)


In [220]:
 # Assemble the classifier's feature table: LSA components plus the three binned features.
 # * index=data.index aligns the LSA rows with the Series derived from `data`; concat
 #   joins on index labels, so a non-default index on `data` would otherwise produce
 #   NaN-padded, misaligned rows.
 # * The 'lsa_' prefix makes every column label a string; recent scikit-learn releases
 #   reject DataFrames whose column labels mix int and str.
 df = pd.concat(
     [
         pd.DataFrame(lsa, index=data.index).add_prefix('lsa_'),
         len_review,
         exclam_count,
         upper_word_count,
     ],
     axis=1,
 )

In [221]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [222]:
# Target vector for the classifier (presumably 1 = fake review, 0 = genuine review).
y = data["is_fake_review"]

In [223]:
# Hold out 20% of the rows as a test set, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(df,y, 
                                                    test_size = 0.2,
                                                    stratify = y , # stratify on the label so both splits keep the same class ratio
                                                    random_state = 19)

In [227]:
# Column positions (in df / X_train / X_test) of the binned features, which are
# one-hot encoded. 'text_review' is the name carried by the binned review length.
categorical_features = [
    position
    for position, column in enumerate(df.columns)
    if column in ('text_review', 'exclam_count', 'upper_word_count')
]
# Every remaining position is a numeric column and gets scaled.
numerical_features = [
    position
    for position in range(len(df.columns))
    if position not in categorical_features
]

# Numeric columns: standardise.
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns: one-hot encode, dropping the first level of each feature to
# avoid creating correlated (redundant) indicator columns.
categorical_transformer = Pipeline([('encoder', OneHotEncoder(drop='first'))])

In [228]:
# Combine both treatments in one ColumnTransformer: each transformer is applied to its
# own list of column positions, numeric block first, categorical block second.
preprocessor = ColumnTransformer([
    ('scaler', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [229]:
# Shape of the training features before preprocessing.
X_train.shape

(72477, 503)

In [230]:
# Fit the preprocessor on the training split only and transform that split.
# NOTE(review): X_train / X_test are overwritten in place, so this cell is not
# idempotent -- re-running it feeds already-transformed data back into the preprocessor.

X_train = preprocessor.fit_transform(X_train)

# Apply the transformation fitted on the training split to the test split.
X_test = preprocessor.transform(X_test) 

# Whole set (disabled)

#X_w = preprocessor.transform(df)


In [231]:
# Shape of the training features after scaling and one-hot encoding.
X_train.shape

(72477, 505)

In [72]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


In [90]:
# 5-fold stratified, shuffled cross-validation with a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# C and gamma are fixed to a single value each; the search only compares four
# class-weighting schemes.
parameters = {
    'C': [10],
    'gamma': [1],
    "class_weight": [
        {1: 0.67, 0: 0.33},
        {1: 0.75, 0: 0.25},
        {1: 0.8, 0: 0.2},
        "balanced",
    ],
}

# Grid search over an SVC, scored with F1.
model = SVC()
model_svc = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring="f1",
    cv=kfold,
    verbose=2,
)
model_svc.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.9min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.9min remaining:    0.0s


[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.6min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.7min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.8min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.7min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1, total= 1.9min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1, total= 1.8min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1, total= 1.6min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 43.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVC(),
             param_grid={'C': [10],
                         'class_weight': [{0: 0.33, 1: 0.67},
                                          {0: 0.25, 1: 0.75}, {0: 0.2, 1: 0.8},
                                          'balanced'],
                         'gamma': [1]},
             scoring='f1', verbose=2)

In [232]:
# Train model
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=0.95 in which class 1 weighs 2.1 times as much as class 0.
# Every other hyper-parameter stays at the library default, which is why the fitted
# estimator's repr lists only these two arguments.
model = LogisticRegression(C=0.95, class_weight={0: 1, 1: 2.1})

model.fit(X_train, y_train)

LogisticRegression(C=0.95, class_weight={0: 1, 1: 2.1})

In [233]:
# F1 of `model` on the held-out test split and on the training split; comparing the
# two shows how much the model overfits.
from sklearn.metrics import f1_score
print('f1 test :',f1_score(y_test, model.predict(X_test)))
print('f1 train :',f1_score(y_train, model.predict(X_train)))

f1 test : 0.695471744171854
f1 train : 0.7045908797392455


In [164]:
# SVC with library-default hyper-parameters (C=1.0, RBF kernel, gamma='scale', ...),
# except that class 1 weighs 1.8 times as much as class 0.
svc_model = SVC(class_weight={0: 1, 1: 1.8})

svc_model.fit(X_train, y_train)

In [None]:
# F1 of `svc_model` on the held-out test split and on the training split.
from sklearn.metrics import f1_score
print('f1 test :',f1_score(y_test, svc_model.predict(X_test)))
print('f1 train :',f1_score(y_train, svc_model.predict(X_train)))

In [95]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

# Print accuracy, precision, recall and F1 for the test split, then for the train split.
# NOTE(review): `test_pred` and `train_pred` are not assigned in any cell shown in this
# notebook -- they must already exist in the kernel for this cell to run.
score_functions = [
    ('Accuracy Score : {}', accuracy_score),
    ('Precision Score : {}', precision_score),
    ('Recall Score : {}', recall_score),
    ('F1 Score : {}', f1_score),
]

print("Scores for model on test set")
print("")
for template, score_function in score_functions:
    print(template.format(str(score_function(y_test, test_pred))))

print("")
print("")
print("Scores for model on train set")
print("")
for template, score_function in score_functions:
    print(template.format(str(score_function(y_train, train_pred))))

Scores for model on test set

Accuracy Score : 0.8770317615300672
Precision Score : 0.5947910357359176
Recall Score : 0.7097940007228045
F1 Score : 0.6472235953204811


Scores for model on train set

Accuracy Score : 0.8775703618609995
Precision Score : 0.5962121212121212
Recall Score : 0.7111874209289716
F1 Score : 0.6486441935218


In [97]:
# SVC using one of the class-weight settings from the grid-search cell (C=10, gamma=1),
# with probability=True so that the fitted model can output class probabilities.
svc_clf2 = SVC(C=10, class_weight={0: 0.33, 1: 0.67}, gamma=1,  probability=True)

In [98]:
# Fit the probability-enabled SVC on the preprocessed training split.
svc_clf2.fit(X_train,y_train)

SVC(C=10, class_weight={0: 0.33, 1: 0.67}, gamma=1, probability=True)

In [110]:
# Upload the SVM predictions to the project bucket as a CSV file.
# NOTE(review): `predictions_svm_nlp` is not created in any cell shown in this notebook --
# it must already exist in the kernel for this cell to run.

# Destination key and bucket.
PATH = "datasets/predictions_svm_nlp.csv"
bucket = s3.Bucket(name = "jedha-fake-reviews-project")

# Serialise to a CSV string under its own name: binding it to `data` would replace the
# review DataFrame and break every earlier cell on re-run.
predictions_csv = predictions_svm_nlp.to_csv()

# Upload with a private ACL.
put_object = bucket.put_object(ACL='private', Key=PATH, Body=predictions_csv)

# Check: list every key in the bucket. A dedicated loop name leaves the earlier `obj`
# (the dataset's S3 object) untouched.
for bucket_object in bucket.objects.all():
    print(bucket_object.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/predictions_svm_nlp.csv
datasets/real_reviews_raw.csv


In [192]:
import joblib

In [193]:

# Save the fitted TF-IDF vectorizer to a file in the current working directory.
joblib_file = "text_vectorizer.pkl"
joblib.dump(vectorizer, joblib_file)




['text_vectorizer.pkl']

In [194]:
# Save the fitted TruncatedSVD (`svd`) to a file in the current working directory.
joblib_file = "topic_extractor.pkl"
joblib.dump(svd, joblib_file)


['topic_extractor.pkl']

In [195]:
# Save the estimator currently bound to `model` to a file in the current working directory.
# NOTE(review): `model` is bound to SVC() in the grid-search cell and to a
# LogisticRegression in the training cell; which one is saved depends on execution order.
joblib_file = "main_model.pkl"
joblib.dump(model, joblib_file)

['main_model.pkl']

In [234]:
# Save the fitted ColumnTransformer (`preprocessor`) to a file in the current working directory.
joblib_file = "preprocessor.pkl"
joblib.dump(preprocessor, joblib_file)

['preprocessor.pkl']