# Import and Clean DS

In [2]:
# connect to s3 instance
import boto3
YOUR_ACCESS_KEY = 

YOUR_SECRET_KEY = 

session = boto3.Session(aws_access_key_id= YOUR_ACCESS_KEY, 
                        aws_secret_access_key= YOUR_SECRET_KEY)

s3 = session.resource("s3")
client = session.client("s3")

In [3]:
import pandas as pd
import glob
import io

In [4]:
#full DS
obj = s3.Object('jedha-fake-reviews-project', "datasets/full_dataset.csv")
dataset = pd.read_csv(io.BytesIO(obj.get()['Body'].read()), low_memory = False, index_col=0)

In [5]:
#_____________________________________________________________________
######### Cleaning the dataset and adding new columns #########
#_____________________________________________________________________

#we drop rows in which restaurant infos are not available (miss scraped)
dataset = dataset.dropna(subset = ['restaurant_average_rating', 'restaurant_reviews_count', 'restaurant_expensiveness', 'restaurant_name'])

#adding a column with the length of the text review
dataset['text_length'] = dataset['text_review'].astype(str).apply(lambda x : len(x))

#_____________________________________________________________________
######### Fixing existing columns values and types #########
#_____________________________________________________________________

#for the user_total_image_posted column, if user_total_image_posted is NA it means there is there's no image
    # so we set the value to 0
dataset.loc[dataset['user_total_image_posted'].isna(), 'user_total_image_posted'] = 0

#for the date column,  there is some miss scraps that we want to fix
    # a correct data must have a length of 10 , if it is smaller than 10 it's becasue we scrapped the number of images of the user instead
    # we may have to scrap again those lines to fix it
    # we keep only the rows where the date is correct 
mask_not_date = dataset['date'].astype(str).apply(lambda x: len(x)) < 10
dataset = dataset.loc[mask_not_date == False, :]
    # if te length is greater than 10 is it is beacause we scraped the date + somme additional words ('Avis mis à jour') so we will keep only the part with the date
mask_date_to_fix = dataset['date'].apply(lambda x: len(x)) > 10
dataset.loc[mask_date_to_fix, 'date' ] = dataset.loc[mask_date_to_fix, 'date' ].str.split('\n').str[0]
    #finally we can convert the date column to a datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

#for the photos_for_review column, 
    # value -1 is in fact 0 (no photos found by the scraper)
dataset.loc[dataset['photos_for_review'] == '-1.0', 'photos_for_review' ] = '0'
    # value L is in fact 0 (no photos found by the scraper but scraped the first letter of "L'avis du jour" which happens when the reviews was updated by the user)
dataset.loc[dataset['photos_for_review'] == 'L', 'photos_for_review' ] = '0'
    # finally we can convert the photos_for_review column to an int format
dataset['photos_for_review'] = dataset['photos_for_review'].astype('int')

#for the photos_for_review column, 
    # when there's no info about the expensiveness we set it to -1
dataset.loc[dataset['restaurant_expensiveness'] == 'N/C', 'restaurant_expensiveness']  = -1
    # we can convert the restaurant_expensiveness column to an int format
dataset['restaurant_expensiveness'] = dataset['restaurant_expensiveness'].astype('int')

# change is real review for is fake review as it's better for sklearn 
dataset["is_fake_review"] = dataset["is_real_review"].apply(lambda x: '1' if x == 0 else '0')
dataset["is_fake_review"] = dataset["is_fake_review"].astype(int)
dataset = dataset.drop(columns="is_real_review")

# reset index 
dataset = dataset.reset_index(drop = True)


In [6]:
french_reviews = dataset.loc[dataset['language'] =='fr',['text_review', 'is_fake_review']].reset_index(drop=True)

In [64]:
french_reviews

Unnamed: 0,text_review,is_fake_review
0,Bon retour !\nJe suis revenue dans ce resto ap...,0
1,A optimiser...\nCuisine très traditionnelle da...,0
2,Brasserie chic\nUne brasserie authentiquement ...,0
3,Tres bien\nPetit diner entre amis. Les plats e...,0
4,Un bistrot bien sympathique\nNous avons mangé ...,0
...,...,...
90592,"Du choix, un service extrêmement rapide, le re...",1
90593,"Vraiment un des meilleur kebab du coin, servic...",1
90594,Très déçu!!!\nCe soir j'ai eu envie de manger ...,1
90595,J'y vais depuis le début mais j'avoue qu'avec ...,1


# Preprocessing for NLP

In [7]:
import pandas as pd
import numpy as np 
import spacy
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import fr_core_news_md

In [8]:
data = french_reviews.copy()

In [9]:

# strip
data["text_review_clean"] = data["text_review"].str.strip()

#lower
data["text_review_clean"] = data["text_review_clean"].str.lower()

data["text_review_clean"] = data["text_review_clean"].str.replace('\n', ' ')

# clean html
data["text_review_clean"] = data["text_review_clean"].str.replace(r"<[a-z/]+>", " ")

# keep only text and apostrophes 
data["text_review_clean"] = data["text_review_clean"].str.replace(r"[^A-zÀ-ÿ0-9' ]+", " ").astype(str)

# remove accents
from unidecode import unidecode 
data['text_review_clean'] = data['text_review_clean'].apply(lambda x : unidecode(str(x)))


# Tokenizing, lemmatizing and deleteing stopwords with Spacy


In [10]:
# create nlp instance
nlp =  fr_core_news_md.load()

   ### Creating Common Words list (optional)

In [11]:
# first let's find the count of all words and return them in the form of dict items
from collections import Counter

word_count = Counter(' '.join(data["text_review_clean"]).split()).items() #
print(len(word_count))

82648


In [12]:
# create df with all words and their count
word_count = pd.DataFrame({'word': [item[0] for item in list(word_count)], 
             'count' : [item[1] for item in list (word_count)]})

# format
word_count = word_count.sort_values('count', ascending = False)

In [13]:
# take all words that occur more than 500 times
commonwords = word_count.loc[word_count["count"]>=2000, :]
commonwords

Unnamed: 0,word,count
12,de,265651
29,et,225477
17,le,194982
19,a,193273
21,la,183396
...,...,...
1704,original,2007
549,samedi,2007
1029,gens,2004
4911,ingredients,2001


In [14]:

# lemmatize common words 
commonwords["word"] = commonwords["word"].apply(lambda x: nlp(x))
commonwords["word"] = commonwords["word"].apply(lambda x: [token.lemma_ for token in x])

# join
commonwords["word"] = commonwords["word"].str.join("")

commonwords


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commonwords["word"] = commonwords["word"].apply(lambda x: nlp(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commonwords["word"] = commonwords["word"].apply(lambda x: [token.lemma_ for token in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  commonwords["word"] = commonwords["word"].str.join

Unnamed: 0,word,count
12,de,265651
29,et,225477
17,le,194982
19,avoir,193273
21,le,183396
...,...,...
1704,original,2007
549,samedi,2007
1029,gens,2004
4911,ingredient,2001


In [17]:

# make list
common_words = commonwords.word
common_words


# append to stopwords 
from spacy.lang.fr.stop_words import STOP_WORDS
print(len(STOP_WORDS))
STOP_WORDS_MAX = STOP_WORDS.union(common_words)

# also add the lemmatizer for pronouns as we won't need them
STOP_WORDS_MAX.add("-PRON-")
print(len(STOP_WORDS_MAX))

600
885


### Tokenizing and Lemmatizing

In [15]:

#  apply nlp to transform into doc
data["clean_tokens"] = data["text_review_clean"].apply(lambda x: nlp(x))
data.head(5)

Unnamed: 0,text_review,is_fake_review,text_review_clean,clean_tokens
0,Bon retour !\nJe suis revenue dans ce resto ap...,0,bon retour je suis revenue dans ce resto apr...,"(bon, retour, , je, suis, revenue, dans, ce,..."
1,A optimiser...\nCuisine très traditionnelle da...,0,a optimiser cuisine tres traditionnelle dans ...,"(a, optimiser, , cuisine, tres, traditionnell..."
2,Brasserie chic\nUne brasserie authentiquement ...,0,brasserie chic une brasserie authentiquement p...,"(brasserie, chic, une, brasserie, authentiquem..."
3,Tres bien\nPetit diner entre amis. Les plats e...,0,tres bien petit diner entre amis les plats et...,"(tres, bien, petit, diner, entre, amis, , les..."
4,Un bistrot bien sympathique\nNous avons mangé ...,0,un bistrot bien sympathique nous avons mange e...,"(un, bistrot, bien, sympathique, nous, avons, ..."


In [18]:
# lemmatize each token and remove stop words --> could be done in two steps but we do it in one
data['clean_tokens_lemmatized'] = data['clean_tokens'].apply(lambda doc: [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS])
data.head(5)

### join all of them into new df column
data["clean_review"] = data["clean_tokens_lemmatized"].str.join(" ")

In [19]:

print(data.shape)
data.sample(5)

(90597, 6)


Unnamed: 0,text_review,is_fake_review,text_review_clean,clean_tokens,clean_tokens_lemmatized,clean_review
82944,Un Régal!!!!,1,un regal,"(un, regal)",[regal],regal
51675,Une grande découverte il y a deux ou trois ans...,0,une grande decouverte il y a deux ou trois ans...,"(une, grande, decouverte, il, y, a, deux, ou, ...","[grand, decouverte, y, an, , aller, repeter,...",grand decouverte y an aller repeter tre...
88290,Excellent bô bun frais ! Très bonne maison qui...,1,excellent bo bun frais tres bonne maison qui...,"(excellent, bo, bun, frais, , tres, bonne, m...","[excellent, bo, bun, frais, , tre, bon, mais...",excellent bo bun frais tre bon maison 20 an
41738,"Un régal!Le prix m'a un peu calmé, (15€ pour u...",0,un regal le prix m'a un peu calme 15 pour u...,"(un, regal, le, prix, m', a, un, peu, calme, ...","[regal, prix, calme, , 15, , entree, , del...",regal prix calme 15 entree delecte pers...
80576,"Une pizzeria, une vraie ! Des pizzas faites av...",1,une pizzeria une vraie des pizzas faites av...,"(une, pizzeria, , une, vraie, , des, pizzas...","[pizzeria, , vrai, , pizza, faire, produit,...",pizzeria vrai pizza faire produit frais ...


# Creating a TFIDF Matrix


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [21]:
# apply vectorizer to the review column
vectorizer = TfidfVectorizer(smooth_idf=True, min_df=200)
X = vectorizer.fit_transform(data['clean_review'])

In [22]:
len(vectorizer.vocabulary_)

2022

In [23]:
# transform this sparse matrix into a numpy array 
X_dense = X.toarray()
print(X_dense.shape)

# Let's put the matrix into a DF with the feature name (ie word) as column title and the document number as ID
# this is easily doable because the get_feature_names method of vectorizer returns the feature names 
# with the same index as their values in the X_dense matrix
X_df = pd.DataFrame(X_dense, 
             columns=[x for x in vectorizer.get_feature_names()], 
             index=["review_{}".format(i) for i in range (1,len(data)+1)])

(90597, 2022)


In [24]:
print(X_df.shape)
X_df

(90597, 2022)


Unnamed: 0,10,100,11,12,13,13em,13h,14,15,16,...,week,weekend,wifi,wok,yaourt,yelp,yelpeur,yer,york,zen
review_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
review_90593,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_90594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_90595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
review_90596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Extraction

In [25]:
# import from sklearn
from sklearn.decomposition import TruncatedSVD

In [26]:
# set it to N different topics 
svd = TruncatedSVD(n_components= 100)

# fit to our matrix --> last two columns are those with the previous cluster_values
lsa = svd.fit_transform(X_df)

In [27]:
print(svd.explained_variance_ratio_.sum())

0.2590909216892909


In [28]:
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_{}".format(i) \
                                                for i in range(1,(lsa.shape[1]+1))]\
                               )
topic_encoded_df.head()
print(topic_encoded_df.shape)

(90597, 100)


# Preprocess  Data For Model

### Transformations and some featuring engineering

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [127]:
# make copy just in case
data_cl = topic_encoded_df.copy()

In [128]:
dataset

Unnamed: 0,date,username,photos_for_review,rating,text_review,user_location,user_friends_count,user_reviews_count,user_total_image_posted,restaurant_average_rating,restaurant_reviews_count,restaurant_expensiveness,restaurant_name,url,url_not_recommended,language,text_length,is_fake_review
0,2007-08-21,Not_Yelp_User,0,5.0,Bon retour !\nJe suis revenue dans ce resto ap...,"Levallois-Perret, Hauts-de-Seine",0.0,4.0,0.0,3.0,19.0,-1,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,https://www.yelp.fr/not_recommended_reviews/re...,fr,359,0
1,2006-07-26,Benjamin D.,0,3.0,A optimiser...\nCuisine très traditionnelle da...,"Bron, Rhône",0.0,22.0,0.0,3.0,19.0,-1,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,https://www.yelp.fr/not_recommended_reviews/re...,fr,256,0
2,2004-10-14,Not_Yelp_User,0,3.0,Brasserie chic\nUne brasserie authentiquement ...,Marseille,11.0,155.0,0.0,3.0,19.0,-1,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,https://www.yelp.fr/not_recommended_reviews/re...,fr,323,0
3,2007-11-02,Not_Yelp_User,0,4.0,Tres bien\nPetit diner entre amis. Les plats e...,"Boulogne-Billancourt, Hauts-de-Seine",0.0,10.0,0.0,3.0,19.0,-1,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,https://www.yelp.fr/not_recommended_reviews/re...,fr,247,0
4,2012-04-25,Not_Yelp_User,0,4.0,Un bistrot bien sympathique\nNous avons mangé ...,Montpellier,0.0,2.0,0.0,3.0,19.0,-1,restaurant anatole,https://www.yelp.fr/biz/restaurant-anatole-lev...,https://www.yelp.fr/not_recommended_reviews/re...,fr,280,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96243,2014-03-04,Biz N.,0,5.0,"Du choix, un service extrêmement rapide, le re...","Franconville, Val-d'Oise",0.0,2.0,0.0,5.0,6.0,1,Good Time,https://www.yelp.fr/biz/good-time-montigny-l%C...,https://www.yelp.fr/not_recommended_reviews/go...,fr,195,1
96244,2010-08-23,Not Yelp User,0,5.0,"Vraiment un des meilleur kebab du coin, servic...","Bezons, Val-d'Oise",1.0,4.0,0.0,5.0,6.0,1,Good Time,https://www.yelp.fr/biz/good-time-montigny-l%C...,https://www.yelp.fr/not_recommended_reviews/go...,fr,83,1
96245,2016-03-11,Marissa S.,0,1.0,Très déçu!!!\nCe soir j'ai eu envie de manger ...,"Sannois, Val-d'Oise",0.0,1.0,0.0,3.0,1.0,-1,Restaurant Istanbul,https://www.yelp.fr/biz/restaurant-istanbul-fr...,https://www.yelp.fr/not_recommended_reviews/re...,fr,391,1
96246,2017-04-18,Costanovic G.,0,5.0,J'y vais depuis le début mais j'avoue qu'avec ...,"Franconville, Val-d'Oise",0.0,1.0,0.0,3.0,1.0,-1,Restaurant Istanbul,https://www.yelp.fr/biz/restaurant-istanbul-fr...,https://www.yelp.fr/not_recommended_reviews/re...,fr,315,1


In [129]:
data

Unnamed: 0,text_review,is_fake_review,text_review_clean,clean_tokens,clean_tokens_lemmatized,clean_review,len_review,upper_word_count,exclam_count
0,Bon retour !\nJe suis revenue dans ce resto ap...,0,bon retour je suis revenue dans ce resto apr...,"(bon, retour, , je, suis, revenue, dans, ce,...","[bon, , revenir, restaurant, apre, long, abs...",bon revenir restaurant apre long absence 4 ...,359,0,2
1,A optimiser...\nCuisine très traditionnelle da...,0,a optimiser cuisine tres traditionnelle dans ...,"(a, optimiser, , cuisine, tres, traditionnell...","[optimiser, , cuisine, tre, traditionnel, dec...",optimiser cuisine tre traditionnel decor bra...,256,1,1
2,Brasserie chic\nUne brasserie authentiquement ...,0,brasserie chic une brasserie authentiquement p...,"(brasserie, chic, une, brasserie, authentiquem...","[brasserie, chic, brasserie, authentiquement, ...",brasserie chic brasserie authentiquement paris...,323,0,1
3,Tres bien\nPetit diner entre amis. Les plats e...,0,tres bien petit diner entre amis les plats et...,"(tres, bien, petit, diner, entre, amis, , les...","[tre, petit, diner, ami, , plat, bon, , vin,...",tre petit diner ami plat bon vin excellent...,247,1,0
4,Un bistrot bien sympathique\nNous avons mangé ...,0,un bistrot bien sympathique nous avons mange e...,"(un, bistrot, bien, sympathique, nous, avons, ...","[bistrot, sympathique, manger, famille, restau...",bistrot sympathique manger famille restaurant ...,280,0,0
...,...,...,...,...,...,...,...,...,...
90592,"Du choix, un service extrêmement rapide, le re...",1,du choix un service extremement rapide le re...,"(du, choix, , un, service, extremement, rapid...","[choix, , service, extremement, rapide, , re...",choix service extremement rapide repas don...,135,0,1
90593,"Vraiment un des meilleur kebab du coin, servic...",1,vraiment un des meilleur kebab du coin servic...,"(vraiment, un, des, meilleur, kebab, du, coin,...","[vraiment, meilleur, kebab, coin, , service, ...",vraiment meilleur kebab coin service tre rap...,112,1,0
90594,Très déçu!!!\nCe soir j'ai eu envie de manger ...,1,tres decu ce soir j'ai eu envie de manger un ...,"(tres, decu, , ce, soir, j', ai, eu, envie, d...","[tre, decu, , soir, envie, manger, chicken, t...",tre decu soir envie manger chicken tika gran...,160,0,1
90595,J'y vais depuis le début mais j'avoue qu'avec ...,1,j'y vais depuis le debut mais j'avoue qu'avec ...,"(j', y, vais, depuis, le, debut, mais, j', avo...","[y, aller, debut, avoue, bon, viande, faire, m...",y aller debut avoue bon viande faire maison ...,103,0,1


In [155]:
# Some feature engineering on original DF

# find review length
data["len_review"] = dataset["text_review"].apply(lambda x : len(str(x)))

# create bins for uppercase
data['upper_word_count'] = dataset['text_review'].apply(lambda x : sum(map(str.isupper, x.split())) )
# data['upper_word_count'] = pd.qcut(data['upper_word_count'], 3, labels = ['low', 'mid', 'high'])

# create bin for exclamation marks 
data['exclam_count'] = dataset['text_review'].apply(lambda x : len(''.join(ch for ch in x if ch =='!')))
# data['exclam_count'] = pd.qcut(data['exclam_count'].rank(method = 'first'), 3, labels = ['low', 'high', 'very_high'])

In [156]:
data

Unnamed: 0,text_review,is_fake_review,text_review_clean,clean_tokens,clean_tokens_lemmatized,clean_review,len_review,upper_word_count,exclam_count
0,"Bon retour !\nJe suis revenue dans ce resto après une longue absence de 4 ans. Que dire ? Le chef a changé, la cuisine aussi mais elle reste toujours aussi bonne et fraîche. L'équipe est très jeune et sait préserver cette esprit dynamique sans excès de déconnade. J'ai aimé aussi leur brunch (même trop copieux à mon avis). Bref, j'y retournerai plus souvent !",0,bon retour je suis revenue dans ce resto apres une longue absence de 4 ans que dire le chef a change la cuisine aussi mais elle reste toujours aussi bonne et fraiche l'equipe est tres jeune et sait preserver cette esprit dynamique sans exces de deconnade j'ai aime aussi leur brunch meme trop copieux a mon avis bref j'y retournerai plus souvent,"(bon, retour, , je, suis, revenue, dans, ce, resto, apres, une, longue, absence, de, 4, ans, , que, dire, , le, chef, a, change, , la, cuisine, aussi, mais, elle, reste, toujours, aussi, bonne, et, fraiche, , l', equipe, est, tres, jeune, et, sait, preserver, cette, esprit, dynamique, sans, exces, de, deconnade, , j', ai, aime, aussi, leur, brunch, , meme, trop, copieux, a, mon, avis, , bref, , j', y, retournerai, plus, souvent, )","[bon, , revenir, restaurant, apre, long, absence, 4, an, , , chef, changer, , cuisine, bon, fraich, , equipe, jeune, savoir, preserver, esprit, dynamique, exce, deconnade, , aime, brunch, , copieux, avis, , bref, , y, retourner, ]",bon revenir restaurant apre long absence 4 an chef changer cuisine bon fraich equipe jeune savoir preserver esprit dynamique exce deconnade aime brunch copieux avis bref y retourner,359,0,2
1,"A optimiser...\nCuisine très traditionnelle dans un décor de brasserie agréable. Le service est soigné, les plats sont servis chauds et en quantité suffisante. Pas mal mais un gros défaut ne m'a pas permis d'apprécier le lieu : totalement fumeur et enfumé !",0,a optimiser cuisine tres traditionnelle dans un decor de brasserie agreable le service est soigne les plats sont servis chauds et en quantite suffisante pas mal mais un gros defaut ne m'a pas permis d'apprecier le lieu totalement fumeur et enfume,"(a, optimiser, , cuisine, tres, traditionnelle, dans, un, decor, de, brasserie, agreable, , le, service, est, soigne, , les, plats, sont, servis, chauds, et, en, quantite, suffisante, , pas, mal, mais, un, gros, defaut, ne, m', a, pas, permis, d', apprecier, le, lieu, , totalement, fumeur, et, enfume, )","[optimiser, , cuisine, tre, traditionnel, decor, brasserie, agreabl, , service, soigne, , plat, servir, chaud, quantite, , mal, gros, defaut, permettre, apprecier, lieu, , totalement, fumeur, enfume, ]",optimiser cuisine tre traditionnel decor brasserie agreabl service soigne plat servir chaud quantite mal gros defaut permettre apprecier lieu totalement fumeur enfume,256,1,1
2,"Brasserie chic\nUne brasserie authentiquement parisienne pour un repas sur le pouce mais néanmoins raffinée. De Marseille, je n'ai pas résisté à la bourride de rascasse. Le chef ne s'est pas trop mal défendu malgré les kilomètres... Service agréable et souriant. Une bonne adresse si on a la flemme d'aller affronter Paris !",0,brasserie chic une brasserie authentiquement parisienne pour un repas sur le pouce mais neanmoins raffinee de marseille je n'ai pas resiste a la bourride de rascasse le chef ne s'est pas trop mal defendu malgre les kilometres service agreable et souriant une bonne adresse si on a la flemme d'aller affronter paris,"(brasserie, chic, une, brasserie, authentiquement, parisienne, pour, un, repas, sur, le, pouce, mais, neanmoins, raffinee, , de, marseille, , je, n', ai, pas, resiste, a, la, bourride, de, rascasse, , le, chef, ne, s', est, pas, trop, mal, defendu, malgre, les, kilometres, , service, agreable, et, souriant, , une, bonne, adresse, si, on, a, la, flemme, d', aller, affronter, paris, )","[brasserie, chic, brasserie, authentiquement, parisien, repas, pouce, raffine, , marseille, , resiste, bourride, rascasse, , chef, mal, defendu, kilometre, , service, agreabl, sourire, , bon, adresse, flemme, aller, affronter, paris, ]",brasserie chic brasserie authentiquement parisien repas pouce raffine marseille resiste bourride rascasse chef mal defendu kilometre service agreabl sourire bon adresse flemme aller affronter paris,323,0,1
3,"Tres bien\nPetit diner entre amis. Les plats etaient bons, les vins excellents et la tarte de framboises exquise. Ce soir là c'etait assez calme pour une brasserie. Et moi j'adore le cadre, on se croirait dans un films des annees 50. A recommander.",0,tres bien petit diner entre amis les plats etaient bons les vins excellents et la tarte de framboises exquise ce soir la c'etait assez calme pour une brasserie et moi j'adore le cadre on se croirait dans un films des annees 50 a recommander,"(tres, bien, petit, diner, entre, amis, , les, plats, etaient, bons, , les, vins, excellents, et, la, tarte, de, framboises, exquise, , ce, soir, la, c', etait, assez, calme, pour, une, brasserie, , et, moi, j', adore, le, cadre, , on, se, croirait, dans, un, films, des, annees, 50, , a, recommander)","[tre, petit, diner, ami, , plat, bon, , vin, excellent, tarte, framboise, exquis, , soir, calme, brasserie, , adore, cadre, , croire, film, annee, 50, , recommander]",tre petit diner ami plat bon vin excellent tarte framboise exquis soir calme brasserie adore cadre croire film annee 50 recommander,247,1,0
4,"Un bistrot bien sympathique\nNous avons mangé en famille dans ce restaurant de type bistrot, qui est très sympathique. Le repas était très bon et le service fort aimable. L'addition est raisonnable, je conseille ce restaurant où vous mangerez bien pour un prix tout à fait correct.",0,un bistrot bien sympathique nous avons mange en famille dans ce restaurant de type bistrot qui est tres sympathique le repas etait tres bon et le service fort aimable l'addition est raisonnable je conseille ce restaurant ou vous mangerez bien pour un prix tout a fait correct,"(un, bistrot, bien, sympathique, nous, avons, mange, en, famille, dans, ce, restaurant, de, type, bistrot, , qui, est, tres, sympathique, , le, repas, etait, tres, bon, et, le, service, fort, aimable, , l', addition, est, raisonnable, , je, conseille, ce, restaurant, ou, vous, mangerez, bien, pour, un, prix, tout, a, fait, correct)","[bistrot, sympathique, manger, famille, restaurant, type, bistrot, , sympathique, , repas, bon, service, fort, aimable, , addition, raisonnable, , conseille, restaurant, manger, prix, faire, correct]",bistrot sympathique manger famille restaurant type bistrot sympathique repas bon service fort aimable addition raisonnable conseille restaurant manger prix faire correct,280,0,0
...,...,...,...,...,...,...,...,...,...
90592,"Du choix, un service extrêmement rapide, le repas donne vraiment envie et je me régale a chaque fois !\nSi vous devez tester le grec du coin passer d'abord par celui la, le meilleur des alentours!",1,du choix un service extremement rapide le repas donne vraiment envie et je me regale a chaque fois si vous devez tester le grec du coin passer d'abord par celui la le meilleur des alentours,"(du, choix, , un, service, extremement, rapide, , le, repas, donne, vraiment, envie, et, je, me, regale, a, chaque, fois, , si, vous, devez, tester, le, grec, du, coin, passer, d', abord, par, celui, la, , le, meilleur, des, alentours)","[choix, , service, extremement, rapide, , repas, donne, vraiment, envie, regale, fois, , devoir, tester, grec, coin, passer, , meilleur, alentours]",choix service extremement rapide repas donne vraiment envie regale fois devoir tester grec coin passer meilleur alentours,135,0,1
90593,"Vraiment un des meilleur kebab du coin, service très rapide et surtout excellent !!",1,vraiment un des meilleur kebab du coin service tres rapide et surtout excellent,"(vraiment, un, des, meilleur, kebab, du, coin, , service, tres, rapide, et, surtout, excellent, )","[vraiment, meilleur, kebab, coin, , service, tre, rapide, excellent, ]",vraiment meilleur kebab coin service tre rapide excellent,112,1,0
90594,"Très déçu!!!\nCe soir j'ai eu envie de manger un chicken tika avec une grande envie (vu bébé en route ) ben en le mangeant j'ai trouver un cheveu, et sur c'est pas le mien car j'ai des cheveux afro, donc j'ai apeler pour expliquer ma découverte il m'ont répondu juste je suis désoler et il ont raccroché. Je l'ai jetter dirrect, deplus ils n'utilise pas de gants c'est pas très hygiène voilà",1,tres decu ce soir j'ai eu envie de manger un chicken tika avec une grande envie vu bebe en route ben en le mangeant j'ai trouver un cheveu et sur c'est pas le mien car j'ai des cheveux afro donc j'ai apeler pour expliquer ma decouverte il m'ont repondu juste je suis desoler et il ont raccroche je l'ai jetter dirrect deplus ils n'utilise pas de gants c'est pas tres hygiene voila,"(tres, decu, , ce, soir, j', ai, eu, envie, de, manger, un, chicken, tika, avec, une, grande, envie, , vu, bebe, en, route, , ben, en, le, mangeant, j', ai, trouver, un, cheveu, , et, sur, c', est, pas, le, mien, car, j', ai, des, cheveux, afro, , donc, j', ai, apeler, pour, expliquer, ma, decouverte, il, m', ont, repondu, juste, je, suis, desoler, et, il, ont, raccroche, , je, l', ai, jetter, dirrect, , deplus, ils, n', utilise, pas, de, gants, c', est, pas, tres, hygiene, voila)","[tre, decu, , soir, envie, manger, chicken, tika, grand, envie, , voir, bebe, route, , ben, mangeant, trouver, cheveu, , cheveu, afro, , apeler, expliquer, decouverte, repondu, desoler, raccroche, , jetter, dirrect, , deplu, utilise, gant, tre, hygiene, voila]",tre decu soir envie manger chicken tika grand envie voir bebe route ben mangeant trouver cheveu cheveu afro apeler expliquer decouverte repondu desoler raccroche jetter dirrect deplu utilise gant tre hygiene voila,160,0,1
90595,"J'y vais depuis le début mais j'avoue qu'avec désormais la bonne viande faite maison , il est encore meilleur. Je prends des assiettes grecs et sandwich tortillas grec c'est magnifique. Je vais la-bas car ils sont sympathiques et c'est très propre pas le genre de grec bizarre ou tu peux être malade comme ailleurs.",1,j'y vais depuis le debut mais j'avoue qu'avec desormais la bonne viande faite maison il est encore meilleur je prends des assiettes grecs et sandwich tortillas grec c'est magnifique je vais la bas car ils sont sympathiques et c'est tres propre pas le genre de grec bizarre ou tu peux etre malade comme ailleurs,"(j', y, vais, depuis, le, debut, mais, j', avoue, qu', avec, desormais, la, bonne, viande, faite, maison, , il, est, encore, meilleur, , je, prends, des, assiettes, grecs, et, sandwich, tortillas, grec, c', est, magnifique, , je, vais, la, bas, car, ils, sont, sympathiques, et, c', est, tres, propre, pas, le, genre, de, grec, bizarre, ou, tu, peux, etre, malade, comme, ailleurs)","[y, aller, debut, avoue, bon, viande, faire, maison, , meilleur, , prendre, assiette, grec, sandwich, tortillas, grec, magnifique, , aller, sympathique, propre, genre, grec, bizarre, pouvoir, malade]",y aller debut avoue bon viande faire maison meilleur prendre assiette grec sandwich tortillas grec magnifique aller sympathique propre genre grec bizarre pouvoir malade,103,0,1


In [159]:
data['exclam_count'] = data['exclam_count'].apply(lambda x : set_chatacter_cat(x)) 


In [157]:
data['upper_word_count'] = data['upper_word_count'].apply(lambda x : set_chatacter_cat(x)) 


In [160]:
data.groupby("exclam_count")

Unnamed: 0_level_0,text_review,is_fake_review,text_review_clean,clean_tokens,clean_tokens_lemmatized,clean_review,len_review,upper_word_count
exclam_count,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
high,4892,4892,4892,4892,4892,4892,4892,4892
medium,45959,45959,45959,45959,45959,45959,45959,45959
none,39746,39746,39746,39746,39746,39746,39746,39746


In [152]:
def set_chatacter_cat(text): 
    if text == 0: 
        exclam = "none"
    elif text > 5: 
        exclam = "high"
    else: 
        exclam = "medium"
    return exclam

In [132]:
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


In [133]:
data

Unnamed: 0,text_review,is_fake_review,text_review_clean,clean_tokens,clean_tokens_lemmatized,clean_review,len_review,upper_word_count,exclam_count
0,"Bon retour !\nJe suis revenue dans ce resto après une longue absence de 4 ans. Que dire ? Le chef a changé, la cuisine aussi mais elle reste toujours aussi bonne et fraîche. L'équipe est très jeune et sait préserver cette esprit dynamique sans excès de déconnade. J'ai aimé aussi leur brunch (même trop copieux à mon avis). Bref, j'y retournerai plus souvent !",0,bon retour je suis revenue dans ce resto apres une longue absence de 4 ans que dire le chef a change la cuisine aussi mais elle reste toujours aussi bonne et fraiche l'equipe est tres jeune et sait preserver cette esprit dynamique sans exces de deconnade j'ai aime aussi leur brunch meme trop copieux a mon avis bref j'y retournerai plus souvent,"(bon, retour, , je, suis, revenue, dans, ce, resto, apres, une, longue, absence, de, 4, ans, , que, dire, , le, chef, a, change, , la, cuisine, aussi, mais, elle, reste, toujours, aussi, bonne, et, fraiche, , l', equipe, est, tres, jeune, et, sait, preserver, cette, esprit, dynamique, sans, exces, de, deconnade, , j', ai, aime, aussi, leur, brunch, , meme, trop, copieux, a, mon, avis, , bref, , j', y, retournerai, plus, souvent, )","[bon, , revenir, restaurant, apre, long, absence, 4, an, , , chef, changer, , cuisine, bon, fraich, , equipe, jeune, savoir, preserver, esprit, dynamique, exce, deconnade, , aime, brunch, , copieux, avis, , bref, , y, retourner, ]",bon revenir restaurant apre long absence 4 an chef changer cuisine bon fraich equipe jeune savoir preserver esprit dynamique exce deconnade aime brunch copieux avis bref y retourner,359,low,high
1,"A optimiser...\nCuisine très traditionnelle dans un décor de brasserie agréable. Le service est soigné, les plats sont servis chauds et en quantité suffisante. Pas mal mais un gros défaut ne m'a pas permis d'apprécier le lieu : totalement fumeur et enfumé !",0,a optimiser cuisine tres traditionnelle dans un decor de brasserie agreable le service est soigne les plats sont servis chauds et en quantite suffisante pas mal mais un gros defaut ne m'a pas permis d'apprecier le lieu totalement fumeur et enfume,"(a, optimiser, , cuisine, tres, traditionnelle, dans, un, decor, de, brasserie, agreable, , le, service, est, soigne, , les, plats, sont, servis, chauds, et, en, quantite, suffisante, , pas, mal, mais, un, gros, defaut, ne, m', a, pas, permis, d', apprecier, le, lieu, , totalement, fumeur, et, enfume, )","[optimiser, , cuisine, tre, traditionnel, decor, brasserie, agreabl, , service, soigne, , plat, servir, chaud, quantite, , mal, gros, defaut, permettre, apprecier, lieu, , totalement, fumeur, enfume, ]",optimiser cuisine tre traditionnel decor brasserie agreabl service soigne plat servir chaud quantite mal gros defaut permettre apprecier lieu totalement fumeur enfume,256,high,high
2,"Brasserie chic\nUne brasserie authentiquement parisienne pour un repas sur le pouce mais néanmoins raffinée. De Marseille, je n'ai pas résisté à la bourride de rascasse. Le chef ne s'est pas trop mal défendu malgré les kilomètres... Service agréable et souriant. Une bonne adresse si on a la flemme d'aller affronter Paris !",0,brasserie chic une brasserie authentiquement parisienne pour un repas sur le pouce mais neanmoins raffinee de marseille je n'ai pas resiste a la bourride de rascasse le chef ne s'est pas trop mal defendu malgre les kilometres service agreable et souriant une bonne adresse si on a la flemme d'aller affronter paris,"(brasserie, chic, une, brasserie, authentiquement, parisienne, pour, un, repas, sur, le, pouce, mais, neanmoins, raffinee, , de, marseille, , je, n', ai, pas, resiste, a, la, bourride, de, rascasse, , le, chef, ne, s', est, pas, trop, mal, defendu, malgre, les, kilometres, , service, agreable, et, souriant, , une, bonne, adresse, si, on, a, la, flemme, d', aller, affronter, paris, )","[brasserie, chic, brasserie, authentiquement, parisien, repas, pouce, raffine, , marseille, , resiste, bourride, rascasse, , chef, mal, defendu, kilometre, , service, agreabl, sourire, , bon, adresse, flemme, aller, affronter, paris, ]",brasserie chic brasserie authentiquement parisien repas pouce raffine marseille resiste bourride rascasse chef mal defendu kilometre service agreabl sourire bon adresse flemme aller affronter paris,323,low,high
3,"Tres bien\nPetit diner entre amis. Les plats etaient bons, les vins excellents et la tarte de framboises exquise. Ce soir là c'etait assez calme pour une brasserie. Et moi j'adore le cadre, on se croirait dans un films des annees 50. A recommander.",0,tres bien petit diner entre amis les plats etaient bons les vins excellents et la tarte de framboises exquise ce soir la c'etait assez calme pour une brasserie et moi j'adore le cadre on se croirait dans un films des annees 50 a recommander,"(tres, bien, petit, diner, entre, amis, , les, plats, etaient, bons, , les, vins, excellents, et, la, tarte, de, framboises, exquise, , ce, soir, la, c', etait, assez, calme, pour, une, brasserie, , et, moi, j', adore, le, cadre, , on, se, croirait, dans, un, films, des, annees, 50, , a, recommander)","[tre, petit, diner, ami, , plat, bon, , vin, excellent, tarte, framboise, exquis, , soir, calme, brasserie, , adore, cadre, , croire, film, annee, 50, , recommander]",tre petit diner ami plat bon vin excellent tarte framboise exquis soir calme brasserie adore cadre croire film annee 50 recommander,247,high,low
4,"Un bistrot bien sympathique\nNous avons mangé en famille dans ce restaurant de type bistrot, qui est très sympathique. Le repas était très bon et le service fort aimable. L'addition est raisonnable, je conseille ce restaurant où vous mangerez bien pour un prix tout à fait correct.",0,un bistrot bien sympathique nous avons mange en famille dans ce restaurant de type bistrot qui est tres sympathique le repas etait tres bon et le service fort aimable l'addition est raisonnable je conseille ce restaurant ou vous mangerez bien pour un prix tout a fait correct,"(un, bistrot, bien, sympathique, nous, avons, mange, en, famille, dans, ce, restaurant, de, type, bistrot, , qui, est, tres, sympathique, , le, repas, etait, tres, bon, et, le, service, fort, aimable, , l', addition, est, raisonnable, , je, conseille, ce, restaurant, ou, vous, mangerez, bien, pour, un, prix, tout, a, fait, correct)","[bistrot, sympathique, manger, famille, restaurant, type, bistrot, , sympathique, , repas, bon, service, fort, aimable, , addition, raisonnable, , conseille, restaurant, manger, prix, faire, correct]",bistrot sympathique manger famille restaurant type bistrot sympathique repas bon service fort aimable addition raisonnable conseille restaurant manger prix faire correct,280,low,low
...,...,...,...,...,...,...,...,...,...
90592,"Du choix, un service extrêmement rapide, le repas donne vraiment envie et je me régale a chaque fois !\nSi vous devez tester le grec du coin passer d'abord par celui la, le meilleur des alentours!",1,du choix un service extremement rapide le repas donne vraiment envie et je me regale a chaque fois si vous devez tester le grec du coin passer d'abord par celui la le meilleur des alentours,"(du, choix, , un, service, extremement, rapide, , le, repas, donne, vraiment, envie, et, je, me, regale, a, chaque, fois, , si, vous, devez, tester, le, grec, du, coin, passer, d', abord, par, celui, la, , le, meilleur, des, alentours)","[choix, , service, extremement, rapide, , repas, donne, vraiment, envie, regale, fois, , devoir, tester, grec, coin, passer, , meilleur, alentours]",choix service extremement rapide repas donne vraiment envie regale fois devoir tester grec coin passer meilleur alentours,135,high,high
90593,"Vraiment un des meilleur kebab du coin, service très rapide et surtout excellent !!",1,vraiment un des meilleur kebab du coin service tres rapide et surtout excellent,"(vraiment, un, des, meilleur, kebab, du, coin, , service, tres, rapide, et, surtout, excellent, )","[vraiment, meilleur, kebab, coin, , service, tre, rapide, excellent, ]",vraiment meilleur kebab coin service tre rapide excellent,112,high,high
90594,"Très déçu!!!\nCe soir j'ai eu envie de manger un chicken tika avec une grande envie (vu bébé en route ) ben en le mangeant j'ai trouver un cheveu, et sur c'est pas le mien car j'ai des cheveux afro, donc j'ai apeler pour expliquer ma découverte il m'ont répondu juste je suis désoler et il ont raccroché. Je l'ai jetter dirrect, deplus ils n'utilise pas de gants c'est pas très hygiène voilà",1,tres decu ce soir j'ai eu envie de manger un chicken tika avec une grande envie vu bebe en route ben en le mangeant j'ai trouver un cheveu et sur c'est pas le mien car j'ai des cheveux afro donc j'ai apeler pour expliquer ma decouverte il m'ont repondu juste je suis desoler et il ont raccroche je l'ai jetter dirrect deplus ils n'utilise pas de gants c'est pas tres hygiene voila,"(tres, decu, , ce, soir, j', ai, eu, envie, de, manger, un, chicken, tika, avec, une, grande, envie, , vu, bebe, en, route, , ben, en, le, mangeant, j', ai, trouver, un, cheveu, , et, sur, c', est, pas, le, mien, car, j', ai, des, cheveux, afro, , donc, j', ai, apeler, pour, expliquer, ma, decouverte, il, m', ont, repondu, juste, je, suis, desoler, et, il, ont, raccroche, , je, l', ai, jetter, dirrect, , deplus, ils, n', utilise, pas, de, gants, c', est, pas, tres, hygiene, voila)","[tre, decu, , soir, envie, manger, chicken, tika, grand, envie, , voir, bebe, route, , ben, mangeant, trouver, cheveu, , cheveu, afro, , apeler, expliquer, decouverte, repondu, desoler, raccroche, , jetter, dirrect, , deplu, utilise, gant, tre, hygiene, voila]",tre decu soir envie manger chicken tika grand envie voir bebe route ben mangeant trouver cheveu cheveu afro apeler expliquer decouverte repondu desoler raccroche jetter dirrect deplu utilise gant tre hygiene voila,160,high,high
90595,"J'y vais depuis le début mais j'avoue qu'avec désormais la bonne viande faite maison , il est encore meilleur. Je prends des assiettes grecs et sandwich tortillas grec c'est magnifique. Je vais la-bas car ils sont sympathiques et c'est très propre pas le genre de grec bizarre ou tu peux être malade comme ailleurs.",1,j'y vais depuis le debut mais j'avoue qu'avec desormais la bonne viande faite maison il est encore meilleur je prends des assiettes grecs et sandwich tortillas grec c'est magnifique je vais la bas car ils sont sympathiques et c'est tres propre pas le genre de grec bizarre ou tu peux etre malade comme ailleurs,"(j', y, vais, depuis, le, debut, mais, j', avoue, qu', avec, desormais, la, bonne, viande, faite, maison, , il, est, encore, meilleur, , je, prends, des, assiettes, grecs, et, sandwich, tortillas, grec, c', est, magnifique, , je, vais, la, bas, car, ils, sont, sympathiques, et, c', est, tres, propre, pas, le, genre, de, grec, bizarre, ou, tu, peux, etre, malade, comme, ailleurs)","[y, aller, debut, avoue, bon, viande, faire, maison, , meilleur, , prendre, assiette, grec, sandwich, tortillas, grec, magnifique, , aller, sympathique, propre, genre, grec, bizarre, pouvoir, malade]",y aller debut avoue bon viande faire maison meilleur prendre assiette grec sandwich tortillas grec magnifique aller sympathique propre genre grec bizarre pouvoir malade,103,high,high


In [107]:
data

Unnamed: 0,text_review,is_fake_review,text_review_clean,clean_tokens,clean_tokens_lemmatized,clean_review,len_review,upper_word_count,exclam_count
0,Bon retour !\nJe suis revenue dans ce resto ap...,0,bon retour je suis revenue dans ce resto apr...,"(bon, retour, , je, suis, revenue, dans, ce,...","[bon, , revenir, restaurant, apre, long, abs...",bon revenir restaurant apre long absence 4 ...,359,0,2
1,A optimiser...\nCuisine très traditionnelle da...,0,a optimiser cuisine tres traditionnelle dans ...,"(a, optimiser, , cuisine, tres, traditionnell...","[optimiser, , cuisine, tre, traditionnel, dec...",optimiser cuisine tre traditionnel decor bra...,256,1,1
2,Brasserie chic\nUne brasserie authentiquement ...,0,brasserie chic une brasserie authentiquement p...,"(brasserie, chic, une, brasserie, authentiquem...","[brasserie, chic, brasserie, authentiquement, ...",brasserie chic brasserie authentiquement paris...,323,0,1
3,Tres bien\nPetit diner entre amis. Les plats e...,0,tres bien petit diner entre amis les plats et...,"(tres, bien, petit, diner, entre, amis, , les...","[tre, petit, diner, ami, , plat, bon, , vin,...",tre petit diner ami plat bon vin excellent...,247,1,0
4,Un bistrot bien sympathique\nNous avons mangé ...,0,un bistrot bien sympathique nous avons mange e...,"(un, bistrot, bien, sympathique, nous, avons, ...","[bistrot, sympathique, manger, famille, restau...",bistrot sympathique manger famille restaurant ...,280,0,0
...,...,...,...,...,...,...,...,...,...
90592,"Du choix, un service extrêmement rapide, le re...",1,du choix un service extremement rapide le re...,"(du, choix, , un, service, extremement, rapid...","[choix, , service, extremement, rapide, , re...",choix service extremement rapide repas don...,135,0,1
90593,"Vraiment un des meilleur kebab du coin, servic...",1,vraiment un des meilleur kebab du coin servic...,"(vraiment, un, des, meilleur, kebab, du, coin,...","[vraiment, meilleur, kebab, coin, , service, ...",vraiment meilleur kebab coin service tre rap...,112,1,0
90594,Très déçu!!!\nCe soir j'ai eu envie de manger ...,1,tres decu ce soir j'ai eu envie de manger un ...,"(tres, decu, , ce, soir, j', ai, eu, envie, d...","[tre, decu, , soir, envie, manger, chicken, t...",tre decu soir envie manger chicken tika gran...,160,0,1
90595,J'y vais depuis le début mais j'avoue qu'avec ...,1,j'y vais depuis le debut mais j'avoue qu'avec ...,"(j', y, vais, depuis, le, debut, mais, j', avo...","[y, aller, debut, avoue, bon, viande, faire, m...",y aller debut avoue bon viande faire maison ...,103,0,1


In [161]:
# append all the categories to our tf-idf matrix 
data_cl["len_review"] = list(data.len_review)
data_cl["is_fake_review"] = list(data["is_fake_review"])
data_cl["upper_word_count"] = list(data["upper_word_count"])
data_cl["exclam_count"] = list(data["exclam_count"])



In [162]:
data_cl["len_review"] = data["len_review"]


In [163]:
data_cl = data_cl.sample(10000)

In [164]:
data_cl

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100,len_review,is_fake_review,upper_word_count,exclam_count
8415,0.074701,0.011109,-0.037199,-0.005990,0.025669,0.013250,0.028772,0.009016,0.023646,0.031752,...,0.013650,0.001141,-0.011476,-0.029906,0.031651,-0.010087,103,0,none,none
27777,0.100001,0.015737,-0.075269,0.052064,-0.083042,0.010859,-0.011609,-0.006864,0.029062,0.084533,...,0.014077,-0.016998,0.014309,0.012740,0.013375,-0.018000,998,0,medium,none
61827,0.275837,-0.039399,-0.120196,0.086001,-0.016761,-0.101077,-0.052385,0.063519,0.106874,0.052461,...,-0.077044,0.035724,0.026187,-0.018125,-0.007220,0.006648,1704,0,high,medium
86833,0.340262,0.031848,0.101503,-0.060527,-0.003124,-0.005017,-0.041101,0.008060,-0.007306,-0.008218,...,0.038799,0.029725,-0.003597,-0.014346,-0.048492,-0.057570,158,1,medium,medium
22416,0.338068,-0.041879,0.047774,-0.200866,-0.000534,-0.051510,0.012587,0.054317,0.043648,-0.053166,...,-0.032177,0.065091,0.045522,-0.042378,0.003485,-0.040730,654,0,none,none
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22149,0.146625,-0.032063,0.044605,0.096454,0.011575,-0.037546,-0.017004,-0.051259,-0.055722,-0.020158,...,-0.073500,-0.026083,-0.022831,0.015095,0.000279,0.043005,709,0,none,none
34318,0.311742,0.158409,0.078338,-0.157770,-0.120154,0.041275,-0.095101,0.020012,-0.008752,-0.059908,...,-0.034340,0.051493,0.056375,0.003218,-0.034885,0.059849,811,0,none,none
47119,0.240069,0.031590,-0.144682,0.125493,-0.078538,-0.044295,-0.092021,0.100827,0.040354,-0.027305,...,-0.033843,0.002031,0.021517,-0.027957,0.001387,-0.015544,262,0,none,none
37635,0.245651,0.081254,0.173986,0.042579,-0.025033,-0.013430,-0.125383,-0.033031,-0.054810,-0.005749,...,-0.009653,-0.074086,0.041353,0.014991,-0.015709,0.000338,133,0,none,none


In [None]:
french_reviews

In [95]:
data_cl.groupby("is_fake_review").count()

Unnamed: 0_level_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_94,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100,len_review,upper_word_count,exclam_count
is_fake_review,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,8098,8098,8098,8098,8098,8098,8098,8098,8098,8098,...,8098,8098,8098,8098,8098,8098,8098,8098,8098,8098
1,1902,1902,1902,1902,1902,1902,1902,1902,1902,1902,...,1902,1902,1902,1902,1902,1902,1902,1902,1902,1902


In [172]:
#check 
print(data_cl.shape)
data_cl.head(3)

(10000, 104)


Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_95,topic_96,topic_97,topic_98,topic_99,topic_100,len_review,is_fake_review,upper_word_count,exclam_count
8415,0.074701,0.011109,-0.037199,-0.00599,0.025669,0.01325,0.028772,0.009016,0.023646,0.031752,...,0.01365,0.001141,-0.011476,-0.029906,0.031651,-0.010087,103,0,none,none
27777,0.100001,0.015737,-0.075269,0.052064,-0.083042,0.010859,-0.011609,-0.006864,0.029062,0.084533,...,0.014077,-0.016998,0.014309,0.01274,0.013375,-0.018,998,0,medium,none
61827,0.275837,-0.039399,-0.120196,0.086001,-0.016761,-0.101077,-0.052385,0.063519,0.106874,0.052461,...,-0.077044,0.035724,0.026187,-0.018125,-0.00722,0.006648,1704,0,high,medium


### Train_Test_Split and Preprocessing

In [166]:
# split X
X_cl = data_cl.drop(columns="is_fake_review")
X_cl.shape

# split y 
y = data_cl["is_fake_review"]

In [178]:
X_train, X_test, y_train, y_test = train_test_split(X_cl,y,
                                                    test_size = 0.2,
                                                    stratify = y , ## Statify splitting when you're training a classification model !
                                                    random_state = 19)

In [179]:
X_train.shape

(8000, 103)

In [180]:
# Create pipeline for categorical features

categorical_features = [index for index, c in enumerate(X_train.columns) if c in ['exclam_count', 'upper_word_count' ] ]
numerical_features = [index for index, c in enumerate(X_train.columns) if c not in ["exclam_count", 'upper_word_count' ] ] # Positions of categorical columns in X_train/X_test

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])


categorical_transformer = Pipeline(
    steps=[
    ('encoder', OneHotEncoder(drop='first'))
    ])# first column will be dropped to avoid creating correlations between features

In [176]:
# Create pipeline for categorical features


numerical_features = [index for index, c in enumerate(X_train.columns)] # Positions of categorical columns in X_train/X_test

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])


In [181]:
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [182]:
# Preprocessings on train set

X_train= preprocessor.fit_transform(X_train)
print(X_train[0:5,:])

# Preprocessings on test set
X_test = preprocessor.transform(X_test) 
print(X_test[0:5,:])

print(X_train.shape)

[[-2.12223580e-01 -1.00133536e+00 -1.00034718e+00  2.89685382e-01
   9.31672268e-02  5.18304644e-02 -1.40886334e+00 -7.00613503e-01
   6.94115171e-02  1.00524939e+00  5.98269671e-01 -4.50346406e-01
   9.36499828e-01  1.08125097e+00  3.77063198e-02 -8.97181449e-02
  -1.73785653e-01  4.05699365e-01  1.52987667e-01 -4.94803171e-02
  -3.36541723e-01 -1.20816850e+00  1.51187886e-01 -1.91057384e-02
  -1.23783097e-01 -2.77245321e-01 -6.37630069e-01  4.69107052e-01
   1.38430783e+00  7.04625553e-01  3.74374655e-01  1.53697698e-01
  -9.66902502e-01 -1.47381883e+00 -9.83933378e-01 -3.30117981e-01
  -1.11580882e-01 -1.24566577e-02  1.29746987e+00 -8.05664017e-01
   4.44050447e-01  1.04256009e+00 -5.04107493e-01  3.47491682e-01
   4.11078814e-02 -5.43585143e-01 -9.72232875e-01 -5.13060242e-01
   1.00175266e+00  8.03741189e-01  1.08601459e+00  6.53485818e-01
  -1.75753083e-01  2.40658749e-01 -7.15364178e-01 -1.44600894e+00
  -1.55384676e+00  1.50986245e+00 -1.55081368e+00 -2.92763042e-01
   3.51494

# Fitting to Different Models

In [66]:
from sklearn.svm import SVC


In [69]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [68]:
from sklearn.model_selection import StratifiedKFold

In [189]:
kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 

parameters= {'C': [400, 500, 600] ,\
            'gamma': [0.0001, 0.00001, 0.000001] ,
            "class_weight": [{1:0.67, 0:0.33}, {1:0.75, 0:0.25}, {1:0.8, 0:0.2}] \
            }

model = SVC()
model_svc =GridSearchCV(model, parameters, cv=kfold, verbose=2, scoring="f1")
model_svc.fit(X_train,y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001 ............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001, total=   6.8s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001 ............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001, total=   5.8s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001 ............
[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001, total=   5.7s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001 ............
[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001, total=   5.7s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001 ............
[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=0.0001, total=   5.8s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05 .............
[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05, total=   5.3s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05 .............
[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05, total=   5.2s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05 .............
[CV]  C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05, total=   5.2s
[CV] C=400, class_weight={1: 0.67, 0: 0.33}, gamma=1e-05 ..

[CV]  C=500, class_weight={1: 0.67, 0: 0.33}, gamma=1e-06, total=   5.6s
[CV] C=500, class_weight={1: 0.67, 0: 0.33}, gamma=1e-06 .............
[CV]  C=500, class_weight={1: 0.67, 0: 0.33}, gamma=1e-06, total=   6.1s
[CV] C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001 ............
[CV]  C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001, total=   6.8s
[CV] C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001 ............
[CV]  C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001, total=   7.4s
[CV] C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001 ............
[CV]  C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001, total=   8.4s
[CV] C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001 ............
[CV]  C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001, total=   7.7s
[CV] C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001 ............
[CV]  C=500, class_weight={1: 0.75, 0: 0.25}, gamma=0.0001, total=   6.5s
[CV] C=500, class_weight={1: 0.75, 0: 0.25}, gamma=1e-05 .

[CV]  C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06, total=   6.7s
[CV] C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06 .............
[CV]  C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06, total=   7.1s
[CV] C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06 .............
[CV]  C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06, total=   6.8s
[CV] C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06 .............
[CV]  C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06, total=   7.0s
[CV] C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06 .............
[CV]  C=600, class_weight={1: 0.75, 0: 0.25}, gamma=1e-06, total=   6.6s
[CV] C=600, class_weight={1: 0.8, 0: 0.2}, gamma=0.0001 ..............
[CV]  C=600, class_weight={1: 0.8, 0: 0.2}, gamma=0.0001, total=   7.1s
[CV] C=600, class_weight={1: 0.8, 0: 0.2}, gamma=0.0001 ..............
[CV]  C=600, class_weight={1: 0.8, 0: 0.2}, gamma=0.0001, total=   7.0s
[CV] C=600, class_weight={1: 0.8, 0: 0.2}, gamma=0.0001 .........

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed: 15.3min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVC(),
             param_grid={'C': [400, 500, 600],
                         'class_weight': [{0: 0.33, 1: 0.67},
                                          {0: 0.25, 1: 0.75},
                                          {0: 0.2, 1: 0.8}],
                         'gamma': [0.0001, 1e-05, 1e-06]},
             scoring='f1', verbose=2)

In [71]:
svc_clf = model_svc.best_estimator_


In [72]:
svc_clf

SVC(C=10, class_weight={0: 0.2, 1: 0.8}, gamma=0.01)

In [80]:
svc_clf2 = model_svc.best_estimator_


In [81]:
svc_clf2

SVC(C=20, class_weight={0: 0.2, 1: 0.8}, gamma=0.001)

In [85]:
svc_clf3 = model_svc.best_estimator_


In [86]:
svc_clf3

SVC(C=50, class_weight={0: 0.33, 1: 0.67}, gamma=0.001)

In [190]:
svc_clf4 = model_svc.best_estimator_


In [191]:
svc_clf4

SVC(C=600, class_weight={0: 0.25, 1: 0.75}, gamma=0.0001)

In [192]:
test_pred = svc_clf4.predict(X_test)
train_pred = svc_clf4.predict(X_train)

In [193]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
    
print("Scores for model on test set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_test,test_pred))))
print('Precision Score : {}'.format(str(precision_score(y_test,test_pred))))
print('Recall Score : {}' .format(str(recall_score(y_test,test_pred ))))
print('F1 Score : {}'.format(str(f1_score(y_test,test_pred))))
    
print("")
print("")
print("Scores for model on train set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_train,train_pred))))
print('Precision Score : {}'.format(str(precision_score(y_train,train_pred))))
print('Recall Score : {}' .format(str(recall_score(y_train,train_pred))))
print('F1 Score : {}'.format(str(f1_score(y_train,train_pred))))

Scores for model on test set

Accuracy Score : 0.737
Precision Score : 0.3706422018348624
Recall Score : 0.5246753246753246
F1 Score : 0.43440860215053767


Scores for model on train set

Accuracy Score : 0.764
Precision Score : 0.42321270962047663
Recall Score : 0.6227272727272727
F1 Score : 0.5039411455596426


# svc with optimized parameters

In [43]:
from sklearn.svm import SVC


In [44]:
svc_clf2 = SVC(C=10, class_weight={0: 0.33, 1: 0.67}, gamma=1,  probability=True)

In [45]:
svc_clf2.fit(X_train,y_train)

SVC(C=10, class_weight={0: 0.33, 1: 0.67}, gamma=1, probability=True)

In [55]:
test_pred_svc = svc_clf2.predict(X_test)
train_pred_svc= svc_clf2.predict(X_train)

In [124]:
X_final_pred = X_cl.copy()

In [125]:
X_final_pred

Unnamed: 0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,topic_10,...,topic_64,topic_65,topic_66,topic_67,topic_68,topic_69,topic_70,len_review,upper_word_count,exclam_count
0,0.164598,-0.037266,-0.017348,-0.033782,-0.039489,-0.017854,0.054365,0.003834,-0.077697,0.144274,...,-0.050269,0.012921,0.051010,-0.033804,-0.019719,-0.029457,-0.058698,359,low,high
1,0.063787,-0.011115,-0.007381,0.017662,0.014084,0.020458,-0.024529,-0.009330,-0.013766,-0.026596,...,-0.008165,-0.022340,0.003055,0.021081,-0.014691,0.022936,-0.009580,256,high,high
2,0.069113,-0.011790,-0.000103,0.012132,-0.009873,0.049683,-0.028817,-0.061029,-0.069624,-0.062647,...,0.037442,-0.020221,-0.003511,-0.021559,-0.006552,-0.018713,-0.005366,323,low,high
3,0.104303,-0.014484,-0.004573,0.021235,0.045719,0.170641,0.191690,0.042985,-0.017529,-0.018932,...,-0.020362,0.109305,0.050415,0.018773,0.048755,-0.029400,-0.032627,247,high,low
4,0.124755,-0.021469,-0.007286,0.001836,-0.015591,-0.016053,-0.036740,-0.018753,-0.034085,0.020315,...,-0.096016,0.041422,-0.121795,0.097878,0.036983,-0.021740,-0.023227,280,low,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90592,0.063376,-0.013901,-0.009343,-0.005924,-0.001504,-0.000381,-0.002383,0.000176,0.022544,0.004186,...,0.057061,-0.043134,-0.016470,0.026317,0.034931,-0.029842,-0.030917,195,high,very_high
90593,0.017429,-0.000662,-0.007080,-0.002795,-0.004131,0.006673,-0.005406,-0.003323,0.053630,0.005290,...,-0.034248,-0.021772,-0.032036,0.025454,-0.017804,0.017139,-0.021900,83,high,very_high
90594,0.023576,-0.006640,-0.005067,-0.002540,-0.003290,-0.004221,-0.006978,0.007569,0.005582,0.003396,...,-0.001202,-0.000442,-0.000618,0.004385,0.004228,0.001944,0.003267,391,high,very_high
90595,0.059606,-0.010925,-0.012032,0.000258,-0.009372,0.025755,0.003682,-0.027389,0.125269,0.006881,...,0.007580,-0.009157,0.021218,0.043926,0.016403,0.013264,-0.061758,315,high,high


In [126]:
test_pred = svc_clf2.predict(X_test)
train_pred = svc_clf2.predict(X_train)

In [56]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
    
print("Scores for model on test set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_test,test_pred_svc))))
print('Precision Score : {}'.format(str(precision_score(y_test,test_pred_svc))))
print('Recall Score : {}' .format(str(recall_score(y_test,test_pred_svc ))))
print('F1 Score : {}'.format(str(f1_score(y_test,test_pred_svc))))
    
print("")
print("")
print("Scores for model on train set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_train,train_pred_svc))))
print('Precision Score : {}'.format(str(precision_score(y_train,train_pred_svc))))
print('Recall Score : {}' .format(str(recall_score(y_train,train_pred_svc))))
print('F1 Score : {}'.format(str(f1_score(y_train,train_pred_svc))))

Scores for model on test set

Accuracy Score : 0.8094922737306843
Precision Score : 0.9310344827586207
Recall Score : 0.007765314926660914
F1 Score : 0.01540216771249287


Scores for model on train set

Accuracy Score : 0.9998482277136195
Precision Score : 0.9992097701149425
Recall Score : 1.0
F1 Score : 0.9996047288799453


In [52]:
test_pred_svc

array([0, 0, 0, ..., 0, 0, 0])

In [101]:
X_final_pred = preprocessor.transform(X_final_pred) 

In [102]:
predictions = svc_clf2.predict_proba(X_final_pred)

In [129]:
data_cl.is_fake_review

0        0
1        0
2        0
3        0
4        0
        ..
87046    1
87047    1
87048    1
87049    1
87050    1
Name: is_fake_review, Length: 87051, dtype: int64

In [105]:
predictions_svm_nlp = pd.DataFrame(predictions)

In [130]:
predictions_svm_nlp[3] = data_cl.is_fake_review

In [133]:
predictions_svm_nlp.sample(20)

Unnamed: 0,0,1,3
61455,0.95815,0.04185,0
15044,0.958839,0.041161,0
4216,0.957377,0.042623,0
71363,0.760686,0.239314,0
224,0.939691,0.060309,0
45542,0.975652,0.024348,0
79822,0.341989,0.658011,1
51552,0.763849,0.236151,0
67639,0.94026,0.05974,0
61252,0.938742,0.061258,0


# Log Reg 

In [309]:
from sklearn.linear_model import LogisticRegression
lg_clf = LogisticRegression()
lg_clf.fit(X_train, y_train)

LogisticRegression()

In [310]:
# Grid search cross validation
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0) 

parameters= {"C": np.logspace(-4, 4, 20), \
             "class_weight": [{1:0.67, 0:0.33}, {1:0.75, 0:0.25}, {1:0.8, 0:0.2}, "balanced"], \
            }

model = LogisticRegression()
model_gs =GridSearchCV(model, parameters, cv=kfold, verbose=2, scoring="f1")
model_gs.fit(X_train,y_train)

print("tuned hpyerparameters :(best parameters) ",model_gs.best_params_)
print("accuracy :",model_gs.best_score_)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV] C=0.0001, class_weight={1: 0.67, 0: 0.33} .......................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ........ C=0.0001, class_weight={1: 0.67, 0: 0.33}, total=   0.7s
[CV] C=0.0001, class_weight={1: 0.67, 0: 0.33} .......................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.7s remaining:    0.0s


[CV] ........ C=0.0001, class_weight={1: 0.67, 0: 0.33}, total=   0.6s
[CV] C=0.0001, class_weight={1: 0.67, 0: 0.33} .......................
[CV] ........ C=0.0001, class_weight={1: 0.67, 0: 0.33}, total=   0.6s
[CV] C=0.0001, class_weight={1: 0.67, 0: 0.33} .......................
[CV] ........ C=0.0001, class_weight={1: 0.67, 0: 0.33}, total=   0.6s
[CV] C=0.0001, class_weight={1: 0.67, 0: 0.33} .......................
[CV] ........ C=0.0001, class_weight={1: 0.67, 0: 0.33}, total=   0.7s
[CV] C=0.0001, class_weight={1: 0.75, 0: 0.25} .......................
[CV] ........ C=0.0001, class_weight={1: 0.75, 0: 0.25}, total=   0.7s
[CV] C=0.0001, class_weight={1: 0.75, 0: 0.25} .......................
[CV] ........ C=0.0001, class_weight={1: 0.75, 0: 0.25}, total=   0.8s
[CV] C=0.0001, class_weight={1: 0.75, 0: 0.25} .......................
[CV] ........ C=0.0001, class_weight={1: 0.75, 0: 0.25}, total=   0.9s
[CV] C=0.0001, class_weight={1: 0.75, 0: 0.25} .......................
[CV] .

[CV] ... C=0.0006951927961775605, class_weight=balanced, total=   0.9s
[CV] C=0.0006951927961775605, class_weight=balanced ..................
[CV] ... C=0.0006951927961775605, class_weight=balanced, total=   1.0s
[CV] C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33} ........
[CV]  C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33}, total=   0.9s
[CV] C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33} ........
[CV]  C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33}, total=   0.9s
[CV] C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33} ........
[CV]  C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33}, total=   0.9s
[CV] C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33} ........
[CV]  C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33}, total=   0.9s
[CV] C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33} ........
[CV]  C=0.0018329807108324356, class_weight={1: 0.67, 0: 0.33}, total=   0.9s
[CV] C=0.0018329807108324356, class_weight

[CV]  C=0.012742749857031334, class_weight={1: 0.8, 0: 0.2}, total=   1.4s
[CV] C=0.012742749857031334, class_weight=balanced ...................
[CV] .... C=0.012742749857031334, class_weight=balanced, total=   1.3s
[CV] C=0.012742749857031334, class_weight=balanced ...................
[CV] .... C=0.012742749857031334, class_weight=balanced, total=   1.1s
[CV] C=0.012742749857031334, class_weight=balanced ...................
[CV] .... C=0.012742749857031334, class_weight=balanced, total=   1.3s
[CV] C=0.012742749857031334, class_weight=balanced ...................
[CV] .... C=0.012742749857031334, class_weight=balanced, total=   1.2s
[CV] C=0.012742749857031334, class_weight=balanced ...................
[CV] .... C=0.012742749857031334, class_weight=balanced, total=   1.6s
[CV] C=0.03359818286283781, class_weight={1: 0.67, 0: 0.33} ..........
[CV]  C=0.03359818286283781, class_weight={1: 0.67, 0: 0.33}, total=   1.2s
[CV] C=0.03359818286283781, class_weight={1: 0.67, 0: 0.33} ........

[CV]  C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2}, total=   1.7s
[CV] C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2} ............
[CV]  C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2}, total=   1.5s
[CV] C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2} ............
[CV]  C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2}, total=   1.4s
[CV] C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2} ............
[CV]  C=0.23357214690901212, class_weight={1: 0.8, 0: 0.2}, total=   1.4s
[CV] C=0.23357214690901212, class_weight=balanced ....................
[CV] ..... C=0.23357214690901212, class_weight=balanced, total=   1.7s
[CV] C=0.23357214690901212, class_weight=balanced ....................
[CV] ..... C=0.23357214690901212, class_weight=balanced, total=   1.8s
[CV] C=0.23357214690901212, class_weight=balanced ....................
[CV] ..... C=0.23357214690901212, class_weight=balanced, total=   2.1s
[CV] C=0.23357214690901212, class_weight=balanced ...............

[CV]  C=4.281332398719396, class_weight={1: 0.75, 0: 0.25}, total=   2.1s
[CV] C=4.281332398719396, class_weight={1: 0.75, 0: 0.25} ............
[CV]  C=4.281332398719396, class_weight={1: 0.75, 0: 0.25}, total=   1.6s
[CV] C=4.281332398719396, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=4.281332398719396, class_weight={1: 0.8, 0: 0.2}, total=   2.0s
[CV] C=4.281332398719396, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=4.281332398719396, class_weight={1: 0.8, 0: 0.2}, total=   1.2s
[CV] C=4.281332398719396, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=4.281332398719396, class_weight={1: 0.8, 0: 0.2}, total=   1.2s
[CV] C=4.281332398719396, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=4.281332398719396, class_weight={1: 0.8, 0: 0.2}, total=   1.2s
[CV] C=4.281332398719396, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=4.281332398719396, class_weight={1: 0.8, 0: 0.2}, total=   1.9s
[CV] C=4.281332398719396, class_weight=balanced ..................

[CV]  C=78.47599703514607, class_weight={1: 0.75, 0: 0.25}, total=   1.3s
[CV] C=78.47599703514607, class_weight={1: 0.75, 0: 0.25} ............
[CV]  C=78.47599703514607, class_weight={1: 0.75, 0: 0.25}, total=   1.3s
[CV] C=78.47599703514607, class_weight={1: 0.75, 0: 0.25} ............
[CV]  C=78.47599703514607, class_weight={1: 0.75, 0: 0.25}, total=   1.4s
[CV] C=78.47599703514607, class_weight={1: 0.75, 0: 0.25} ............
[CV]  C=78.47599703514607, class_weight={1: 0.75, 0: 0.25}, total=   1.3s
[CV] C=78.47599703514607, class_weight={1: 0.75, 0: 0.25} ............
[CV]  C=78.47599703514607, class_weight={1: 0.75, 0: 0.25}, total=   1.5s
[CV] C=78.47599703514607, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=78.47599703514607, class_weight={1: 0.8, 0: 0.2}, total=   1.3s
[CV] C=78.47599703514607, class_weight={1: 0.8, 0: 0.2} ..............
[CV]  C=78.47599703514607, class_weight={1: 0.8, 0: 0.2}, total=   1.2s
[CV] C=78.47599703514607, class_weight={1: 0.8, 0: 0.2} ....

[CV]  C=1438.44988828766, class_weight={1: 0.67, 0: 0.33}, total=   2.1s
[CV] C=1438.44988828766, class_weight={1: 0.67, 0: 0.33} .............
[CV]  C=1438.44988828766, class_weight={1: 0.67, 0: 0.33}, total=   2.0s
[CV] C=1438.44988828766, class_weight={1: 0.75, 0: 0.25} .............
[CV]  C=1438.44988828766, class_weight={1: 0.75, 0: 0.25}, total=   1.8s
[CV] C=1438.44988828766, class_weight={1: 0.75, 0: 0.25} .............
[CV]  C=1438.44988828766, class_weight={1: 0.75, 0: 0.25}, total=   1.8s
[CV] C=1438.44988828766, class_weight={1: 0.75, 0: 0.25} .............
[CV]  C=1438.44988828766, class_weight={1: 0.75, 0: 0.25}, total=   1.8s
[CV] C=1438.44988828766, class_weight={1: 0.75, 0: 0.25} .............
[CV]  C=1438.44988828766, class_weight={1: 0.75, 0: 0.25}, total=   2.0s
[CV] C=1438.44988828766, class_weight={1: 0.75, 0: 0.25} .............
[CV]  C=1438.44988828766, class_weight={1: 0.75, 0: 0.25}, total=   1.9s
[CV] C=1438.44988828766, class_weight={1: 0.8, 0: 0.2} ........

[Parallel(n_jobs=1)]: Done 400 out of 400 | elapsed:  9.7min finished


tuned hpyerparameters :(best parameters)  {'C': 0.23357214690901212, 'class_weight': {1: 0.67, 0: 0.33}}
accuracy : 0.6890751149621503


In [429]:
lg = model_gs.best_estimator_


In [459]:
lg = LogisticRegression(C=0.004832930238571752, class_weight={0: 0.33, 1: 0.67})

In [460]:
lg.fit(X_train,y_train)

LogisticRegression(C=0.004832930238571752, class_weight={0: 0.33, 1: 0.67})

In [461]:
test_pred = lg.predict(X_test)
train_pred = lg.predict(X_train)

In [446]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
    
print("Scores for model on test set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_test,test_pred))))
print('Precision Score : {}'.format(str(precision_score(y_test,test_pred))))
print('Recall Score : {}' .format(str(recall_score(y_test,test_pred ))))
print('F1 Score : {}'.format(str(f1_score(y_test,test_pred))))
    
print("")
print("")
print("Scores for model on train set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_train,train_pred))))
print('Precision Score : {}'.format(str(precision_score(y_train,train_pred))))
print('Recall Score : {}' .format(str(recall_score(y_train,train_pred))))
print('F1 Score : {}'.format(str(f1_score(y_train,train_pred))))

Scores for model on test set

Accuracy Score : 0.8742273730684327
Precision Score : 0.6505025125628141
Recall Score : 0.7446074201898188
F1 Score : 0.6943811184122302


Scores for model on train set

Accuracy Score : 0.8784855885315341
Precision Score : 0.6621741894469167
Recall Score : 0.7488676396577756
F1 Score : 0.702857721245656


In [110]:
# set path and bucket name
PATH = "datasets/predictions_svm_nlp.csv"
bucket = s3.Bucket(name = "jedha-fake-reviews-project")
# export dataset as csv
data = predictions_svm_nlp.to_csv()

#upload to bucket
put_object = bucket.put_object(ACL='private', Key= PATH, Body=data)
#check 
for obj in bucket.objects.all():
    print(obj.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/predictions_svm_nlp.csv
datasets/real_reviews_raw.csv


In [49]:
bucket = s3.Bucket(name = "jedha-fake-reviews-project")

In [50]:
#check 
for obj in bucket.objects.all():
    print(obj.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/prediction_meta_data.csv
datasets/predictions_svm_nlp.csv
datasets/real_reviews_raw.csv
datasets/svc_predictions_meta_data.csv


In [160]:
dataset.to_csv("/Users/personal/Dropbox/dataset_tableau.csv", sep="}")

In [153]:
dataset.to_excel("/Users/personal/Dropbox/dataset_tableau.xlsx")  

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("

  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "
  warn("Ignoring URL '%s' since it exceeds Excel's limit of "


KeyboardInterrupt: 

In [161]:
ls

Meta_data_analysis.ipynb     Untitled.ipynb
Neural_network.ipynb         nlp_classifier.ipynb
Stacking_NLP_METADATA.ipynb


In [162]:
cd ../

/Users/personal/Dropbox/Jehda/nlp_project/git/Fake_reviews_detection


In [163]:
ls

[34m0_Scraping[m[m/        [34m2_Deployment[m[m/      [34mScraping[m[m/
[34m1_Training_models[m[m/ README.md          [34mTraining_models[m[m/


In [164]:
cd 1_Training_models

/Users/personal/Dropbox/Jehda/nlp_project/git/Fake_reviews_detection/1_Training_models


In [166]:
mkdir text_only_models

In [167]:
cd text_only_models

/Users/personal/Dropbox/Jehda/nlp_project/git/Fake_reviews_detection/1_Training_models/text_only_models


In [171]:
import joblib

In [172]:

# Save to file in the current working directory
joblib_file = "text_vectorizer.pkl"
joblib.dump(vectorizer, joblib_file)




['text_vectorizer.pkl']

In [46]:
import joblib

In [47]:
# Save to file in the current working directory
joblib_file = "topic_extractor.pkl"
joblib.dump(svd, joblib_file)


['topic_extractor.pkl']

In [175]:
# Save to file in the current working directory
joblib_file = "main_model.pkl"
joblib.dump(svc_clf2, joblib_file)

['main_model.pkl']

In [176]:
# Save to file in the current working directory
joblib_file = "preprocessor.pkl"
joblib.dump(preprocessor, joblib_file)

['preprocessor.pkl']

In [51]:
vectorizer

TfidfVectorizer(min_df=200)

Meta_data_analysis.ipynb     nlp_classifier.ipynb
Neural_network.ipynb         [34mtext_only_models[m[m/
Stacking_NLP_METADATA.ipynb  topic_extractor.pkl
Untitled.ipynb


In [55]:

# Save to file in the current working directory
joblib_file = "text_vectorizer2.pkl"
joblib.dump(vectorizer, joblib_file)

['text_vectorizer2.pkl']