# Import and Clean DS

In [7]:
import pandas as pd
import boto3
import glob
import io
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
import fr_core_news_sm
from s3_credentials import *
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
nlp = fr_core_news_sm.load()

In [6]:
cd '/Users/admin/Jedha/'

/Users/admin/Jedha


In [8]:
# connect to s3 instance

#YOUR_ACCESS_KEY = 
#YOUR_SECRET_KEY = 

session = boto3.Session(aws_access_key_id= YOUR_ACCESS_KEY, 
                        aws_secret_access_key= YOUR_SECRET_KEY)

s3 = session.resource("s3")
client = session.client("s3")

In [9]:
#full DS
obj = s3.Object('jedha-fake-reviews-project', "datasets/full_dataset.csv")
dataset = pd.read_csv(io.BytesIO(obj.get()['Body'].read()), low_memory = False, index_col=0)

In [10]:
#_____________________________________________________________________
######### Cleaning the dataset and adding new columns #########
#_____________________________________________________________________

#we drop rows in which restaurant infos are not available (miss scraped)
dataset = dataset.dropna(subset = ['restaurant_average_rating', 'restaurant_reviews_count', 'restaurant_expensiveness', 'restaurant_name'])

#adding a column with the length of the text review
dataset['text_length'] = dataset['text_review'].apply(lambda x : len(x))

#_____________________________________________________________________
######### Fixing existing columns values and types #########
#_____________________________________________________________________

#for the user_total_image_posted column, if user_total_image_posted is NA it means there is there's no image
    # so we set the value to 0
dataset.loc[dataset['user_total_image_posted'].isna(), 'user_total_image_posted'] = 0

#for the date column,  there is some miss scraps that we want to fix
    # a correct data must have a length of 10 , if it is smaller than 10 it's becasue we scrapped the number of images of the user instead
    # we may have to scrap again those lines to fix it
    # we keep only the rows where the date is correct 
mask_not_date = dataset['date'].apply(lambda x: len(x)) < 10
dataset = dataset.loc[mask_not_date == False, :]
    # if te length is greater than 10 is it is beacause we scraped the date + somme additional words ('Avis mis à jour') so we will keep only the part with the date
mask_date_to_fix = dataset['date'].apply(lambda x: len(x)) > 10
dataset.loc[mask_date_to_fix, 'date' ] = dataset.loc[mask_date_to_fix, 'date' ].str.split('\n').str[0]
    #finally we can convert the date column to a datetime format
dataset['date'] = pd.to_datetime(dataset['date'])

#for the photos_for_review column, 
    # value -1 is in fact 0 (no photos found by the scraper)
dataset.loc[dataset['photos_for_review'] == '-1.0', 'photos_for_review' ] = '0'
    # value L is in fact 0 (no photos found by the scraper but scraped the first letter of "L'avis du jour" which happens when the reviews was updated by the user)
dataset.loc[dataset['photos_for_review'] == 'L', 'photos_for_review' ] = '0'
    # finally we can convert the photos_for_review column to an int format
dataset['photos_for_review'] = dataset['photos_for_review'].astype('int')

#for the photos_for_review column, 
    # when there's no info about the expensiveness we set it to -1
dataset.loc[dataset['restaurant_expensiveness'] == 'N/C', 'restaurant_expensiveness']  = -1
    # we can convert the restaurant_expensiveness column to an int format
dataset['restaurant_expensiveness'] = dataset['restaurant_expensiveness'].astype('int')

# change is real review for is fake review as it's better for sklearn 
dataset["is_fake_review"] = dataset["is_real_review"].apply(lambda x: '1' if x == 0 else '0')
dataset["is_fake_review"] = dataset["is_fake_review"].astype(int)
dataset = dataset.drop(columns="is_real_review")

# reset index 
dataset = dataset.reset_index(drop = True)


In [11]:
french_reviews = dataset.loc[dataset['language'] =='fr',['text_review', 'is_fake_review']].reset_index(drop=True)

# Preprocessing for NLP

In [14]:
data = french_reviews.copy()

In [27]:
from unidecode import unidecode

In [28]:
#Clean and Lemmatize the data

#removes '/n', and any non alphanumeric character and finally if there's more than one space in a row it turns it to one space
data["text_review_clean"] = data["text_review"].str.replace(r"[\n]*?[^A-zÀ-ÿ0-9' ]+", ' ').str.replace(r" +"," ").str.lower()
data['text_review_clean'] = data['text_review_clean'].apply(lambda x : unidecode(str(x)))
data["text_review_clean"] = data["text_review_clean"].apply(lambda x: " ".join([token.lemma_ for token in nlp(x) if token.lemma_ not in 
STOP_WORDS]))



# Tokenizing, lemmatizing and deleteing stopwords from doc with Spacy


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [16]:
data['text_review_clean'] = data['text_review_clean'].astype(str)

In [158]:
# apply vectorizer to the review column
vectorizer = TfidfVectorizer(smooth_idf=True, min_df=200)
X = vectorizer.fit_transform(data['text_review_clean'])

In [159]:
len(vectorizer.vocabulary_)

2102

# Topic Extraction

In [19]:
# import from sklearn
from sklearn.decomposition import TruncatedSVD

In [160]:
# set it to 12 different topics 
svd = TruncatedSVD(n_components= 500)
# fit to our matrix --> last two columns are those with the previous cluster_values
lsa = svd.fit_transform(X)

In [40]:
pd.DataFrame(lsa).to_csv('lsa.csv')

TypeError: to_csv() got an unexpected keyword argument 'index_col'

In [12]:
lsa = pd.read_csv('lsa.csv', index_col = 0)

In [21]:
print(svd.explained_variance_ratio_.sum())

0.5976252366728456


In [14]:
data = french_reviews.copy()

# Feature Engineering

In [185]:
len_review = data["text_review"].apply(lambda x : len(str(x)))
#len_review = pd.qcut(len_review_n, 2, labels = ['low', 'high'])


In [186]:
data["text_review"] = data['text_review'].str.replace('\n', ' ')


In [486]:
data['clean'] = data["text_review"].apply(lambda x : "".join(ch for ch in x if ch.isalnum() or ch ==' ' )).str.lower()
data['clean_up'] = data["text_review"].apply(lambda x : "".join(ch for ch in x if ch.isalnum() or ch ==' ' ))

In [387]:
data['upper_word_count'] = data['clean'].apply(lambda x : sum(map(str.isupper, x.split())) )
#upper_word_count = pd.qcut(data['upper_word_count'].rank(method = 'first'), 3, labels = ['low', 'mid', 'high'])
#upper_word_count = pd.cut(data['upper_word_count'], bins = [0,1,160], labels = ['low', 'high'], include_lowest=True)
#upper_word_count = pd.qcut(data['upper_word_count'], 3, labels = ['low', 'mid','high'])

In [None]:
superlatifs = ['bon', 'bonne', 'bons', 'bonnes', 'meilleur', 'meilleure', 'meilleures',
'mauvais', 'mauvaise', 'pire','petit', 'petite', 'moindre','plus', 'mieux', 'gros', 'impossible', 'totalement', 'loin', 'absence', 'très', 'tres', 'décu', 'decu', 'trop', 'jamais', 'toujours', 'aucun', 'déplorable', 'éviter', 'eviter', 'absolument', 'infect', 'infecte', 'fuir', 'fuire']

In [485]:
data['clean'].head()

0    bon retour  je suis revenue dans ce resto aprè...
1    a optimiser cuisine très traditionnelle dans u...
2    brasserie chic une brasserie authentiquement p...
3    tres bien petit diner entre amis les plats eta...
4    un bistrot bien sympathique nous avons mangé e...
Name: clean, dtype: object

In [None]:
#New features :
#Word count
data['word_count'] = data['text_review'].apply(lambda x : len(x.split()) )
#Words in uppercase
data['upper_word_count'] = data['clean_up'].apply(lambda x : sum(map(str.isupper, x.split())) )
#punctuation_count
data['punctuation'] = data['text_review'].apply(lambda x : len(''.join(ch for ch in x if ch ==',' or ch =='.' or ch ==';' or ch ==':' or ch =='?' or ch.isnumeric() or ch =='!')))
#Count of Superlatives
data['sup_count'] = data['clean'].apply(lambda x :  len([word for word in x.split() if word in superlatifs]) )
#amoount of character euro
data['euros'] = data['clean'].apply(lambda x : len(''.join(ch for ch in x if ch =='€')))
#Negation
data['negation'] = data['clean'].apply(lambda x :  len([word for word in x.split() if word in ['n', 'ne']]) )

In [511]:
data['exclam_count']

0        2
1        1
2        1
3        0
4        0
        ..
90592    2
90593    2
90594    3
90595    0
90596    0
Name: exclam_count, Length: 90597, dtype: int64

In [513]:
ss = StandardScaler()

In [517]:
word_count =  ss.fit_transform(data['word_count'].values.reshape(1,-1))
upper_word_count =  ss.fit_transform(data['upper_word_count'].values.reshape(1,-1) )
punctuation =  ss.fit_transform(data['punctuation'].values.reshape(1,-1) )
sup_count =  ss.fit_transform(data['sup_count'].values.reshape(1,-1) )
negation =  ss.fit_transform(data['negation'].values.reshape(1,-1) )
exclam_count =  ss.fit_transform(data['exclam_count'].values.reshape(1,-1) )


In [527]:
indicateur = (1+negation)*(1+punctuation)*(1+sup_count)*(1+upper_word_count)/(1+word_count*exclam_count)

In [508]:
data['indicateur'] = (data['negation'])*(1+data['punctuation'])*(1+data['sup_count'])*(1+data['upper_word_count'])/((data['word_count']*(1+data['exclam_count'])))
data['log_indic'] = np.log(data['indicateur'])

In [535]:
data['exclam_count'].head()

0    2
1    1
2    1
3    0
4    0
Name: exclam_count, dtype: int64

In [456]:
data['log_indic'] = pd.qcut(data['log_indic'], 5, labels = ['low', 'mid', 'high','4','5'])

In [528]:
data['indicateur'] =  pd.qcut(data['indicateur'], 3, labels = ['low', 'mid', 'high'])

ValueError: Bin edges must be unique: array([  0. ,   0. ,   0. , 277.2]).
You can drop duplicate edges by setting the 'duplicates' kwarg

In [530]:
ind = pd.DataFrame(indicateur)

In [533]:
ind.value_counts()

Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x7fbabbc2c3b0>
Traceback (most recent call last):
  File "/Users/admin/opt/anaconda3/lib/python3.7/weakref.py", line 358, in remove
    def remove(k, selfref=ref(self)):
KeyboardInterrupt


.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    

In [531]:
pd.qcut(ind, 3, labels = ['low', 'mid', 'high'])

ValueError: Input array must be 1 dimensional

In [501]:
data['indicateur'].value_counts().values

array([64449,   113,    93, ...,     1,     1,     1])

In [493]:
sns.histplot(data = data, x = 'indicateur')

KeyboardInterrupt: 

In [None]:
upper_word_count = pd.qcut(data['upper_word_count'].rank(method = 'first'), 3, labels = ['low', 'mid', 'high'])

In [234]:
data['word_count'] = data['text_review'].apply(lambda x : len(x.split()) )

In [226]:
#data['exclam_count'] = data['text_review'].apply(lambda x : len(''.join(ch for ch in x if ch =='!')))
data['punctuation'] = data['text_review'].apply(lambda x : len(''.join(ch for ch in x if ch ==',' or ch =='.' or ch ==';' or ch ==':' or ch =='?' or ch.isnumeric() or ch =='!')))
#data['qmark'] = data['text_review'].apply(lambda x : len(''.join(ch for ch in x if ch =='?')))
#data['num'] = data['text_review'].apply(lambda x : len(''.join(ch for ch in x if ch.isnumeric())))
#exclam_count = pd.qcut(data['exclam_count'].rank(method = 'first'), 3, labels = ['low', 'low-mid', 'very_high'])
#exclam_count = pd.qcut(data['exclam_count'], 5, labels = ['low', 'mid', 'high', 'hh', 'hhh'],)
#exclam_count = pd.cut(data['exclam_count'], bins =  [0, 1, 3, 5, 133], labels = ['low','high', 'hhh', 'hhhh'], include_lowest=True)
#data['emotion'] = (data['exclam_count']+1)*(data['upper_word_count']+1)/*len_review_n)


In [609]:
 #df = pd.concat([pd.DataFrame(lsa), len_review, emotion, upper_word_count ], axis = 1)
#df = pd.concat([pd.DataFrame(lsa), data['word_count'], data['upper_word_count'], data['punctuation'], data['sup_count'], data['euros'], data['negation'], data['exclam_count'] ], axis = 1)

df = pd.concat([data['word_count'], data['upper_word_count'], data['punctuation'], data['sup_count'], data['euros']], axis = 1)

In [610]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [611]:
# split y 
y = data["is_fake_review"]

In [612]:
X_train, X_test, y_train, y_test = train_test_split(df,y, 
                                                    test_size = 0.2,
                                                    stratify = y , ## Statify splitting when you're training a classification model !
                                                    random_state = 19)

90598

In [613]:
# Create pipeline for categorical features

X_train, X_test, y_train, y_test = train_test_split(df,y, 
                                                    test_size = 0.2,
                                                    stratify = y , ## Statify splitting when you're training a classification model !
                                                    random_state = 19)

#categorical_features = [index for index, c in enumerate(df.columns) if c in ['text_review','emotion', 'upper_word_count' ] ]
#numerical_features = [index for index, c in enumerate(df.columns) if c not in ['text_review','emotion', 'upper_word_count' ] ] # 
# Positions of categorical columns in X_train/X_test
#categorical_features = [index for index, c in enumerate(df.columns) if c in ['text_review',	'emotion',	'upper_word_count',	'exclam_count'] ]
#numerical_features = [index for index, c in enumerate(df.columns) if c not in ['text_review',	'emotion',	'upper_word_count',	'exclam_count'] ] # 

numerical_features = [i for i in range(len(df.columns))]

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

#
#categorical_transformer = Pipeline(
#    steps=[
#    ('encoder', OneHotEncoder(drop='first'))
#    ])# first column will be dropped to avoid creating correlations between features



In [614]:
# Create pipeline for numeric features

# Use ColumnTranformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', numeric_transformer, numerical_features),
#        ('cat', categorical_transformer, categorical_features)
    ])


In [615]:
# Preprocessings on train set

X_train = preprocessor.fit_transform(X_train)

# Preprocessings on test set
X_test = preprocessor.transform(X_test) 

#Whole Set

#X_w = preprocessor.transform(df)


In [616]:
X_test.shape

(18120, 5)

In [260]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


In [90]:
kfold = StratifiedKFold(n_splits = 5, shuffle=True, random_state=0) 

parameters= {'C': [10], \
            'gamma': [1] ,
             "class_weight": [{1:0.67, 0:0.33}, {1:0.75, 0:0.25}, {1:0.8, 0:0.2}, "balanced"] \
           }

model = SVC()
model_svc =GridSearchCV(model, parameters, cv=kfold, verbose=2, scoring="f1")
model_svc.fit(X_train,y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.9min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.9min remaining:    0.0s


[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.6min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.7min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.8min
[CV] C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.67, 0: 0.33}, gamma=1, total= 1.7min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1, total= 1.9min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1, total= 1.8min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] ... C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1, total= 1.6min
[CV] C=10, class_weight={1: 0.75, 0: 0.25}, gamma=1 ..................
[CV] .

[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed: 43.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=SVC(),
             param_grid={'C': [10],
                         'class_weight': [{0: 0.33, 1: 0.67},
                                          {0: 0.25, 1: 0.75}, {0: 0.2, 1: 0.8},
                                          'balanced'],
                         'gamma': [1]},
             scoring='f1', verbose=2)

In [468]:
# Train model
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=0.95, class_weight={0:1, 1:2.1}, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


model.fit(X_train, y_train)

LogisticRegression(C=0.95, class_weight={0: 1, 1: 2.1})

In [469]:
from sklearn.metrics import f1_score
print('f1 test :',f1_score(y_test, model.predict(X_test)))
print('f1 train :',f1_score(y_train, model.predict(X_train)))

f1 test : 0.0
f1 train : 0.0


In [58]:
mask = french_reviews['is_fake_review']==1

In [61]:
df[mask].head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,493,494,495,496,497,498,499,text_review,exclam_count,upper_word_count
73211,0.270003,-0.234059,0.009681,-0.089303,-0.005257,-0.056454,-0.080683,-0.032563,0.051386,0.007209,...,0.002946,-0.03268,-0.007632,0.002838,0.007894,-0.026271,-0.011672,high,high,high
73212,0.174517,0.117866,-0.057288,0.010194,-0.099414,-0.086875,-0.082387,0.065344,-0.09006,0.037045,...,-0.036609,0.073429,-0.028386,-0.033309,-0.062453,0.053776,-0.001346,low,high,mid
73213,0.148645,-0.041817,-0.158227,0.114529,0.066686,0.035117,0.003303,0.066757,0.081229,-0.079213,...,-0.00068,0.000233,-0.008877,0.031703,0.023139,-0.033131,0.026393,low,high,high
73214,0.138248,-0.006416,0.003036,0.165525,-0.059385,0.013231,-0.050222,0.024927,0.021774,-0.033123,...,-0.007123,0.005827,0.003464,-0.010398,0.003535,0.012278,-0.000651,low,very_high,mid
73215,0.209048,-0.050243,-0.002659,0.164238,0.154815,0.025351,-0.019732,0.030266,0.036267,-0.057702,...,-0.01947,0.005984,0.033023,0.007848,-0.043087,-0.043972,-0.03577,low,very_high,high


In [60]:
model.predict(df[mask])

ValueError: could not convert string to float: 'high'

In [164]:
svc_model = SVC(C=1.0, break_ties=False, cache_size=200, class_weight={0:1, 1:1.8}, coef0=0.0,
                decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
                max_iter=-1, probability=False, random_state=None, shrinking=True,
                tol=0.001, verbose=False)

svc_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import f1_score
print('f1 test :',f1_score(y_test, svc_model.predict(X_test)))
print('f1 train :',f1_score(y_train, svc_model.predict(X_train)))

In [95]:
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
    
print("Scores for model on test set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_test,test_pred))))
print('Precision Score : {}'.format(str(precision_score(y_test,test_pred))))
print('Recall Score : {}' .format(str(recall_score(y_test,test_pred ))))
print('F1 Score : {}'.format(str(f1_score(y_test,test_pred))))
    
print("")
print("")
print("Scores for model on train set")
print("")
print('Accuracy Score : {}'.format(str(accuracy_score(y_train,train_pred))))
print('Precision Score : {}'.format(str(precision_score(y_train,train_pred))))
print('Recall Score : {}' .format(str(recall_score(y_train,train_pred))))
print('F1 Score : {}'.format(str(f1_score(y_train,train_pred))))

Scores for model on test set

Accuracy Score : 0.8770317615300672
Precision Score : 0.5947910357359176
Recall Score : 0.7097940007228045
F1 Score : 0.6472235953204811


Scores for model on train set

Accuracy Score : 0.8775703618609995
Precision Score : 0.5962121212121212
Recall Score : 0.7111874209289716
F1 Score : 0.6486441935218


In [97]:
svc_clf2 = SVC(C=10, class_weight={0: 0.33, 1: 0.67}, gamma=1,  probability=True)

In [98]:
svc_clf2.fit(X_train,y_train)

SVC(C=10, class_weight={0: 0.33, 1: 0.67}, gamma=1, probability=True)

In [110]:
# set path and bucket name
PATH = "datasets/predictions_svm_nlp.csv"
bucket = s3.Bucket(name = "jedha-fake-reviews-project")
# export dataset as csv
data = predictions_svm_nlp.to_csv()

#upload to bucket
put_object = bucket.put_object(ACL='private', Key= PATH, Body=data)
#check 
for obj in bucket.objects.all():
    print(obj.key)

datasets/fake_reviews_raw.csv
datasets/full_dataset.csv
datasets/full_dataset_reworked.csv
datasets/predictions_svm_nlp.csv
datasets/real_reviews_raw.csv


In [192]:
import joblib

In [193]:

# Save to file in the current working directory
joblib_file = "text_vectorizer.pkl"
joblib.dump(vectorizer, joblib_file)




['text_vectorizer.pkl']

In [194]:
# Save to file in the current working directory
joblib_file = "topic_extractor.pkl"
joblib.dump(svd, joblib_file)


['topic_extractor.pkl']

In [195]:
# Save to file in the current working directory
joblib_file = "main_model.pkl"
joblib.dump(model, joblib_file)

['main_model.pkl']

In [234]:
# Save to file in the current working directory
joblib_file = "preprocessor.pkl"
joblib.dump(preprocessor, joblib_file)

['preprocessor.pkl']

In [626]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100)

In [627]:
clf.fit(X_train, y_train)

RandomForestClassifier()

In [603]:
from sklearn.naive_bayes import GaussianNB

In [617]:
gnb = GaussianNB()

In [618]:
gnb.fit(X_train, y_train)

GaussianNB()

In [624]:
clf.score(X_train, y_train)

0.8719869751783325

In [620]:
X_train.shape

(72477, 5)

In [644]:

clf = RandomForestClassifier(n_estimators=100)

In [639]:
clf.fit(X_train, y_train)

RandomForestClassifier(min_samples_split=15)

In [642]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
XPC_train = pca.fit_transform(X_train)
XPC_test = pca.transform(X_test)

In [645]:
clf.fit(XPC_train, y_train)

RandomForestClassifier()