In [1]:
# Connect to S3.
# Credentials are read from the environment rather than being written into the
# notebook, so secrets never end up in version control or in shared outputs.
# When a variable is unset, None is passed and boto3 resolves credentials
# through its default chain (shared config files, instance role, ...).
import os

import boto3

YOUR_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID")
YOUR_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

session = boto3.Session(aws_access_key_id=YOUR_ACCESS_KEY,
                        aws_secret_access_key=YOUR_SECRET_KEY)

# High-level resource API (used to fetch objects) and low-level client
s3 = session.resource("s3")
client = session.client("s3")

In [2]:
import pandas as pd
import glob

In [3]:
import io

In [None]:
# Fetch the full reviews CSV from the project bucket and parse it into a DataFrame,
# using the first CSV column as the index
obj = s3.Object('jedha-fake-reviews-project', "datasets/full_dataset.csv")
csv_bytes = obj.get()['Body'].read()
full_dataset = pd.read_csv(
    io.BytesIO(csv_bytes),
    index_col=0,
    low_memory=False,
)

In [None]:
# Display the loaded dataset
full_dataset

In [None]:
# Draw 30,000 French-language reviews at random.
# random_state pins the draw so that Restart & Run All yields the same sample
# (and therefore the same vocabulary, topics and scores) every time.
sample = full_dataset[full_dataset["language"] == "fr"].sample(30000, random_state=0)

In [None]:
# Display the sampled French reviews
sample

In [None]:
# Keep only the review text, its language and the real/fake label
sample = sample.loc[:, ["text_review", "language", "is_real_review"]]

In [None]:
# Replace the index with a fresh 0..n-1 range; the previous index is kept as a column
sample = sample.reset_index()

In [None]:
# Let pandas render up to 1000 characters per cell so long reviews are readable in displays
pd.set_option('max_colwidth', 1000)

In [None]:
# Display the reduced sample (three kept columns plus the former index)
sample

In [None]:
import pandas as pd
import numpy as np 
import spacy
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN
import fr_core_news_md

In [None]:
# Work on an independent copy so the sampled frame itself is not modified by the cleaning steps
data = sample.copy()

In [None]:
# Character length of each review; every value is passed through str() first,
# so non-string entries are measured by their string form
data["len_review"] = [len(str(review)) for review in data["text_review"]]

In [None]:
# Normalize the raw text: trim surrounding whitespace, then lowercase it
data["text_review_clean"] = data["text_review"].str.strip().str.lower()


In [None]:
# Replace simple HTML tags such as <br> or </p> with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave every tag in place.
data["text_review_clean"] = data["text_review_clean"].str.replace(r"<[a-z/]+>", " ", regex=True)


In [None]:
import string

# Translation table that deletes every character of string.punctuation;
# built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return `text` with all ASCII punctuation characters removed.

    Parameters
    ----------
    text : str
        The text to clean. Any non-string value (for example the NaN that
        pandas uses for a missing review, or None) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or an empty string for missing / non-string input,
        so that later string operations never meet a float.
    """
    if not isinstance(text, str):
        return ""
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every cleaned review, one value at a time
data["text_review_clean"] = data["text_review_clean"].map(remove_punctuation)

In [None]:
# Spot-check two random rows after cleaning
data.sample(2)

# Tokenizing, lemmatizing and deleting stopwords from doc with spaCy


In [None]:

# Count how often each whitespace-separated word occurs across all cleaned reviews;
# the result is kept as the Counter's (word, count) items view
from collections import Counter

corpus_text = ' '.join(data["text_review_clean"])
word_count = Counter(corpus_text.split()).items()
print(len(word_count))

In [None]:
# Turn the (word, count) pairs into a two-column frame, most frequent words first
word_count = pd.DataFrame(list(word_count), columns=['word', 'count'])
word_count = word_count.sort_values('count', ascending = False)

In [None]:
# Print the (rows, columns) shape of the word-frequency table, then show its first two rows
print(word_count.shape)
word_count.head(2)

In [None]:
# Keep the words that occur at least 2000 times in the sample.
# .copy() makes `commonwords` an independent frame, so the column assignments
# made on it later do not raise SettingWithCopyWarning.
commonwords = word_count.loc[word_count["count"]>=2000, :].copy()
commonwords

In [None]:

# create nlp instance
# Loads the fr_core_news_md spaCy pipeline; `nlp` is what parses text into tokens with lemmas below
nlp =  fr_core_news_md.load()

In [None]:

# Lemmatize the common words: parse each one with spaCy and keep the list of its token lemmas
commonwords["word"] = commonwords["word"].apply(
    lambda word: [tok.lemma_ for tok in nlp(word)]
)
commonwords.head(5)

In [None]:
# join
# Each entry is a list of lemma strings at this point (one per token spaCy found
# in the word); concatenate them back into a single string with no separator.
commonwords["word"] = commonwords["word"].str.join("")

In [None]:
# Series of lemmatized common words, to be merged into the stop-word set
common_words = commonwords["word"]
common_words

In [None]:
# Extend spaCy's French stop words with the corpus-specific common words
from spacy.lang.fr.stop_words import STOP_WORDS
print(len(STOP_WORDS))

# "-PRON-" (pronoun lemma placeholder) is added too, since pronouns are not needed
STOP_WORDS_MAX = STOP_WORDS | set(common_words) | {"-PRON-"}
print(len(STOP_WORDS_MAX))

In [None]:

# Parse every cleaned review into a spaCy Doc.
# nlp.pipe processes the texts as a batched stream, which is more efficient
# than calling nlp() once per row through .apply; the resulting Docs are
# stored positionally, one per row.
data["clean_tokens"] = list(nlp.pipe(data["text_review_clean"]))

In [None]:
# Show the first five rows, now including the parsed spaCy documents
data.head(5)

In [None]:
# For every parsed review, keep the lemma of each token unless that lemma is in the
# extended stop-word set (lemmatizing and filtering happen in a single pass)
data['clean_tokens_lemmatized'] = data['clean_tokens'].map(
    lambda parsed: [tok.lemma_ for tok in parsed if tok.lemma_ not in STOP_WORDS_MAX]
)
data.head(5)

In [None]:
### join all of them into new df column
# method 1
# Rebuild one space-separated string per review from its list of kept lemmas
data["clean_review"] = data["clean_tokens_lemmatized"].str.join(" ")

In [None]:
# Number of rows in the working frame
len(data)

# Creating a TFIDF Matrix


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# apply vectorizer to the review column
# Learns the vocabulary and IDF weights from the cleaned reviews and returns the
# document-term matrix X (one row per review, one column per term).
vectorizer = TfidfVectorizer(smooth_idf=True)
X = vectorizer.fit_transform(data['clean_review'])

In [None]:

# transform this sparse matrix into a numpy array
# NOTE: this materializes the full (reviews x terms) matrix in memory.
X_dense = X.toarray()
print(X_dense.shape)

In [None]:
# let's check out the vocabulary of this doc
# Prints the number of distinct terms, then displays the term -> column-index mapping
print(len(vectorizer.vocabulary_))
vectorizer.vocabulary_

In [None]:

# Put the TF-IDF matrix into a DataFrame: one column per vocabulary term, one row per review.
# The vectorizer returns the feature names in the same order as the columns of X_dense.
# get_feature_names() was removed in scikit-learn 1.2, so get_feature_names_out()
# is preferred, with a fallback to the old name for older installations.
feature_names = (vectorizer.get_feature_names_out()
                 if hasattr(vectorizer, "get_feature_names_out")
                 else vectorizer.get_feature_names())

# Row labels are derived from the actual number of rows instead of a hard-coded
# 30000, so the cell keeps working when the sample size changes.
X_df = pd.DataFrame(X_dense,
             columns=feature_names,
             index=["review_{}".format(i) for i in range(1, X_dense.shape[0] + 1)])

In [None]:
# Display the TF-IDF frame
X_df

# Topic Extraction

In [None]:
# import from sklearn
from sklearn.decomposition import TruncatedSVD

In [None]:
# Reduce the TF-IDF features to 80 latent components ("topics").
# random_state pins TruncatedSVD's randomized solver so the components are
# identical from one run to the next.
svd = TruncatedSVD(n_components= 80, random_state=0)

# Fit on the TF-IDF frame and get, for each review, its coordinates on the 80 components
lsa = svd.fit_transform(X_df)

In [None]:
# Wrap the LSA output in a frame with one "topic_<k>" column per component, k starting at 1
topic_columns = ["topic_{}".format(k + 1) for k in range(lsa.shape[1])]
topic_encoded_df = pd.DataFrame(lsa, columns=topic_columns)
topic_encoded_df.head()

# Clean Data For Classifier

In [None]:
# Start the classifier dataset from a copy of the topic features
data_cl = topic_encoded_df.copy()

In [None]:
# Attach the label by position (plain values, so no index alignment takes place)
data_cl["is_real_review"] = data["is_real_review"].tolist()

In [None]:
# Attach the review length by position as an extra feature
data_cl["len_review"] = data["len_review"].tolist()

In [None]:
# Display the classifier dataset (topic features, label and review length)
data_cl

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Class balance check: non-null count of every column for each value of is_real_review
data_cl.groupby("is_real_review").count()

In [None]:
# Flip the label: is_fake_review is 1 where is_real_review equals 0 and 0 otherwise,
# then drop the original column
data_cl["is_fake_review"] = (data_cl["is_real_review"] == 0).astype(int)
data_cl = data_cl.drop(columns="is_real_review")

In [None]:
# Feature matrix: every column except the target
X_cl = data_cl.drop(columns=["is_fake_review"])
X_cl.head()

In [None]:
# Target vector: 1 for a fake review, 0 for a real one
y = data_cl.loc[:, "is_fake_review"]

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label so
# both parts keep the same class proportions, and seeded for reproducibility.
split_options = dict(test_size=0.2, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X_cl, y, **split_options)

In [None]:
# Standardize the review length. The scaler is fitted on the training split only
# and then reused on the test split, so no test information leaks into training.
scaler = StandardScaler()

# Work on explicit copies: assigning into the frames returned by train_test_split
# can raise SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# The scaler returns an (n, 1) array; ravel() flattens it to the 1-D shape of a column.
X_train["len_review"] = scaler.fit_transform(X_train[["len_review"]]).ravel()
X_test["len_review"] = scaler.transform(X_test[["len_review"]]).ravel()

In [None]:
## defining a function that prints out the scores of a given classifier
def print_scores(model_name, X_train = None , X_test = None, y_test = None , y_train = None):
    """Print accuracy, precision, recall and F1 of a fitted classifier on the test and train sets.

    Parameters
    ----------
    model_name : fitted estimator
        Any object exposing ``predict(X)``.
    X_train, X_test, y_test, y_train : optional
        Data to score on. An argument left as None is looked up in the notebook
        variable of the same name *when the function is called*. Binding the
        variables as default values at definition time would freeze whichever
        split existed when this cell was run.

    Returns
    -------
    None
        The scores are printed, test set first, then train set.
    """
    from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score

    # Resolve omitted arguments against the current notebook variables
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars["X_train"]
    if X_test is None:
        X_test = notebook_vars["X_test"]
    if y_test is None:
        y_test = notebook_vars["y_test"]
    if y_train is None:
        y_train = notebook_vars["y_train"]

    def _report(title, y_true, features):
        # Predict once per split and reuse the predictions for all four metrics
        y_pred = model_name.predict(features)
        print(title)
        print("")
        print('Accuracy Score : {}'.format(str(accuracy_score(y_true, y_pred))))
        print('Precision Score : {}'.format(str(precision_score(y_true, y_pred))))
        print('Recall Score : {}' .format(str(recall_score(y_true, y_pred))))
        print('F1 Score : {}'.format(str(f1_score(y_true, y_pred))))

    _report("Scores for model on test set", y_test, X_test)

    print("")
    print("")
    _report("Scores for model on train set", y_train, X_train)

In [None]:
import seaborn as sns

In [None]:
# def function that plots the confusion matrices

def show_confusion_matrix(model_name,X_train = None , X_test = None, y_test = None , y_train = None ):
    """Plot the confusion matrices of a fitted classifier, test set on the left, train set on the right.

    Parameters
    ----------
    model_name : fitted estimator
        Any object exposing ``predict(X)``.
    X_train, X_test, y_test, y_train : optional
        Data to evaluate on. An argument left as None is looked up in the
        notebook variable of the same name when the function is called, so the
        plot always reflects the current train/test split.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.

    Notes
    -----
    Uses the notebook-level seaborn import ``sns``.
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics import confusion_matrix

    # Resolve omitted arguments against the current notebook variables
    notebook_vars = globals()
    if X_train is None:
        X_train = notebook_vars["X_train"]
    if X_test is None:
        X_test = notebook_vars["X_test"]
    if y_test is None:
        y_test = notebook_vars["y_test"]
    if y_train is None:
        y_train = notebook_vars["y_train"]

    fig, (ax1, ax2) = plt.subplots(1,2,figsize=(10, 4))

    panels = (
        (ax1, 'Confusion Matrix of the test set', y_test, X_test),
        (ax2, 'Confusion Matrix of the train set', y_train, X_train),
    )
    for ax, title, y_true, features in panels:
        cfm = confusion_matrix(y_true, model_name.predict(features))
        sns.heatmap(cfm, annot=True, fmt="g", cmap="seismic", ax=ax)
        # Title and labels are applied after drawing: sns.heatmap resets the
        # axis labels, so setting them beforehand leaves the axes unlabeled.
        ax.set_title(title)
        ax.set_xlabel("Predicted Values")
        ax.set_ylabel("Actual Values")

    plt.tight_layout()
    plt.show()

# Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# rf_clf = RandomForestClassifier()
# rf_clf.fit(X_train, y_train)

In [None]:
# Randomized hyper-parameter search for the random forest:
# 10 sampled configurations, each scored on F1 with stratified 10-fold cross-validation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold

kfold = StratifiedKFold(n_splits = 10, shuffle=True, random_state=0)

parameters = {
    "criterion": ["gini"],
    # None (the Python object) means "no class weighting"; the string "None" is
    # not a valid class_weight and makes every fit that samples it fail.
    "class_weight": [{1:0.67, 0:0.33}, {1:0.75, 0:0.25}, {1:0.8, 0:0.2}, None, "balanced"],
    "max_depth": range(5,50),
    "min_samples_leaf": range(5,50),
    "min_samples_split": [2, 5, 10, 20, 30],
    "n_estimators": [10, 50, 100, 200],
}

# No positional argument: the first positional parameter of RandomForestClassifier
# is n_estimators, so passing "" set an invalid number of trees on the base model.
# random_state on both the forest and the search makes the run repeatable.
model = RandomForestClassifier(random_state=0)
model_rs = RandomizedSearchCV(model, parameters, cv=kfold, verbose=2, n_iter=10,
                              scoring="f1", random_state=0)
model_rs.fit(X_train,y_train)

In [None]:
# Keep the best random forest found by the randomized search
rf_clf = model_rs.best_estimator_


In [None]:
# Confusion matrices of the tuned random forest (test and train sets)
show_confusion_matrix(rf_clf)


In [None]:
# Accuracy / precision / recall / F1 of the tuned random forest
print_scores(rf_clf)

# SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier created with no arguments, i.e. scikit-learn's default settings
svc_clf = SVC()

In [None]:
# Train the SVM on the training split
svc_clf.fit(X_train, y_train)

In [None]:
# Optional randomized hyper-parameter search for the SVM, kept disabled.
# Every line of the cell is now commented out: previously the continuation lines
# of the `parameters` dict were left uncommented, so the cell raised an
# IndentationError on Run All.
# Inside the disabled code, "None" was replaced by None and SVC("") by SVC()
# so that the search is valid if it is ever re-enabled.

# kfold = StratifiedKFold(n_splits = 10, shuffle=True, random_state=0)

# parameters= {'C': [1,10,100,1000], \
#              "class_weight": [{1:0.67, 0:0.33}, {1:0.75, 0:0.25}, {1:0.8, 0:0.2}, None, "balanced"], \
#             'gamma': [1,0.1,0.001,0.0001] , \
#             'kernel':['linear','rbf']}

# model = SVC()
# model_svc =RandomizedSearchCV(model, parameters, cv=kfold, verbose=2, n_iter=10, scoring="f1")
# model_svc.fit(X_train,y_train)

In [None]:
# svc_clf = model_svc.best_estimator_


In [None]:
# Confusion matrices of the SVM (test and train sets)
show_confusion_matrix(svc_clf)


In [None]:
# Accuracy / precision / recall / F1 of the SVM
print_scores(svc_clf)