## Imports:

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import keras

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
# Load the labelled review dataset from the working directory.
dataframe = pd.read_csv(filepath_or_buffer="amazon_reviews_labelled1.csv")

# Preview the first five rows.
dataframe.head(n=5)

## Data Cleaning:

In [None]:
# Count the missing values in each column.
dataframe.isna().sum()

In [None]:
# Set to True to also drop the review metadata features listed in
# `metadata_columns` (the "REMOVING METADATA" variant of this analysis).
# The default keeps the metadata, matching the original behaviour.
REMOVE_METADATA = False

# Columns that are always dropped.
columns_to_drop = [
    'Unnamed: 0.4',
    'Unnamed: 0.3',
    'Unnamed: 0.2',
    'Unnamed: 0.1',
    'Unnamed: 0',
    'AVERAGE_RATING',
    'NUM_REVIEWS',
    'SENTIMENT_CATEGORY_ENCODED',
    'RATING_CATEGORY_ENCODED',
    'COHERENT_ENCODED',
    'NUM_NAMED_ENTITIES',
    'CAPITAL_CHAR_COUNT',
    'PUNCTUATION_COUNT',
    'PREPROCESSED_REVIEW_TEXT',
    'SENTIMENT_SCORE_TITLE',
    'SENTIMENT_LABEL_TITLE',
    'AVG_RATING_VERIFIED',
    'AVG_RATING_NON_VERIFIED',
    'DEVIATION_NON_VERIFIED',
    'Unnamed: 36',
    'Unnamed: 37',
    'Unnamed: 38'
    ]

# Metadata columns that are dropped only when REMOVE_METADATA is enabled.
metadata_columns = [
    'RATING',
    'VERIFIED_PURCHASE',
    'TITLE_LENGTH',
    'RATING_DEVIATION',
    'READABILITY_FRE',
    'DEVIATION_VERIFIED',
    'SENTIMENT_SCORE',
    ]

if REMOVE_METADATA:
    columns_to_drop = columns_to_drop + metadata_columns

# NOTE: this rebinds `dataframe`, so re-running the cell without reloading the
# CSV raises a KeyError (the columns are already gone).
dataframe = dataframe.drop(columns = columns_to_drop)


## Data Processing:

In [None]:
# Express each part-of-speech count as a share of the review's word count,
# then discard the raw counts.
part_of_speech_tags = ['NOUNS', 'VERBS', 'ADJECTIVES', 'ADVERBS']

for tag in part_of_speech_tags:
    dataframe[f'PROP_{tag}'] = dataframe[f'NUM_{tag}'] / dataframe['WORD_COUNT']

dataframe = dataframe.drop(columns=[f'NUM_{tag}' for tag in part_of_speech_tags])

In [None]:
# Display the frame to inspect the columns that remain after cleaning and processing.
dataframe

## Data Analysis:

In [None]:
# Correlation heatmap over the numeric columns only.
numeric_columns = dataframe.select_dtypes(include=np.number)
correlation_matrix = numeric_columns.corr()
sns.heatmap(correlation_matrix, annot=False)

In [None]:
# Draw the pair plot on a reproducible random sample of 200 rows.
subset_dataframe = dataframe.sample(n=200, random_state=42)

# Pairwise relationships between columns, coloured by the class label.
pair_grid = sns.pairplot(data=subset_dataframe, hue='LABEL_ENCODED')

# Render the figure.
plt.show()

*   Label_Encoded 0 = Fake Product Review
*   Label_Encoded 1 = Real Product Review

Open question: might the sentiment score of the review title be an important feature?

# **Unigram Tokenisation:**

In [None]:
# Positional split of the frame into the two classes.
# NOTE(review): assumes the first 10500 rows are the fake reviews and every
# later row is a real review — confirm against LABEL_ENCODED.
split_row_number = 10500

df_fake, df_real = dataframe.iloc[:split_row_number], dataframe.iloc[split_row_number:]

def get_unigrams(sentence):
    """Split `sentence` on whitespace and return the resulting list of tokens."""
    return sentence.split()

# Flatten the tokens of every review into one list per class.
all_unigrams_fake = [
    unigram
    for sentence in df_fake['REVIEW_TEXT']
    for unigram in get_unigrams(sentence)
]
all_unigrams_real = [
    unigram
    for sentence in df_real['REVIEW_TEXT']
    for unigram in get_unigrams(sentence)
]

print("Number of Fake Review Unigrams: ",len(all_unigrams_fake))
print("Number of Real Review Unigrams: ",len(all_unigrams_real))

# List of all unigrams (every occurrence, in review order)
all_unigrams = [
    unigram
    for sentence in dataframe['REVIEW_TEXT']
    for unigram in get_unigrams(sentence)
]

# len(all_unigrams) counts every token occurrence, so it is reported as the
# total; the number of distinct tokens is reported separately.
print("Total Number of Unigrams: ",len(all_unigrams))
print("Total Number of Unique Unigrams: ",len(set(all_unigrams)))

In [None]:
def unigram_frequency(unigram, unigram_list):
    """Return the number of times `unigram` occurs in `unigram_list`.

    The previous body counted in `bigram_list`, a name that is not defined in
    this function, so every call raised NameError.
    """
    return unigram_list.count(unigram)

from collections import Counter

# Per-class occurrence counts of every unigram.
unigram_freq_fake = Counter(all_unigrams_fake)
unigram_freq_real = Counter(all_unigrams_real)

# For each unigram found anywhere in the corpus, its count in the fake
# reviews (0 when it never appears there).
unifrequency_in_fake = {
    unigram: unigram_freq_fake.get(unigram, 0) for unigram in all_unigrams
}

# Same mapping for the real reviews.
unifrequency_in_real = {
    unigram: unigram_freq_real.get(unigram, 0) for unigram in all_unigrams
}


In [None]:
# Print the fake-review occurrence counts of the first 30 unigrams.

from itertools import islice

num_items = 30
first_few_items_fake = {
    unigram: count
    for unigram, count in islice(unifrequency_in_fake.items(), num_items)
}

print(first_few_items_fake)

In [None]:
# For each unigram: occurrences in fake reviews minus occurrences in real reviews.
unigram_differences = {
    unigram: fake_count - unifrequency_in_real.get(unigram, 0)
    for unigram, fake_count in unifrequency_in_fake.items()
}

num_items = 30
first_few_items_differences = dict(islice(unigram_differences.items(), num_items))
print(first_few_items_differences)

*   Positive Difference = More in Fake Reviews
*   Negative Difference = More in Real Reviews

In [None]:
# Rank the unigrams by their fake-minus-real difference, in both directions.
difference_of = lambda pair: pair[1]

asc_unigram_differences = dict(sorted(unigram_differences.items(), key=difference_of))
dsc_unigram_differences = dict(sorted(unigram_differences.items(), key=difference_of, reverse=True))

# Show the 20 most extreme unigrams at each end.
preview_size = num_items - 10
first_asc_differences = dict(islice(asc_unigram_differences.items(), preview_size))
first_dsc_differences = dict(islice(dsc_unigram_differences.items(), preview_size))

print(first_asc_differences)
print(first_dsc_differences)

### **Unigram Normalisation**:

In [None]:
# There are 686,187 unigrams in total; only those that appear at least 250 times are normalised.
# Overall frequency = occurrences in fake reviews + occurrences in real reviews.
ovr_unifrequency = {
    unigram: unigram_freq_fake.get(unigram, 0) + unigram_freq_real.get(unigram, 0)
    for unigram in all_unigrams
}

print(dict(islice(ovr_unifrequency.items(), num_items)))

In [None]:
# Keep only the unigrams whose overall frequency is at least 250.
final_unigrams_freq = {
    unigram: frequency
    for unigram, frequency in ovr_unifrequency.items()
    if frequency >= 250
}

accepted = len(final_unigrams_freq)
print("Number of Unigrams that fit the criteria are: ",accepted)

print(dict(islice(final_unigrams_freq.items(), num_items)))

In [None]:
# Fake-minus-real difference, restricted to unigrams with overall frequency >= 250.
final_unigrams_diff = {}
for unigram, frequency in ovr_unifrequency.items():
    if frequency < 250:
        continue
    final_unigrams_diff[unigram] = unifrequency_in_fake[unigram] - unifrequency_in_real.get(unigram, 0)

print(dict(islice(final_unigrams_diff.items(), num_items)))

In [None]:
# Normalised score = (fake count - real count) / overall count.
# Values are left unrounded here.
norm_unigrams = {
    unigram: final_unigrams_diff.get(unigram, 0) / frequency
    for unigram, frequency in final_unigrams_freq.items()
}

print(dict(islice(norm_unigrams.items(), num_items)))

### **Histogram**:

In [None]:
# Histogram of the normalised unigram scores (60 bins).
fig, ax = plt.subplots()
ax.hist(norm_unigrams.values(), bins = 60, align = 'mid', edgecolor = 'black')

ax.set_xlabel('Normalized Value')
ax.set_ylabel('Count')
ax.set_title('Count of Normalized Values (Unigrams)')

plt.show()

In [None]:
# Keep only the unigrams whose normalised score lies at or beyond either cutoff.
lower_cutoff, upper_cutoff = -0.4, 0.1

final_norm_unigrams = {}
for unigram, value in norm_unigrams.items():
    if value <= lower_cutoff or value >= upper_cutoff:
        final_norm_unigrams[unigram] = value

print(dict(islice(final_norm_unigrams.items(), num_items)))
len(final_norm_unigrams)

A cutoff of <= -0.4 selects 18 unigrams and a cutoff of >= 0.1 selects 19, which gives a fairly balanced split between the two ends.

In [None]:
df_unigrams = dataframe.copy()

def count_unigram_occurrences(row, desired_unigram):
    """Return how many times `desired_unigram` occurs in `row['REVIEW_TEXT']`.

    `row` is anything indexable by 'REVIEW_TEXT' (e.g. a DataFrame row); the
    text is tokenised with `get_unigrams`. Returns 0 when the unigram is absent.
    """
    text = row['REVIEW_TEXT']
    return Counter(get_unigrams(text)).get(desired_unigram, 0)

# Tokenise and count every review once, then reuse the counts for each selected
# unigram. The previous row-wise `apply` re-tokenised every review once per
# selected unigram.
review_unigram_counts = [Counter(get_unigrams(text)) for text in df_unigrams['REVIEW_TEXT']]

# Create one count column per selected unigram.
for unigram in final_norm_unigrams:
    df_unigrams[unigram] = [counts.get(unigram, 0) for counts in review_unigram_counts]

df_unigrams = df_unigrams.fillna(0)

In [None]:
# Lift the column display limit so no column is elided.
pd.options.display.max_columns = None

# Inspect a single review together with its unigram count columns.
row_to_inspect = 1754
print(df_unigrams.iloc[row_to_inspect])

# **Bigram Tokenisation**:

In [None]:
#Label_Encoded 0 = Fake Product Review
#Label_Encoded 1 = Real Product Review

# Make a list of bigrams in fake reviews and real reviews
def get_bigrams(sentence):
    """Return the adjacent word pairs of `sentence` as a list of 2-tuples.

    Words are obtained by splitting on whitespace; a sentence with fewer than
    two words yields an empty list.
    """
    words = sentence.split()
    return list(zip(words, words[1:]))

# Flatten the bigrams of every review into one list per class.
all_bigrams_fake = [
    bigram
    for sentence in df_fake['REVIEW_TEXT']
    for bigram in get_bigrams(sentence)
]
all_bigrams_real = [
    bigram
    for sentence in df_real['REVIEW_TEXT']
    for bigram in get_bigrams(sentence)
]

print("Number of Fake Review Bigrams: ",len(all_bigrams_fake))
print("Number of Real Review Bigrams: ",len(all_bigrams_real))

# List of all bigrams (every occurrence, in review order)
all_bigrams = [
    bigram
    for sentence in dataframe['REVIEW_TEXT']
    for bigram in get_bigrams(sentence)
]


In [None]:
def bigram_frequency(bigram, bigram_list):
    """Return the number of times `bigram` occurs in `bigram_list`."""
    occurrences = bigram_list.count(bigram)
    return occurrences

In [None]:
# Per-class occurrence counts of every bigram.
bigram_freq_fake = Counter(all_bigrams_fake)
bigram_freq_real = Counter(all_bigrams_real)

#FAKE
# For each bigram found anywhere in the corpus, its count in the fake
# reviews (0 when it never appears there).
frequency_in_fake = {
    bigram: bigram_freq_fake.get(bigram, 0) for bigram in all_bigrams
}

#REAL
# Same mapping for the real reviews.
frequency_in_real = {
    bigram: bigram_freq_real.get(bigram, 0) for bigram in all_bigrams
}

## Bigram Frequency Output Analysis:

In [None]:
# Preview the fake-review occurrence counts of the first 30 bigrams.
num_items = 30

first_few_items_in_fake = {
    bigram: count
    for bigram, count in islice(frequency_in_fake.items(), num_items)
}

print(first_few_items_in_fake)

In [None]:
# Print the first 20 bigrams that never occur in the fake reviews, together
# with their position in `frequency_in_fake`.
zero_frequency_bigrams = (
    (index, bigram)
    for index, (bigram, frequency) in enumerate(frequency_in_fake.items())
    if frequency == 0
)

count_zero_frequency = 0
for index, bigram in islice(zero_frequency_bigrams, 20):
    print(f"Index: {index}, Bigram: {bigram}")
    count_zero_frequency += 1

`frequency_in_fake` holds every bigram together with its number of occurrences in fake reviews; almost all of the bigrams with a very high index (181000+) have an occurrence count of 0.
This is because `all_bigrams` lists the fake-review bigrams first and the real-review bigrams after them, so the bigrams beyond this index are the ones that first appear in real reviews and should instead have a non-zero count in `frequency_in_real`.

In [None]:
from itertools import islice

# Preview the real-review occurrence counts of the first 30 bigrams.
num_items = 30

first_few_items_in_real = {
    bigram: count
    for bigram, count in islice(frequency_in_real.items(), num_items)
}

print(first_few_items_in_real)

## Differences:

In [None]:
# For each bigram: occurrences in fake reviews minus occurrences in real reviews.
bigram_differences = {
    bigram: fake_count - frequency_in_real.get(bigram, 0)
    for bigram, fake_count in frequency_in_fake.items()
}

num_items = 30
first_few_items_in_differences = dict(islice(bigram_differences.items(), num_items))

print(first_few_items_in_differences)

*   Positive Difference = More in Fake Reviews
*   Negative Difference = More in Real Reviews


## Observing Trends:

In [None]:
# Rank the bigrams by their fake-minus-real difference, in both directions.
difference_of = lambda pair: pair[1]

asc_bigram_differences = dict(sorted(bigram_differences.items(), key=difference_of))
dsc_bigram_differences = dict(sorted(bigram_differences.items(), key=difference_of, reverse=True))

# Show the 30 most extreme bigrams at each end.
num_items = 30
first_few_items_in_asc_differences = dict(islice(asc_bigram_differences.items(), num_items))
first_few_items_in_dsc_differences = dict(islice(dsc_bigram_differences.items(), num_items))

print(first_few_items_in_asc_differences)
print(first_few_items_in_dsc_differences)

# **Normalisation:**

## Filtering Bigrams:

In [None]:
# Continuing with those bigrams that appear at least 50 times.
# Overall frequency = occurrences in fake reviews + occurrences in real reviews.
ovr_frequency = {
    bigram: bigram_freq_fake.get(bigram, 0) + bigram_freq_real.get(bigram, 0)
    for bigram in all_bigrams
}

print(dict(islice(ovr_frequency.items(), num_items)))

In [None]:
# Number of distinct bigrams across all reviews (one key per bigram).
len(ovr_frequency)

In [None]:
# Count the bigrams whose overall frequency is at least 50.
accepted = sum(1 for frequency in ovr_frequency.values() if frequency >= 50)

print("Number of Bigrams that fit the criteria are: ",accepted)
# Cross-check: the same count computed a second way.
print(sum(val >= 50 for val in ovr_frequency.values()))


## Normalisation:

In [None]:
# Overall frequency of every bigram that appears at least 50 times.
final_bigrams_freq = {}
for bigram, frequency in ovr_frequency.items():
    if frequency >= 50:
        final_bigrams_freq[bigram] = frequency

print(dict(islice(final_bigrams_freq.items(), num_items)))
print(len(final_bigrams_freq))

In [None]:
# Fake-minus-real difference, restricted to bigrams with overall frequency >= 50.
final_bigrams_diff = {}
for bigram, frequency in ovr_frequency.items():
    if frequency < 50:
        continue
    final_bigrams_diff[bigram] = frequency_in_fake[bigram] - frequency_in_real.get(bigram, 0)

print(dict(islice(final_bigrams_diff.items(), num_items)))

*   Positive Difference = More in Fake Reviews
*   Negative Difference = More in Real Reviews

In [None]:
# Normalised score = (fake count - real count) / overall count, rounded to 2 d.p.
# NOTE(review): the rounding was flagged as something to avoid (work from a
# histogram of the unrounded values instead); it is kept here because the
# rounded scores feed the threshold filter further down — confirm before removing.
norm_bigrams = {
    bigram: round(final_bigrams_diff.get(bigram, 0) / frequency, 2)
    for bigram, frequency in final_bigrams_freq.items()
}

print(dict(islice(norm_bigrams.items(), num_items)))

## Histogram:

In [None]:
# Histogram of the normalised bigram scores (60 bins).
fig, ax = plt.subplots()
ax.hist(norm_bigrams.values(), bins = 60, align = 'mid', edgecolor = 'black')

ax.set_xlabel('Normalized Value')
ax.set_ylabel('Count')
ax.set_title('Count of Normalized Values (Bigrams)')

plt.show()

## Final Normalised Bigrams:

In [None]:
# Keep only the bigrams whose normalised score lies at or beyond either cutoff.
lower_cutoff, upper_cutoff = -0.4, 0.3

final_norm_bigrams = {}
for bigram, value in norm_bigrams.items():
    if value <= lower_cutoff or value >= upper_cutoff:
        final_norm_bigrams[bigram] = value

print(dict(islice(final_norm_bigrams.items(), num_items)))
len(final_norm_bigrams)

**(25 - 20 split at values -0.4 and 0.3)**
*   Positive Difference = More in Fake Reviews
*   Negative Difference = More in Real Reviews

# **Implementing Bigrams:**

In [None]:
# Two independent copies of the processed frame: `df` stays free of n-gram
# columns, `df_bigrams` receives the bigram count columns.
df, df_bigrams = dataframe.copy(), dataframe.copy()

In [None]:
def count_bigram_occurrences(row, desired_bigram):
    """Return how many times `desired_bigram` occurs in `row['REVIEW_TEXT']`.

    `row` is anything indexable by 'REVIEW_TEXT' (e.g. a DataFrame row); the
    text is split into bigrams with `get_bigrams`. Returns 0 when the bigram
    is absent.
    """
    text = row['REVIEW_TEXT']
    return Counter(get_bigrams(text)).get(desired_bigram, 0)

# Tokenise and count every review once, then reuse the counts for each selected
# bigram. The previous row-wise `apply` rebuilt the bigram counts of every
# review once per selected bigram.
review_bigram_counts = [Counter(get_bigrams(text)) for text in df_bigrams['REVIEW_TEXT']]

# Create one count column per selected bigram.
for bigram in final_norm_bigrams:
    df_bigrams[bigram] = [counts.get(bigram, 0) for counts in review_bigram_counts]

df_bigrams = df_bigrams.fillna(0)

In [None]:
# Lift the column display limit so no column is elided.
pd.options.display.max_columns = None

# Inspect a single review together with its bigram count columns.
row_to_inspect = 349
print(df_bigrams.iloc[row_to_inspect])

# **Model:**

## Imports:

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn import metrics

from keras.models import Sequential, Model
from keras.layers import Conv2D, Dropout, MaxPooling2D, Input
from keras.layers import BatchNormalization, Activation, Flatten, Dense
from tensorflow.keras import initializers

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import RobustScaler


In [None]:
def draw_roc_curve(y_test, y_score, title, c="blue", line_width=1):
  """Plot and show the ROC curve of `y_score` against the true labels `y_test`.

  The area under the curve is shown in the legend. `title` is the plot
  title, `c` the line colour and `line_width` the line width. The curve is
  drawn on figure number 2 and a blank line is printed afterwards.
  """
  false_positive_rates, true_positive_rates, _ = metrics.roc_curve(y_test, y_score)
  area_under_curve = metrics.auc(false_positive_rates, true_positive_rates)

  plt.figure(2)
  plt.plot(
      false_positive_rates,
      true_positive_rates,
      color=c,
      lw=line_width,
      label = 'AUC = %0.3f' % area_under_curve,
  )
  plt.title(title)
  plt.xlabel('False Positive Rates')
  plt.ylabel('True Positive Rates')
  plt.legend(loc = 'lower right')
  plt.show()
  print()

## Initialisation:

In [None]:
# Separate the target column from the predictors for each feature set.

#With Bigram Implementation
features_bg = df_bigrams.drop(columns=['LABEL_ENCODED'])
labels_bg = df_bigrams['LABEL_ENCODED']

#With Unigram Implementation
features_ug = df_unigrams.drop(columns=['LABEL_ENCODED'])
labels_ug = df_unigrams['LABEL_ENCODED']

#Without Bigram Implementation
features = df.drop(columns=['LABEL_ENCODED'])
labels = df['LABEL_ENCODED']

In [None]:
# 80/20 train/test split with a fixed seed, applied identically to all three feature sets.
split_options = {'test_size': 0.2, 'random_state': 42}

x_train_bg, x_test_bg, y_train_bg, y_test_bg = train_test_split(features_bg, labels_bg, **split_options)

x_train_ug, x_test_ug, y_train_ug, y_test_ug = train_test_split(features_ug, labels_ug, **split_options)

x_train, x_test, y_train, y_test = train_test_split(features, labels, **split_options)

In [None]:
#Vectorize training and test data
vectorizer = CountVectorizer()

x_train_bg_vectorized = vectorizer.fit_transform(x_train_bg['REVIEW_TEXT'])
x_test_bg_vectorized = vectorizer.transform(x_test_bg['REVIEW_TEXT'])

x_train_ug_vectorized = vectorizer.fit_transform(x_train_ug['REVIEW_TEXT'])
x_test_ug_vectorized = vectorizer.transform(x_test_ug['REVIEW_TEXT'])

x_train_vectorized = vectorizer.fit_transform(x_train['REVIEW_TEXT'])
x_test_vectorized = vectorizer.transform(x_test['REVIEW_TEXT'])


## Naive Bayes Model:

In [None]:
# Fit one multinomial Naive Bayes classifier per feature set and predict on
# the matching test split (`fit` returns the fitted estimator).

#With Bigram Implementation
bayes_model_bg = MultinomialNB().fit(x_train_bg_vectorized, y_train_bg)
bayes_pred_bg = bayes_model_bg.predict(x_test_bg_vectorized)

#With Unigram Implementation
bayes_model_ug = MultinomialNB().fit(x_train_ug_vectorized, y_train_ug)
bayes_pred_ug = bayes_model_ug.predict(x_test_ug_vectorized)

#Without Bigram Implementation
bayes_model = MultinomialNB().fit(x_train_vectorized, y_train)
bayes_pred = bayes_model.predict(x_test_vectorized)

In [None]:
# Metrics
#
# Accuracy and the confusion matrix are computed from the hard class
# predictions. The ROC curves are drawn from the predicted probability of
# class 1 (column 1 of `predict_proba`, given labels 0 and 1): an ROC curve
# needs a continuous score, and hard 0/1 predictions give a single operating
# point and a misleading AUC.

# With Bigram
bayes_accuracy_bg = accuracy_score(y_test_bg, bayes_pred_bg)
bayes_confusion_matrix_bg = confusion_matrix(y_test_bg, bayes_pred_bg)

print("Accuracy (Bigram): ", round((bayes_accuracy_bg*100), 3), "%")
print("Confusion Matrix (Bigram):\n", bayes_confusion_matrix_bg)

# ROC curve
bayes_scores_bg = bayes_model_bg.predict_proba(x_test_bg_vectorized)[:, 1]
draw_roc_curve(y_test_bg, bayes_scores_bg, "Naive Bayes (Bigram)", c = "blue", line_width = 2)


# With Unigram
bayes_accuracy_ug = accuracy_score(y_test_ug, bayes_pred_ug)
bayes_confusion_matrix_ug = confusion_matrix(y_test_ug, bayes_pred_ug)

print("Accuracy (Unigram): ", round((bayes_accuracy_ug*100), 3), "%")
print("Confusion Matrix (Unigram):\n", bayes_confusion_matrix_ug)

# ROC curve
bayes_scores_ug = bayes_model_ug.predict_proba(x_test_ug_vectorized)[:, 1]
draw_roc_curve(y_test_ug, bayes_scores_ug, "Naive Bayes (Unigram)", c = "blue", line_width = 2)


#Without N-grams
bayes_accuracy = accuracy_score(y_test, bayes_pred)
bayes_confusion_matrix = confusion_matrix(y_test, bayes_pred)

print("Accuracy (No-grams): ", round((bayes_accuracy*100), 3), "%")
print("Confusion Matrix (No-grams):\n", bayes_confusion_matrix)

# ROC curve
bayes_scores = bayes_model.predict_proba(x_test_vectorized)[:, 1]
draw_roc_curve(y_test, bayes_scores, "Naive Bayes (No-grams)", c = "blue", line_width = 2)

In [None]:
# Fit an RBF-kernel SVM per feature set and score it on the matching test
# split (`fit` returns the fitted estimator).

# With Bigram Implementation
svm_model_bg = SVC(kernel='rbf').fit(x_train_bg_vectorized, y_train_bg)
svm_pred_bg = svm_model_bg.predict(x_test_bg_vectorized)
svm_accuracy_bg = accuracy_score(y_test_bg, svm_pred_bg)

# With Unigram Implementation
svm_model_ug = SVC(kernel='rbf').fit(x_train_ug_vectorized, y_train_ug)
svm_pred_ug = svm_model_ug.predict(x_test_ug_vectorized)
svm_accuracy_ug = accuracy_score(y_test_ug, svm_pred_ug)

# Without Bigram Implementation
svm_model = SVC(kernel='rbf').fit(x_train_vectorized, y_train)
svm_pred = svm_model.predict(x_test_vectorized)
svm_accuracy = accuracy_score(y_test, svm_pred)

for svm_label, svm_score in (
    ("SVM Accuracy (Bigram):", svm_accuracy_bg),
    ("SVM Accuracy (Unigram):", svm_accuracy_ug),
    ("SVM Accuracy (Without Bigram):", svm_accuracy),
):
    print(svm_label, svm_score)

# **Neural Network:**

In [None]:
# Convert the vectorised count matrices to dense arrays for the network.
x_train_bg_array = x_train_bg_vectorized.toarray()
x_test_bg_array = x_test_bg_vectorized.toarray()

# With Bigram I
# Fully connected binary classifier: 256-128-128-64 ReLU units, sigmoid output.
n_features_bg = x_train_bg_array.shape[1]
model_bg = Sequential([
    Dense(256, activation='relu', input_shape=(n_features_bg,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_bg.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

model_bg.fit(x_train_bg_array, y_train_bg, epochs = 3, batch_size = 32)

#Metrics
test_metrics_bg = model_bg.evaluate(x_test_bg_array, y_test_bg)
loss_bg, accuracy_bg = test_metrics_bg
print("Test Accuracy: ", round(accuracy_bg * 100, 3), "%")


In [None]:
# Convert the vectorised count matrices to dense arrays for the network.
x_train_ug_array = x_train_ug_vectorized.toarray()
x_test_ug_array = x_test_ug_vectorized.toarray()

# With Unigram I
# Fully connected binary classifier: 256-128-128-64 ReLU units, sigmoid output.
n_features_ug = x_train_ug_array.shape[1]
model_ug = Sequential([
    Dense(256, activation='relu', input_shape=(n_features_ug,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model_ug.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

model_ug.fit(x_train_ug_array, y_train_ug, epochs = 3, batch_size = 32)

#Metrics
test_metrics_ug = model_ug.evaluate(x_test_ug_array, y_test_ug)
loss_ug, accuracy_ug = test_metrics_ug
print("Test Accuracy: ", round(accuracy_ug * 100, 3), "%")


In [None]:
# Convert the vectorised count matrices to dense arrays for the network.
x_train_array = x_train_vectorized.toarray()
x_test_array = x_test_vectorized.toarray()

#Without N-gram
# Fully connected binary classifier: 256-128-128-64 ReLU units, sigmoid output.
n_features = x_train_array.shape[1]
model = Sequential([
    Dense(256, activation='relu', input_shape=(n_features,)),
    Dense(128, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

model.fit(x_train_array, y_train, epochs = 3, batch_size = 32)

# Metrics
test_metrics = model.evaluate(x_test_array, y_test)
loss, accuracy = test_metrics
print("Test Accuracy: ", round(accuracy * 100, 3), "%")

#Check: