In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Load and preprocess dataset
df = pd.read_csv('fake reviews dataset.csv')
# df.drop('Unnamed: 0', axis=1, inplace=True)
# Remove every row that has a missing value in any column.
df.dropna(inplace=True)

# Add text length feature (number of characters in each review)
df['length'] = df['text_'].apply(len)

# Exploratory data analysis: overall distribution of review lengths
plt.hist(df['length'], bins=50)
plt.show()

# A bare expression is only echoed when it is the last line of a notebook
# cell; in the middle of a script its result is discarded, so print it.
print(df.groupby('label').describe())

# Review-length distribution, one histogram per label
df.hist(column='length', by='label', bins=50, color='blue', figsize=(12,5))
plt.show()

# Display longest original review: look up the row with the maximum length
# directly instead of sorting the whole subset just to take its first row.
original_reviews = df[df['label'] == 'OR']
longest_review = original_reviews.loc[original_reviews['length'].idxmax(), 'text_']
print(longest_review)

print(df['length'].describe())

# Text preprocessing function
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache so the stop-word corpus is loaded at most once.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def text_process(review):
    """Tokenize a review into its non-stop-word tokens.

    Punctuation characters (``string.punctuation``) are removed, the text is
    split on whitespace, and tokens whose lowercase form is an English stop
    word are dropped. The returned tokens keep their original casing.

    Args:
        review: The raw review text.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    tokens = review.translate(_PUNCTUATION_TABLE).split()
    if not tokens:
        return []
    # Look the stop words up once per call and test membership against a set,
    # rather than rebuilding the stop-word list for every single token.
    stop_words = _english_stopwords()
    return [word for word in tokens if word.lower() not in stop_words]

# Bag of Words transformation
bow_transformer = CountVectorizer(analyzer=text_process)
# fit_transform learns the vocabulary and builds the document-term matrix in
# a single pass; calling fit and then transform runs the analyzer over the
# whole corpus twice.
bow_reviews = bow_transformer.fit_transform(df['text_'])
print("Total Vocabulary:", len(bow_transformer.vocabulary_))

# Example transformation
# Positional access: after dropna() the index may have gaps, so a label
# lookup of 3 could raise KeyError.
review4 = df['text_'].iloc[3]
bow_msg4 = bow_transformer.transform([review4])
print(bow_msg4)
print(bow_msg4.shape)

# Display feature names using updated method
feature_names = bow_transformer.get_feature_names_out()
# These indices are hard-coded, so guard against a vocabulary that is too
# small to contain them.
for feature_index in (15841, 23848):
    if feature_index < len(feature_names):
        print(feature_names[feature_index])
    else:
        print("No feature at index", feature_index)

# Summary of the bag-of-words matrix for all reviews
print("Shape of Bag of Words Transformer for the entire reviews corpus:", bow_reviews.shape)
print("Amount of non zero values in the bag of words model:", bow_reviews.nnz)
# NOTE(review): this value is the percentage of NON-zero cells (density),
# although it is printed under the label "Sparsity" - confirm which is wanted.
print("Sparsity:", np.round((bow_reviews.nnz/(bow_reviews.shape[0]*bow_reviews.shape[1]))*100, 2))

# TF-IDF transformation
tfidf_transformer = TfidfTransformer().fit(bow_reviews)
tfidf_rev4 = tfidf_transformer.transform(bow_msg4)
print(tfidf_rev4)

# Display IDF values (skipping terms that are not in the learned vocabulary)
for term in ('mango', 'book'):
    term_index = bow_transformer.vocabulary_.get(term)
    if term_index is None:
        print("Term not in vocabulary:", term)
    else:
        print(tfidf_transformer.idf_[term_index])

tfidf_reviews = tfidf_transformer.transform(bow_reviews)
print("Shape:", tfidf_reviews.shape)
print("No. of Dimensions:", tfidf_reviews.ndim)

# Split data into training and testing sets
review_train, review_test, label_train, label_test = train_test_split(
    df['text_'], df['label'], test_size=0.35, random_state=42
)


def _format_accuracy(accuracy):
    """Format an accuracy fraction as a percentage string rounded to 2 places."""
    return str(np.round(accuracy * 100, 2)) + '%'


def _fit_and_report(title, classifier):
    """Train a bag-of-words -> TF-IDF -> classifier pipeline and print its metrics.

    Uses the module-level ``review_train`` / ``label_train`` split for fitting
    and ``review_test`` / ``label_test`` for evaluation.

    Args:
        title: Model name printed in the results header.
        classifier: Unfitted scikit-learn estimator used as the final step.

    Returns:
        A ``(pipeline, predictions, accuracy)`` tuple: the fitted pipeline,
        its predictions for ``review_test``, and the test accuracy.
    """
    pipeline = Pipeline([
        ('bow', CountVectorizer(analyzer=text_process)),
        ('tfidf', TfidfTransformer()),
        ('classifier', classifier),
    ])
    pipeline.fit(review_train, label_train)
    predictions = pipeline.predict(review_test)
    # Computed once and reused for both accuracy lines and the final summary.
    accuracy = accuracy_score(label_test, predictions)

    print(title + ' Results:')
    print('Classification Report:', classification_report(label_test, predictions))
    print('Confusion Matrix:', confusion_matrix(label_test, predictions))
    print('Accuracy Score:', accuracy)
    print('Model Prediction Accuracy:', _format_accuracy(accuracy))
    print('\n')
    return pipeline, predictions, accuracy


# Multinomial Naive Bayes Pipeline
pipeline_nb, predictions_nb, accuracy_nb = _fit_and_report(
    'Multinomial Naive Bayes', MultinomialNB())

# Random Forest Classifier Pipeline
pipeline_rf, predictions_rf, accuracy_rf = _fit_and_report(
    'Random Forest Classifier', RandomForestClassifier(random_state=42))

# Decision Tree Classifier Pipeline
pipeline_dt, predictions_dt, accuracy_dt = _fit_and_report(
    'Decision Tree Classifier', DecisionTreeClassifier(random_state=42))

# K-Nearest Neighbors Pipeline
pipeline_knn, predictions_knn, accuracy_knn = _fit_and_report(
    'K Nearest Neighbors Classifier', KNeighborsClassifier(n_neighbors=5))

# Support Vector Machine Pipeline
pipeline_svc, predictions_svc, accuracy_svc = _fit_and_report(
    'Support Vector Machines', SVC(random_state=42))

# Logistic Regression Pipeline
pipeline_lr, predictions_lr, accuracy_lr = _fit_and_report(
    'Logistic Regression', LogisticRegression(random_state=42))

# Conclusion
print('Performance of various ML models:')
print('\n')
# The summary uses its own labels and ordering, distinct from the headers above.
for model_label, model_accuracy in (
    ('Logistic Regression', accuracy_lr),
    ('K Nearest Neighbors', accuracy_knn),
    ('Decision Tree Classifier', accuracy_dt),
    ('Random Forests Classifier', accuracy_rf),
    ('Support Vector Machines', accuracy_svc),
    ('Multinomial Naive Bayes', accuracy_nb),
):
    print(model_label + ' Prediction Accuracy:', _format_accuracy(model_accuracy))