In [None]:
############# Prepping dataset #############

In [None]:
#Importing libraries prepping data
import pandas as pd
import numpy as np
import seaborn as sns
import re
import matplotlib.pyplot as plt
from sklearn.utils import resample

In [None]:
#Importing data
# NOTE(review): absolute, machine-specific path — the notebook only runs where
# this exact file exists; consider a configurable data directory.
data = pd.read_csv("C:/Users/Frank/Downloads/Thesis/Datasets/Reviews.csv")

In [None]:
#Removing neutral reviews (Score == 3)
value_to_remove = 3
# .copy() turns the filtered slice into an independent frame, so adding or
# overwriting columns on `data` later does not raise SettingWithCopyWarning.
data = data[data['Score'] != value_to_remove].copy()

In [None]:
#Adding Sentiment variable
def categorize_score(score):
    """Map a star rating to a sentiment label.

    Returns 'negative' for 1 or 2, 'positive' for 4 or 5 and 'Missing'
    for every other value.
    """
    if score in (1, 2):
        return 'negative'
    if score in (4, 5):
        return 'positive'
    return 'Missing'

# Label every remaining review by passing its Score through categorize_score.
data['Sentiment'] = data['Score'].apply(categorize_score)

In [None]:
#Creating a more balanced dataset
# Split the labelled reviews by sentiment.
is_positive = data['Sentiment'] == 'positive'
is_negative = data['Sentiment'] == 'negative'
positive_class = data[is_positive]
negative_class = data[is_negative]

# Draw (without replacement) as many positive reviews as there are negative ones.
undersampled_positive_class = resample(
    positive_class,
    n_samples=len(negative_class),
    replace=False,
    random_state=50,
)

# Stack both classes, then shuffle the rows and renumber the index from 0.
undersampled_data = pd.concat([undersampled_positive_class, negative_class])
shuffled_data = undersampled_data.sample(frac=1, random_state=42)
Balanced_data = shuffled_data.reset_index(drop=True)

In [None]:
#Checking Balanced dataset numbers
# Number of rows per sentiment label after undersampling.
class_counts = Balanced_data['Sentiment'].value_counts()
print(class_counts)

In [None]:
#Visualizing datasets
# Two side-by-side bar charts: class counts before (a) and after (b) balancing.
fig, (ax_original, ax_balanced) = plt.subplots(1, 2)

original_counts = data['Sentiment'].value_counts()
original_counts.plot(kind='bar', color=['blue', 'orange'], ax=ax_original)
ax_original.set_title('Original Class Distribution (a)')

balanced_counts = Balanced_data['Sentiment'].value_counts()
balanced_counts.plot(kind='bar', color=['blue', 'orange'], ax=ax_balanced)
ax_balanced.set_title('Balanced Class Distribution (b)')

# Wide horizontal gap so the two panels do not overlap.
fig.subplots_adjust(wspace=1)
plt.show()
print(Balanced_data["Sentiment"])

In [None]:
############# Cleaning text #############

In [None]:
#Cleaning & Lowercasing text

#Remove URL
def remove_urls(text):
    """Return ``text`` with http(s):// and www. URLs removed."""
    url_pattern = r'http[s]?://\S+|www\.\S+'
    return re.sub(url_pattern, '', text)

#Remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character that is neither a word character nor whitespace."""
    punctuation_pattern = r'[^\w\s]'
    return re.sub(punctuation_pattern, "", text)

def _lowercase_and_strip(text_column):
    """Lowercase a text column, then remove URLs, then remove punctuation."""
    # URLs have to be removed BEFORE punctuation: once '://' and '.' are gone
    # the URL pattern can no longer match anything.
    return text_column.str.lower().apply(remove_urls).apply(remove_punctuation)

data["Text"] = _lowercase_and_strip(data["Text"])
# Balanced_data was built from `data` before this cell runs and is an
# independent copy, so it has to be cleaned as well — it is the frame that the
# train/validation/test splits are taken from.
Balanced_data["Text"] = _lowercase_and_strip(Balanced_data["Text"])


In [None]:
#Importing stopwords
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

In [None]:
#Remove stopwords
# Build the lookup once. stopwords.words('english') returns a list; calling it
# for every single word repeats that work and makes each membership test a
# linear scan.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without English stopwords (case-insensitive match), joined by single spaces."""
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in ENGLISH_STOPWORDS]
    return ' '.join(filtered_words)

#Remove extra spaces
def remove_extra_spaces(text):
    """Collapse every whitespace run to one space and trim both ends."""
    return re.sub(r'\s+', ' ', text).strip()

# Balanced_data is an independent copy of `data` created earlier, so both frames
# are cleaned; Balanced_data is the one the models are trained on.
for _frame in (data, Balanced_data):
    _frame['Text'] = _frame['Text'].apply(remove_stopwords).apply(remove_extra_spaces)

In [None]:
############# Splitting data #############

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are the review texts, targets the sentiment labels of the balanced frame.
X = Balanced_data["Text"]
y = Balanced_data["Sentiment"]

# 70% train; the held-out 30% is then halved into validation and test parts.
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=50)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=50)

In [None]:
############# KNN - BOW - Hyperparameter tuning #############

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
import sklearn
import notebook
from sklearn.metrics import confusion_matrix

In [None]:
#Create pipeline BOW
# Bag-of-words counts feeding a k-nearest-neighbours classifier; the step names
# are the prefixes used in the parameter grid.
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', KNeighborsClassifier()) 
])

In [None]:
#Parameter grid n-gram optimization
# 6 n-gram ranges x 3 neighbour counts = 18 candidate settings.
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'classifier__n_neighbors': [9, 11, 13] 
}

In [None]:
#Grid search on validation set
# F1 is scored with "positive" as the positive class.
f1_scorer = make_scorer(f1_score, pos_label="positive")
# cv=5 cross-validation over param_grid; n_jobs=1 runs the fits one at a time.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=1, scoring=f1_scorer)
# NOTE(review): the search is cross-validated inside the validation split only;
# X_train plays no part in the tuning — confirm this is intended.
grid_search.fit(X_val, y_val)

In [None]:
#Finding optimal n-gram range
# Read the winning setting of each tuned parameter from the fitted search.
optimal_ngram = grid_search.best_params_['vectorizer__ngram_range']
optimal_n_neighbors = grid_search.best_params_['classifier__n_neighbors']
print(f"Optimal n-gram: {optimal_ngram}")
print(f"Optimal n_neighbors: {optimal_n_neighbors}")

In [None]:
#Specific F1-score
# Print the mean cross-validated F1 of every parameter combination tried.
cv_results = grid_search.cv_results_
f1_scores = cv_results['mean_test_score']
param_combinations = cv_results['params']
for params, f1 in zip(param_combinations, f1_scores):
    print(f"F1-score for {params}: {f1}")

In [None]:
############# KNN - BOW - Test set #############

In [None]:
#Creating the best BOW KNN model - test data
# The vocabulary is learned on the training split and reused unchanged for the test split.
bow_vectorizer = CountVectorizer(ngram_range=(1, 1)) # fill in the optimal n-gram range here
X_train_vectorized = bow_vectorizer.fit_transform(X_train)
X_test_vectorized = bow_vectorizer.transform(X_test)

knn_model = KNeighborsClassifier(n_neighbors=9) # fill in the optimal n_neighbors here
knn_model.fit(X_train_vectorized, y_train)
y_pred_test = knn_model.predict(X_test_vectorized)

# Per-class precision/recall/F1 on the test split.
classification_rep = classification_report(y_test, y_pred_test)
print("Classification Report:\n", classification_rep)

conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
############# KNN - TFIDF - Hyperparameter tuning #############

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
#Create pipeline TF-IDF
# TF-IDF features feeding a k-nearest-neighbours classifier.
# Rebinds `pipeline`, replacing the BOW pipeline defined earlier.
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', KNeighborsClassifier()) 
])

In [None]:
#Parameter grid n-gram optimization
# 6 n-gram ranges x 7 neighbour counts = 42 candidate settings.
param_grid = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'classifier__n_neighbors': [1, 3, 5, 7, 9, 11, 13] 
}

In [None]:
#Grid search on validation set
# F1 is scored with "positive" as the positive class.
f1_scorer = make_scorer(f1_score, pos_label="positive")
# cv=5 cross-validation over param_grid; n_jobs=1 runs the fits one at a time.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=1, scoring=f1_scorer)
# NOTE(review): tuning uses the validation split only; X_train is not involved — confirm.
grid_search.fit(X_val, y_val)

In [None]:
#Finding optimal n-gram range
# Read the winning setting of each tuned parameter from the fitted search.
optimal_ngram = grid_search.best_params_['vectorizer__ngram_range']
optimal_n_neighbors = grid_search.best_params_['classifier__n_neighbors']
print(f"Optimal n-gram: {optimal_ngram}")
print(f"Optimal n_neighbors: {optimal_n_neighbors}")

In [None]:
#Specific F1-score
# Print the mean cross-validated F1 of every parameter combination tried.
cv_results = grid_search.cv_results_
f1_scores = cv_results['mean_test_score']
param_combinations = cv_results['params']
for params, f1 in zip(param_combinations, f1_scores):
    print(f"F1-score for {params}: {f1}")

In [None]:
############# KNN - TFIDF - Test set #############

In [None]:
# Final TF-IDF KNN model: fit on the training split, evaluate on the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 3)) # fill in the optimal n-gram range here
X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

knn_model = KNeighborsClassifier(n_neighbors=13) # fill in the optimal n_neighbors here
knn_model.fit(X_train_vectorized, y_train)
y_pred_test = knn_model.predict(X_test_vectorized)

# Per-class precision/recall/F1 on the test split.
classification_rep = classification_report(y_test, y_pred_test)
print("Classification Report:\n", classification_rep)

conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
############# RF - BOW - Hyperparameter tuning #############

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Create pipeline
# Bag-of-words counts feeding a random forest. The forest is seeded (same seed
# as the data split) so the grid-search scores are reproducible between runs.
pipeline_bow_rf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', RandomForestClassifier(random_state=50))
])

In [None]:
#Parameter grid n-gram optimization
# 6 x 4 x 4 x 4 = 384 candidate settings (each is fitted once per CV fold).
param_grid_rf = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'classifier__n_estimators': [50, 100, 200,300], 
    'classifier__max_depth': [5, 10, 20,30], 
    'classifier__min_samples_split': [2, 5, 10, 15],
}

In [None]:
#Training the model on training data
# NOTE(review): no later cell shown here predicts with this fitted pipeline, and
# the grid search below is fitted on X_val — presumably this fit is redundant; confirm.
pipeline_bow_rf.fit(X_train, y_train)

In [None]:
#Grid search on validation set
# F1 is scored with "positive" as the positive class.
f1_scorer = make_scorer(f1_score, pos_label="positive")
# cv=5 cross-validation over param_grid_rf; n_jobs=1 runs the fits one at a time.
grid_search = GridSearchCV(pipeline_bow_rf, param_grid_rf, cv=5, n_jobs=1, scoring=f1_scorer)
# NOTE(review): tuning uses the validation split only; X_train is not involved — confirm.
grid_search.fit(X_val, y_val)

In [None]:
#Finding optimal n-gram range
optimal_ngram = grid_search.best_params_['vectorizer__ngram_range']
print(f"Optimal n-gram: {optimal_ngram}")

# List every tuned parameter together with its winning value.
optimal_params = grid_search.best_params_
print("Optimal parameters:")
for param, value in optimal_params.items():
    print(f"{param}: {value}")

In [None]:
#Specific F1-score (check)
# Print the mean cross-validated F1 of every parameter combination tried.
cv_results = grid_search.cv_results_
f1_scores = cv_results['mean_test_score']
param_combinations = cv_results['params']
for params, f1 in zip(param_combinations, f1_scores):
    print(f"F1-score for {params}: {f1}")

In [None]:
############# RF - BOW - Test set #############

In [None]:
# Final BOW random forest: fit on the training split, evaluate on the test split.
bow_vectorizer = CountVectorizer(ngram_range=(1, 2)) # fill in the optimal n-gram range here
X_train_vectorized = bow_vectorizer.fit_transform(X_train)
X_test_vectorized = bow_vectorizer.transform(X_test)

# Seeded so the reported test metrics can be reproduced on a re-run.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=30, min_samples_split=2, random_state=50) # fill in the optimal values here
rf_model.fit(X_train_vectorized, y_train)
y_pred_test = rf_model.predict(X_test_vectorized)

# Per-class precision/recall/F1 on the test split.
classification_rep = classification_report(y_test, y_pred_test)
print("Classification Report:\n", classification_rep)

conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
############# RF - TFIDF - Hyperparameter tuning #############

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
#Create pipeline TF-IDF
# TF-IDF features feeding a random forest. The forest is seeded (same seed as
# the data split) so the grid-search scores are reproducible between runs.
pipeline_tfidf_rf = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(random_state=50))
])

In [None]:
#Parameter grid n-gram optimization
# 6 x 4 x 4 x 4 = 384 candidate settings (each is fitted once per CV fold).
# Rebinds `param_grid_rf` with the same values as the BOW random-forest grid.
param_grid_rf = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 2), (2, 3), (3, 3)],
    'classifier__n_estimators': [50, 100, 200,300], 
    'classifier__max_depth': [5, 10, 20,30], 
    'classifier__min_samples_split': [2, 5, 10, 15],
}

In [None]:
#Training the model on training data
# NOTE(review): no later cell shown here predicts with this fitted pipeline, and
# the grid search below is fitted on X_val — presumably this fit is redundant; confirm.
pipeline_tfidf_rf.fit(X_train, y_train)

In [None]:
#Grid search on validation set
# F1 is scored with "positive" as the positive class.
f1_scorer = make_scorer(f1_score, pos_label="positive")
# cv=5 cross-validation over param_grid_rf; n_jobs=1 runs the fits one at a time.
grid_search = GridSearchCV(pipeline_tfidf_rf, param_grid_rf, cv=5, n_jobs=1, scoring=f1_scorer)
# NOTE(review): tuning uses the validation split only; X_train is not involved — confirm.
grid_search.fit(X_val, y_val)

In [None]:
#Finding optimal n-gram range
optimal_ngram = grid_search.best_params_['vectorizer__ngram_range']
print(f"Optimal n-gram: {optimal_ngram}")

# List every tuned parameter together with its winning value.
optimal_params = grid_search.best_params_
print("Optimal parameters:")
for param, value in optimal_params.items():
    print(f"{param}: {value}")

In [None]:
#Specific F1-score (check)
# Print the mean cross-validated F1 of every parameter combination tried.
cv_results = grid_search.cv_results_
f1_scores = cv_results['mean_test_score']
param_combinations = cv_results['params']
for params, f1 in zip(param_combinations, f1_scores):
    print(f"F1-score for {params}: {f1}")

In [None]:
############# RF - TFIDF - Test set #############

In [None]:
# Final TF-IDF random forest: fit on the training split, evaluate on the test split.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # fill in the optimal n-gram range here
X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Seeded so the reported test metrics can be reproduced on a re-run.
rf_model = RandomForestClassifier(n_estimators=300, max_depth=30, min_samples_split=2, random_state=50) # fill in the optimal values here
rf_model.fit(X_train_vectorized, y_train)
y_pred_test = rf_model.predict(X_test_vectorized)

# Per-class precision/recall/F1 on the test split.
classification_rep = classification_report(y_test, y_pred_test)
print("Classification Report:\n", classification_rep)

conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:\n", conf_matrix)

In [None]:
############# RNN - BOW - Hyperparameter tuning #############

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [None]:
# Build, for every training document, the list of bag-of-words counts of its
# whitespace tokens in reading order.
# `documents` is not defined by any earlier cell, so the training texts are used.
# A plain list gives positional access; X_train still carries the index labels
# of the rows it was sampled from.
documents = X_train.tolist()

# Create a CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2))

# Fit and transform the documents
count_matrix = vectorizer.fit_transform(documents)

# Get the feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# term -> column index. get_feature_names_out() returns a NumPy array, which has
# no .index() method, so the fitted vocabulary mapping is used for the lookup.
term_to_column = vectorizer.vocabulary_

# Construct the sequential Count vectors
sequential_count_vectors = []
for i, document in enumerate(documents):
    # Only the non-zero entries of row i are read; the row is never densified.
    row = count_matrix.getrow(i)
    counts_by_column = dict(zip(row.indices, row.data))
    # Tokens outside the fitted vocabulary (e.g. ones the vectorizer's own
    # tokenizer dropped) get a count of 0 instead of raising.
    sequential_count_vector = [
        counts_by_column.get(term_to_column.get(term), 0)
        for term in document.split()
    ]
    sequential_count_vectors.append(sequential_count_vector)

In [None]:
# Initialize and fit countvectorizer on text data
# Vocabulary (capped at 5000 features) is learned on the training split only.
# NOTE(review): X_train_count / X_val_count / X_test_count are not referenced by
# any later cell shown here — the RNN below is fed tokenizer sequences instead; confirm.
count_vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2)) # adjust after hyperparameter tuning
X_train_count = count_vectorizer.fit_transform(X_train)
X_val_count = count_vectorizer.transform(X_val)
X_test_count = count_vectorizer.transform(X_test)

In [None]:
# Tokenize the text data
# The word index is built from the training split only; create_model reads
# len(tokenizer.word_index) to size its embedding layer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert text data to sequences
# Each split is encoded with the tokenizer fitted on the training texts.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad sequences to a fixed length
# maxlen is also read by create_model (input_length of the embedding layer).
maxlen = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_padded = pad_sequences(X_val_seq, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_seq, maxlen=maxlen)

In [None]:
# Convert string labels to numerical labels
# positive -> 1, negative -> 0, matching the sigmoid output and pos_label=1 scorer.
y_train_numeric = y_train.map({'positive': 1, 'negative': 0})
y_val_numeric = y_val.map({'positive': 1, 'negative': 0})
y_test_numeric = y_test.map({'positive': 1, 'negative': 0})

In [None]:
# Define the Keras model function
def create_model(units=50, activation='relu', output_dim=50, learning_rate=0.001, dropout_rate=0.2):
    """Build and compile the Embedding -> SimpleRNN -> Dropout -> Dense(sigmoid) classifier.

    Reads the notebook-level ``tokenizer`` (its word index sizes the embedding)
    and ``maxlen`` (input sequence length). The model is compiled with Adam at
    ``learning_rate``, binary cross-entropy loss and the accuracy metric.
    """
    vocabulary_size = len(tokenizer.word_index) + 1
    layers = [
        Embedding(input_dim=vocabulary_size, output_dim=output_dim, input_length=maxlen),
        SimpleRNN(units=units, activation=activation),
        Dropout(rate=dropout_rate),
        Dense(units=1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Create the KerasClassifier for GridSearchCV
# Wraps create_model for scikit-learn; every fit trains 5 epochs with batch size 32.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn is, as far as I know, no
# longer shipped with recent TensorFlow releases — confirm the pinned version
# or plan a migration (e.g. to scikeras).
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=1)

In [None]:
# Define the hyperparameters and their ranges
# Keys are keyword arguments of create_model; 3 x 3 x 3 x 3 = 81 combinations.
# Rebinds `param_grid` (previously the KNN grid).
param_grid = {
    'units': [50, 100, 150],
    'output_dim': [50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1],
    'dropout_rate': [0.2, 0.5, 0.7], 
}


In [None]:
# Perform GridSearchCV
# Labels are numeric here, so the positive class is 1.
f1_scorer = make_scorer(f1_score, pos_label=1)

# cv=2 cross-validation over param_grid.
# NOTE(review): unlike the KNN/RF searches, this one tunes on the training split
# rather than the validation split — confirm which protocol is intended.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=2)
grid_result = grid.fit(X_train_padded, y_train_numeric)

In [None]:
# Check best parameters
# (A stray bare line "Best params" used to sit here and made the cell a SyntaxError.)
print("Best Parameters based on F1 score: ", grid_result.best_params_)

In [None]:
# Specific F1-score (check)
# Print the mean cross-validated F1 of every parameter combination tried.
cv_results = grid_result.cv_results_
f1_scores = cv_results['mean_test_score']
param_combinations = cv_results['params']
for params, f1 in zip(param_combinations, f1_scores):
    print(f"F1-score for {params}: {f1}")

In [None]:
############# RNN - BOW - Test #############

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Input parameters to train the model
# Hand-picked hyperparameters; every key is a keyword argument of create_model.
manual_params = dict(
    units=150,
    output_dim=200,
    learning_rate=0.001,
    dropout_rate=0.5,
)

# Build the (still untrained) network with exactly those settings.
model_test = create_model(**manual_params)

In [None]:
# Training the model
# Same epochs/batch size as the KerasClassifier used during the grid search.
model_test.fit(X_train_padded, y_train_numeric, epochs=5, batch_size=32, verbose=1)

In [None]:
# Evaluate the model on the test set
# Raw outputs of the final sigmoid unit for every test sequence.
y_pred_manual = model_test.predict(X_test_padded)

In [None]:
# Convert predicted probabilities to binary labels
# Strictly greater than 0.5 -> 1, otherwise 0.
y_pred_binary_manual = (y_pred_manual > 0.5).astype(int)

In [None]:
# Print classification report
# Labels are numeric here: 1 = positive, 0 = negative.
print("Classification Report:")
print(classification_report(y_test_numeric, y_pred_binary_manual))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_numeric, y_pred_binary_manual))

In [None]:
############# RNN - TFIDF - Hyperparameter tuning #############

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

In [None]:
# NOTE: the loop that used to live in this cell was a copy of the one in the
# next cell, but it ran before `tfidf_matrix` had been created (and while
# `vectorizer` still referred to the CountVectorizer of the BOW section), so it
# failed on a fresh kernel. The sequential TF-IDF vectors are built in the next
# cell, after the TfidfVectorizer has been fitted.

In [None]:
# Build, for every training document, the list of TF-IDF weights of its
# whitespace tokens in reading order.
# `documents` is not defined by any earlier cell, so the training texts are used.
# A plain list gives positional access; X_train still carries the index labels
# of the rows it was sampled from.
documents = X_train.tolist()

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # adjust after tuning

# Fit and transform the documents
tfidf_matrix = vectorizer.fit_transform(documents)

# Get the feature names (terms)
feature_names = vectorizer.get_feature_names_out()

# term -> column index. get_feature_names_out() returns a NumPy array, which has
# no .index() method, so the fitted vocabulary mapping is used for the lookup.
term_to_column = vectorizer.vocabulary_

# Construct the sequential TF-IDF vectors
sequential_tfidf_vectors = []
for i, document in enumerate(documents):
    # Only the non-zero entries of row i are read; the row is never densified.
    row = tfidf_matrix.getrow(i)
    weights_by_column = dict(zip(row.indices, row.data))
    # Tokens outside the fitted vocabulary (e.g. ones the vectorizer's own
    # tokenizer dropped) get a weight of 0 instead of raising.
    sequential_tfidf_vector = [
        weights_by_column.get(term_to_column.get(term), 0)
        for term in document.split()
    ]
    sequential_tfidf_vectors.append(sequential_tfidf_vector)


In [None]:
# Initialize and fit TF-IDF vectorizer on text data
# Vocabulary (capped at 5000 features) is learned on the training split only.
# NOTE(review): X_train_tfidf / X_val_tfidf / X_test_tfidf are not referenced by
# any later cell shown here — the RNN below is fed tokenizer sequences instead; confirm.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2)) # adjust the n-gram range
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Tokenize the text data
# The word index is built from the training split only; create_model reads
# len(tokenizer.word_index) to size its embedding layer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train) # adjust to the vectorization method

In [None]:
# Convert text data to sequences
# Each split is encoded with the tokenizer fitted on the training texts.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Pad sequences to a fixed length
# maxlen is also read by create_model (input_length of the embedding layer).
maxlen = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_padded = pad_sequences(X_val_seq, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_seq, maxlen=maxlen)

In [None]:
# Convert string labels to numerical labels
# positive -> 1, negative -> 0, matching the sigmoid output and pos_label=1 scorer.
y_train_numeric = y_train.map({'positive': 1, 'negative': 0})
y_val_numeric = y_val.map({'positive': 1, 'negative': 0})
y_test_numeric = y_test.map({'positive': 1, 'negative': 0})

In [None]:
# Define the Keras model function
def create_model(units=50, activation='relu', output_dim=50, learning_rate=0.001, dropout_rate=0.2):
    """Build and compile the Embedding -> SimpleRNN -> Dropout -> Dense(sigmoid) classifier.

    Reads the notebook-level ``tokenizer`` (its word index sizes the embedding)
    and ``maxlen`` (input sequence length). The model is compiled with Adam at
    ``learning_rate``, binary cross-entropy loss and the accuracy metric.
    """
    vocabulary_size = len(tokenizer.word_index) + 1
    layers = [
        Embedding(input_dim=vocabulary_size, output_dim=output_dim, input_length=maxlen),
        SimpleRNN(units=units, activation=activation),
        Dropout(rate=dropout_rate),
        Dense(units=1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [None]:
# Create the KerasClassifier for GridSearchCV
# Wraps create_model for scikit-learn; every fit trains 5 epochs with batch size 32.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn is, as far as I know, no
# longer shipped with recent TensorFlow releases — confirm the pinned version
# or plan a migration (e.g. to scikeras).
model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=32, verbose=1)

In [None]:
# Define the hyperparameters and their ranges
# Keys are keyword arguments of create_model; 3 x 3 x 3 x 3 = 81 combinations.
param_grid = {
    'units': [50, 100, 150],
    'output_dim': [50, 100, 200],
    'learning_rate': [0.001, 0.01, 0.1],
    'dropout_rate': [0.2, 0.5, 0.7], 
}


In [None]:
# Perform GridSearchCV
# Labels are numeric here, so the positive class is 1.
f1_scorer = make_scorer(f1_score, pos_label=1)

# cv=2 cross-validation over param_grid.
# NOTE(review): unlike the KNN/RF searches, this one tunes on the training split
# rather than the validation split — confirm which protocol is intended.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=f1_scorer, cv=2)
grid_result = grid.fit(X_train_padded, y_train_numeric)


In [None]:
# Check best parameters
# Winning combination according to the mean cross-validated F1 score.
print("Best Parameters based on F1 score: ", grid_result.best_params_)

In [None]:
# Specific F1-score (check)
# Print the mean cross-validated F1 of every parameter combination tried.
cv_results = grid_result.cv_results_
f1_scores = cv_results['mean_test_score']
param_combinations = cv_results['params']
for params, f1 in zip(param_combinations, f1_scores):
    print(f"F1-score for {params}: {f1}")

In [None]:
############# RNN - TFIDF - Test #############

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Input parameters to train the model
# Hand-picked hyperparameters; every key is a keyword argument of create_model.
manual_params = dict(
    units=150,
    output_dim=200,
    learning_rate=0.001,
    dropout_rate=0.5,
)

# Build the (still untrained) network with exactly those settings.
model_test = create_model(**manual_params)

In [None]:
# Training the model
# Same epochs/batch size as the KerasClassifier used during the grid search.
model_test.fit(X_train_padded, y_train_numeric, epochs=5, batch_size=32, verbose=1)

In [None]:
# Evaluate the model on the test set
# Raw outputs of the final sigmoid unit for every test sequence.
y_pred_manual = model_test.predict(X_test_padded)

In [None]:
# Convert predicted probabilities to binary labels
# Strictly greater than 0.5 -> 1, otherwise 0.
y_pred_binary_manual = (y_pred_manual > 0.5).astype(int)

In [None]:
# Print classification report
# Labels are numeric here: 1 = positive, 0 = negative.
print("Classification Report:")
print(classification_report(y_test_numeric, y_pred_binary_manual))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test_numeric, y_pred_binary_manual))