# In [None]:
####################Naive Bayes

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
import time
import re
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Read the dataset; the path is resolved relative to the working directory.
# NOTE(review): this section expects a header row providing 'text' and
# 'sentiment' columns, whereas the CNN section further down reads the same
# path with header=None and positional columns 0/5 -- confirm which layout
# the file really has.
data = pd.read_csv("training.300000.processed.noemoticon.csv")

# Download the stopwords only if they are not installed yet, so that repeated
# runs neither need network access nor re-check the NLTK index.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Load the stopwords into a set (fast membership tests in remove_stopwords)
stop_words = set(stopwords.words('english'))

# Define function to remove HTML tags
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    Args:
        text: The string to strip.

    Returns:
        A new string in which each substring running from a ``<`` to the
        next ``>`` (inclusive) has been deleted.
    """
    # ``[^>]*`` also matches newlines (``.`` does not), so a tag whose
    # attributes wrap onto a following line is still removed as a whole.
    return re.sub(r'<[^>]*>', '', text)

# Define function to remove non-alphabet characters and convert to lowercase
def clean_text(text):
    """Blank out non-letters and lowercase what remains.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every character outside ``a-z``/``A-Z`` replaced by a
        single space (one space per replaced character), converted to
        lowercase.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

# Define function to remove stop words
def remove_stopwords(text):
    """Filter out the items of ``text`` that are in ``stop_words``.

    Args:
        text: An iterable of tokens. In this script it is the list of words
            produced by ``str.split()``, not a raw string.

    Returns:
        A list holding, in their original order, the items of ``text`` that
        are not members of the module-level ``stop_words`` set.
    """
    return list(filter(lambda token: token not in stop_words, text))

# Apply preprocessing steps
data['text'] = data['text'].apply(remove_html_tags)
data['text'] = data['text'].apply(clean_text)
data['text'] = data['text'].str.split()
data['text'] = data['text'].apply(remove_stopwords)

# Print preprocessed data (each row is a list of tokens at this point)
print(data['text'])

# Rejoin preprocessed text data into a single string
data['text'] = data['text'].apply(lambda x: ' '.join(x))

# Divide Features and Labels
X = data["text"]
y = data["sentiment"]

# Divide training set and test set BEFORE vectorizing, so the vocabulary is
# learned from the training documents only and nothing about the held-out
# test set leaks into the features the classifier is trained on.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert to vector: fit on the training text, then reuse that fitted
# vocabulary to transform the test text.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Define a Naive Bayes Classifier
classifier_NaiveBayes = MultinomialNB()

# Define the hyperparameter space
param_grid_NaiveBayes = {
    'alpha': [0.1, 0.5, 1, 2, 5, 10],
    'fit_prior': [True, False]
}

# Perform hyperparameter search and cross-validation
grid_search = GridSearchCV(classifier_NaiveBayes, param_grid_NaiveBayes, cv=5)

# Get program start time (only the grid search itself is timed)
start_time = time.time()

grid_search.fit(X_train, y_train)

# Get program end time
end_time = time.time()

# Calculate program execution time
execution_time = end_time - start_time

# Output the best parameter combination and run time
print("Best Hyperparameters:", grid_search.best_params_)
print("execution time:", execution_time, "s")

# predict
y_pred = grid_search.predict(X_test)

# Calculate accuracy, precision, recall, confusion matrix.
# The *_nb names are read again by the results table at the end of the file.
accuracy_nb = accuracy_score(y_test, y_pred)
precision_nb = precision_score(y_test, y_pred, average='weighted')
recall_nb = recall_score(y_test, y_pred, average='weighted')
confusion_mat = confusion_matrix(y_test, y_pred)

# output result
print("Accuracy: ", accuracy_nb)
print("Precision: ", precision_nb)
print("Recall: ", recall_nb)
print("Confusion Matrix:")
print(confusion_mat)

####################KNN

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import time
import re
import nltk
from nltk.corpus import stopwords
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD

# Re-read the dataset so this section starts from the raw, unprocessed text.
# NOTE(review): this section expects a header row providing 'text' and
# 'sentiment' columns, whereas the CNN section further down reads the same
# path with header=None and positional columns 0/5 -- confirm which layout
# the file really has.
data = pd.read_csv("training.300000.processed.noemoticon.csv")

# Download the stopwords only if they are not installed yet, so that repeated
# runs neither need network access nor re-check the NLTK index.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Load the stopwords into a set (fast membership tests in remove_stopwords)
stop_words = set(stopwords.words('english'))

# Define function to remove HTML tags
def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` span removed.

    Args:
        text: The string to strip.

    Returns:
        A new string in which each substring running from a ``<`` to the
        next ``>`` (inclusive) has been deleted.
    """
    # ``[^>]*`` also matches newlines (``.`` does not), so a tag whose
    # attributes wrap onto a following line is still removed as a whole.
    return re.sub(r'<[^>]*>', '', text)

# Define function to remove non-alphabet characters and convert to lowercase
def clean_text(text):
    """Blank out non-letters and lowercase what remains.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every character outside ``a-z``/``A-Z`` replaced by a
        single space (one space per replaced character), converted to
        lowercase.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    return letters_only.lower()

# Define function to remove stop words
def remove_stopwords(text):
    """Filter out the items of ``text`` that are in ``stop_words``.

    Args:
        text: An iterable of tokens. In this script it is the list of words
            produced by ``str.split()``, not a raw string.

    Returns:
        A list holding, in their original order, the items of ``text`` that
        are not members of the module-level ``stop_words`` set.
    """
    return list(filter(lambda token: token not in stop_words, text))

# Apply preprocessing steps
data['text'] = data['text'].apply(remove_html_tags)
data['text'] = data['text'].apply(clean_text)
data['text'] = data['text'].str.split()
data['text'] = data['text'].apply(remove_stopwords)

# Rejoin preprocessed text data into a single string
data['text'] = data['text'].apply(lambda x: ' '.join(x))

# Divide Features and Labels
X = data["text"]
y = data["sentiment"]

# Divide training set and test set BEFORE any fitted transformation, so that
# neither the vocabulary nor the SVD components are learned from test rows.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert text data to vector representation (vocabulary from training text)
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train_text)
X_test_counts = vectorizer.transform(X_test_text)

# Dimensionality reduction: fit on the training matrix only and project the
# test matrix with the same components. random_state pins the randomized
# solver so repeated runs give the same reduced features.
svd = TruncatedSVD(n_components=50, random_state=42)
X_train = svd.fit_transform(X_train_counts)
X_test = svd.transform(X_test_counts)

# Define a KNN classifier
knn = KNeighborsClassifier()

# Define the hyperparameter space
param_grid = {'n_neighbors': [3, 5, 7]}  # Define the parameter grid for grid search

# Perform hyperparameter search and cross-validation
grid_search = GridSearchCV(knn, param_grid, cv=5)

# Get program start time (only the grid search itself is timed)
start_time = time.time()

grid_search.fit(X_train, y_train)

# Get program end time
end_time = time.time()

# Calculate program execution time
execution_time = end_time - start_time

# Output the best combination of hyperparameters and program execution time
print("Best Hyperparameters:", grid_search.best_params_)
print("execution time:", execution_time, "s")

# predict
y_pred = grid_search.predict(X_test)

# Calculate accuracy, precision, recall, confusion matrix.
# The *_knn names are read again by the results table at the end of the file.
accuracy_knn = accuracy_score(y_test, y_pred)
precision_knn = precision_score(y_test, y_pred, average='weighted')
recall_knn = recall_score(y_test, y_pred, average='weighted')
confusion_mat = confusion_matrix(y_test, y_pred)

# output result
print("Accuracy: ", accuracy_knn)
print("Precision: ", precision_knn)
print("Recall: ", recall_knn)
print("Confusion Matrix:")
print(confusion_mat)

####################CNN

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import time

# Load the CSV without a header row, decoding it as latin-1.
# NOTE(review): the Naive Bayes and KNN sections above read this same path
# with a header row and named 'text'/'sentiment' columns -- confirm which
# layout the file really has.
data = pd.read_csv(
    "training.300000.processed.noemoticon.csv",
    encoding="latin-1",
    header=None,
)

# Pull the text (column 5) and sentiment (column 0) out as arrays
text = data[5].values
sentiment = data[0].values

# Collapse the labels to two classes: 0 stays 0 (negative), every other
# value becomes 1 (positive)
sentiment = np.where(sentiment == 0, 0, 1)

# Hold out 20% of the rows for testing
train_text, test_text, train_sentiment, test_sentiment = train_test_split(
    text,
    sentiment,
    test_size=0.2,
    random_state=42,
)

# Fit the tokenizer on the training text only
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_text)

# Turn each text into a sequence of integer word indices
train_sequences = tokenizer.texts_to_sequences(train_text)
test_sequences = tokenizer.texts_to_sequences(test_text)

# Bring every sequence to the same length; max_length is also read by
# create_model for the Embedding layer
max_length = 100
train_data = pad_sequences(train_sequences, maxlen=max_length)
test_data = pad_sequences(test_sequences, maxlen=max_length)

# Hyperparameter grid; the keys match the parameters of create_model
param_grid = dict(
    filters=[64, 128],
    units=[32, 64],
)

# Create the model to be tuned
def create_model(filters, units):
    """Build and compile the 1-D convolutional text classifier.

    Args:
        filters: Number of filters for the ``Conv1D`` layer.
        units: Number of units in the hidden ``Dense`` layer.

    Returns:
        A compiled ``Sequential`` model: embedding, convolution, global max
        pooling, a hidden dense layer and a 2-way softmax output.
    """
    layer_stack = [
        # input_dim is hard-coded to 10000; input_length comes from the
        # module-level max_length
        Embedding(input_dim=10000, output_dim=100, input_length=max_length),
        Conv1D(filters=filters, kernel_size=5, activation='relu'),
        GlobalMaxPooling1D(),
        Dense(units=units, activation='relu'),
        Dense(units=2, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Wrap the Keras model builder so scikit-learn can drive it.
# NOTE(review): tensorflow.keras.wrappers.scikit_learn appears to be absent
# from recent TensorFlow releases -- confirm the installed version provides it.
model = KerasClassifier(build_fn=create_model)

# Grid search over param_grid with 3-fold cross-validation, scored by accuracy
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=3)

# Time the grid search only
start_time = time.time()
grid_result = grid_search.fit(train_data, train_sentiment)
elapsed_time = time.time() - start_time

# Report the winning hyperparameters and how long the search took
print("Best Hyperparameters: ", grid_result.best_params_)
print("Elapsed Time:", elapsed_time, "seconds")

# Underlying Keras model of the refitted best estimator
best_model = grid_result.best_estimator_.model

# Predict on the test set: take the index of the highest-scoring class
class_scores = best_model.predict(test_data)
test_predictions = np.argmax(class_scores, axis=-1)

# Score the predictions against the held-out labels
accuracy_cnn = accuracy_score(test_sentiment, test_predictions)
precision_cnn = precision_score(test_sentiment, test_predictions, average='weighted')
recall_cnn = recall_score(test_sentiment, test_predictions, average='weighted')
confusion_mat = confusion_matrix(test_sentiment, test_predictions)

# output result
print("Test Accuracy:", accuracy_cnn)
print("Precision:", precision_cnn)
print("Recall:", recall_cnn)
print("Confusion Matrix:")
print(confusion_mat)


# Collect the metrics of all three models; the *_nb and *_knn values come
# from the Naive Bayes and KNN sections earlier in the file
results = {
    'Model': ['Naive Bayes', 'KNN', 'CNN'],
    'Accuracy': [accuracy_nb, accuracy_knn, accuracy_cnn],
    'Precision': [precision_nb, precision_knn, precision_cnn],
    'Recall': [recall_nb, recall_knn, recall_cnn],
}

# Tabulate the results with the model name as the row index
df_results = pd.DataFrame(results).set_index('Model')

# Display the DataFrame
print(df_results)
