In [None]:
# Import libraries
import pandas as pd
import numpy as np
import tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import TweetTokenizer

In [None]:
# Google Colab setup: authenticate, download the dataset by its Drive file id,
# then mount Google Drive. This cell only works inside a Colab runtime.
# Colab / PyDrive helpers used for authentication and Drive access
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the application-default credentials to
# PyDrive so that `drive_access` can act on the user's behalf.
auth.authenticate_user()
google_authentication = GoogleAuth()
google_authentication.credentials = GoogleCredentials.get_application_default()
drive_access = GoogleDrive(google_authentication)

dataset_file_shared_link = "1bELRhCpbqx8WwV-WZJJgv8NMmEPZNLzr" # Google Drive file id of the dataset (taken from its shared link)
dataset_filename = "training.300000.processed.noemoticon.csv" # local file name the dataset is saved under
access_file = drive_access.CreateFile({'id':dataset_file_shared_link})
access_file.GetContentFile(dataset_filename)

# Colab's Drive integration, used below to mount Google Drive into the runtime
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Download the NLTK corpora the preprocessing cell relies on:
# 'stopwords' backs stopwords.words('english') and 'wordnet' backs WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Load the dataset
# The file is read without a header row, so the six column names are supplied
# explicitly; skiprows=1 drops the first line of the file.
# on_bad_lines='skip' replaces error_bad_lines=False: that flag has been
# deprecated since pandas 1.3 (it is the source of the warning this cell used
# to emit) and no longer exists in pandas 2.0. Malformed rows are still dropped
# instead of aborting the load.
data = pd.read_csv(
    'training.300000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['sentiment', 'id', 'date', 'query', 'username', 'text'],
    on_bad_lines='skip',
    skiprows=1,
)



  data = pd.read_csv('training.300000.processed.noemoticon.csv',


In [None]:
# Preprocess the text

# Shared helpers for preprocessing. They are built once here rather than per
# token: stopwords.words('english') re-reads the corpus list on every call and
# a list membership test is linear, so doing it inside the comprehension made
# preprocessing orders of magnitude slower than necessary.
tweet_tokenizer = TweetTokenizer()
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
  """Normalise one tweet into a space-separated string of lemmatized words.

  Steps: lowercase, drop URLs and @mentions, replace every run of characters
  other than a-z (punctuation, digits, non-ASCII) with a space, tokenize,
  remove English stopwords, lemmatize the remaining tokens.

  Args:
    text: raw tweet text. Non-string values (e.g. NaN from a missing cell)
      are treated as an empty tweet.

  Returns:
    The cleaned tweet as a single string; '' when nothing survives.
  """
  if not isinstance(text, str):
    return ''  # missing values would otherwise crash on .lower()

  text = text.lower()
  # URLs go first so that an '@' inside a URL is not treated as a mention.
  # \S+ consumes the whole URL, including '?', '=', '-', '_' and '%'.
  text = re.sub(r'https?://\S+', ' ', text)
  # \w covers the underscores allowed in handles, so '@user_name' is removed whole
  text = re.sub(r'@\w+', ' ', text)
  # Keep letters only: one pass replaces punctuation, digits and any other
  # non a-z characters with a space (the text is already lowercase).
  text = re.sub(r'[^a-z]+', ' ', text).strip()

  tokens = tweet_tokenizer.tokenize(text)
  # Stopwords are filtered on the raw token, then survivors are lemmatized
  lemmas = [_LEMMATIZER.lemmatize(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
  return ' '.join(lemmas)

data["text"] = data["text"].apply(preprocess_text) # Overwrite the raw tweet text with its preprocessed form, row by row

# Save the cleaned data as 'cleaned_data.csv' without the index column.
# NOTE: the path is relative, so the file is written to the current working
# directory, not to the Drive mounted at /content/drive (unless the working
# directory has been changed to a folder inside it).
data.to_csv('cleaned_data.csv', index=False)

In [None]:
# Hold out 20% of the rows for testing. The cleaned tweet text is the feature,
# the sentiment label is the target, and random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'], data['sentiment'], test_size=0.2, random_state=42
)

# TF-IDF features: the vectorizer is fitted on the training tweets only and the
# fitted vectorizer is then reused to encode the held-out tweets.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Naive Bayes
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features and
# predict the sentiment of the held-out tweets.
naive_bayes_classifier = MultinomialNB()
y_pred_NB = naive_bayes_classifier.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report the Naive Bayes scores on the test split; precision and recall use
# the 'weighted' average across classes.
print("Naive Bayes: ")
for metric_label, metric_value in (
  ("Accuracy:", accuracy_score(y_test, y_pred_NB)),
  ("Precision:", precision_score(y_test, y_pred_NB, average='weighted')),
  ("Recall:", recall_score(y_test, y_pred_NB, average='weighted')),
):
  print(metric_label, metric_value)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_NB))

Naive Bayes: 
Accuracy: 0.7558666666666667
Precision: 0.7559988598339187
Recall: 0.7558666666666667

Confusion Matrix:
 [[22833  7017]
 [ 7631 22519]]


In [None]:
# Support Vector Machines (SVM)
# Linear-kernel SVC with regularization parameter C=1, fitted on the TF-IDF
# training features and used to predict the held-out tweets.
# NOTE(review): SVC training time grows quickly with the number of samples, so
# this cell is likely the slowest classical model here — confirm it is acceptable.
svm = SVC(kernel='linear', C=1)
y_pred_svm = svm.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Report the SVM scores on the test split; precision and recall use the
# 'weighted' average across classes.
print("Support Vector Machines:")
for metric_label, metric_value in (
  ("Accuracy:", accuracy_score(y_test, y_pred_svm)),
  ("Precision:", precision_score(y_test, y_pred_svm, average='weighted')),
  ("Recall:", recall_score(y_test, y_pred_svm, average='weighted')),
):
  print(metric_label, metric_value)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))

Support Vector Machines:
Accuracy: 0.7689
Precision: 0.7692272003503053
Recall: 0.7689

Confusion Matrix:
 [[22361  7489]
 [ 6377 23773]]


In [None]:
# LSTM
# Prepare the data for the LSTM
max_fatures = 2000 # Vocabulary size: only the most frequent words are kept (name left as-is in case other cells use it)
tokenizer = Tokenizer(num_words=max_fatures, split=' ') # Word-level tokenizer capped at max_fatures words
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. including the
# rows that later become the test split — confirm this is intended.
tokenizer.fit_on_texts(data['text'].values) # Build the word index from the cleaned tweets
X = tokenizer.texts_to_sequences(data['text'].values) # Map each tweet to a list of word indices
X = pad_sequences(X, maxlen=28) # Pad/truncate every sequence to exactly 28 tokens
# One-hot encode the sentiment labels. The dtype is requested explicitly because
# pandas >= 2.0 returns boolean dummy columns by default, whereas the
# categorical cross-entropy loss below expects numeric targets.
Y = pd.get_dummies(data['sentiment'], dtype='float32').values

# Split LSTM data into training and test sets (80/20), with a random state of 42
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X, Y,
                                                                        test_size=0.2,
                                                                        random_state=42)

# LSTM model
lstm_model = Sequential() # Create a sequential model
lstm_model.add(Embedding(max_fatures, 128, input_length=X.shape[1])) # 128-dimensional embedding for each of the max_fatures word indices
lstm_model.add(SpatialDropout1D(0.4)) # Drop entire embedding channels with rate 0.4
lstm_model.add(LSTM(196, dropout=0.2, recurrent_dropout=0.2)) # LSTM layer with 196 units
lstm_model.add(Dense(2, activation='softmax')) # One output per one-hot sentiment column
lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) # Compile the model with loss, optimizer, and metrics
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
lstm_model.summary()

# Train the LSTM model
lstm_model.fit(X_train_lstm, y_train_lstm, epochs=7, batch_size=32, verbose=2)

# Evaluate the LSTM model
# argmax turns both the predicted probabilities and the one-hot targets into
# class indices (column positions), so the two arrays are directly comparable.
y_pred_lstm = lstm_model.predict(X_test_lstm)
y_pred_lstm = np.argmax(y_pred_lstm, axis=1)
y_test_lstm = np.argmax(y_test_lstm, axis=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 28, 128)           256000    
                                                                 
 spatial_dropout1d (SpatialD  (None, 28, 128)          0         
 ropout1D)                                                       
                                                                 
 lstm (LSTM)                 (None, 196)               254800    
                                                                 
 dense (Dense)               (None, 2)                 394       
                                                                 
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
7500/7500 - 691s - loss: 0.5106 - accuracy: 0.7458 - 691s/epoch - 92ms/step
Epoch 2/7
7500/7500 - 694

In [None]:
# Print LSTM results
# Scores are computed on the class indices produced by the evaluation step;
# precision and recall use the 'weighted' average across classes.
print("LSTM:")
for metric_label, metric_value in (
  ("Accuracy:", accuracy_score(y_test_lstm, y_pred_lstm)),
  ("Precision:", precision_score(y_test_lstm, y_pred_lstm, average='weighted')),
  ("Recall:", recall_score(y_test_lstm, y_pred_lstm, average='weighted')),
):
  print(metric_label, metric_value)
print("\nConfusion Matrix:\n", confusion_matrix(y_test_lstm, y_pred_lstm))

LSTM:
Accuracy: 0.7628833333333334
Precision: 0.7629909340658617
Recall: 0.7628833333333334

Confusion Matrix:
 [[23008  6842]
 [ 7385 22765]]
