Imports and nltk download

In [1]:
import nltk as nltk
nltk.download('all')
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping grammars/basque_grammars.zip.
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   U

Mount the google drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


File read

In [3]:
file_path = "/content/drive/My Drive/Final project/sentiment+labelled+sentences/sentiment labelled sentences/amazon_cells_labelled.txt"  # Update this with your file path
dfAmazon = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'sentiment'])
file_path = "/content/drive/My Drive/Final project/sentiment+labelled+sentences/sentiment labelled sentences/imdb_labelled.txt"
dfIMDB = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'sentiment'])
file_path = "/content/drive/My Drive/Final project/sentiment+labelled+sentences/sentiment labelled sentences/yelp_labelled.txt"
dfYelp = pd.read_csv(file_path, sep='\t', header=None, names=['text', 'sentiment'])
df_combined = pd.concat([dfAmazon, dfIMDB, dfYelp], ignore_index=True)

In [4]:
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

In [5]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

In [6]:
dfAmazon['text'] = dfAmazon['text'].apply(preprocess_text)
dfIMDB['text'] = dfIMDB['text'].apply(preprocess_text)
dfYelp['text'] = dfYelp['text'].apply(preprocess_text)
df_combined['text'] = df_combined['text'].apply(preprocess_text)

In [7]:
X_trainAmazon, X_testAmazon, y_trainAmazon, y_testAmazon = train_test_split(dfAmazon['text'], dfAmazon['sentiment'], test_size=0.2, random_state=42)
X_trainIMDB, X_testIMDB, y_trainIMDB, y_testIMDB = train_test_split(dfIMDB['text'], dfIMDB['sentiment'], test_size=0.2, random_state=42)
X_trainYelp, X_testYelp, y_trainYelp, y_testYelp = train_test_split(dfYelp['text'], dfYelp['sentiment'], test_size=0.2, random_state=42)
X_trainAll, X_testAll, y_trainAll, y_testAll = train_test_split(df_combined['text'], df_combined['sentiment'], test_size=0.2, random_state=42)

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidfAmazon = tfidf_vectorizer.fit_transform(X_trainAmazon)
X_test_tfidfAmazon = tfidf_vectorizer.transform(X_testAmazon)

X_train_tfidfIMDB = tfidf_vectorizer.fit_transform(X_trainIMDB)
X_test_tfidfIMDB = tfidf_vectorizer.transform(X_testIMDB)

X_train_tfidfYelp = tfidf_vectorizer.fit_transform(X_trainYelp)
X_test_tfidfYelp = tfidf_vectorizer.transform(X_testYelp)

X_train_tfidfAll = tfidf_vectorizer.fit_transform(X_trainAll)
X_test_tfidfAll = tfidf_vectorizer.transform(X_testAll)

In [9]:
clf_xgb = XGBClassifier()
clf_xgb.fit(X_train_tfidfAmazon, y_trainAmazon).fit(X_train_tfidfAmazon, y_trainAmazon)
y_predAmazon = clf_xgb.predict(X_test_tfidfAmazon)
clf_xgb.fit(X_train_tfidfIMDB, y_trainIMDB)
y_predIMDB = clf_xgb.predict(X_test_tfidfIMDB)
clf_xgb.fit(X_train_tfidfYelp, y_trainYelp)
y_predYelp = clf_xgb.predict(X_test_tfidfYelp)
clf_xgb.fit(X_train_tfidfAll, y_trainAll)
y_predAll = clf_xgb.predict(X_test_tfidfAll)

In [10]:
accuracyAmazon = accuracy_score(y_testAmazon, y_predAmazon)
print("Amazon Accuracy:", accuracyAmazon)
accuracyIMDB = accuracy_score(y_testIMDB, y_predIMDB)
print("IMDB Accuracy:", accuracyIMDB)
accuracyYelp = accuracy_score(y_testYelp, y_predYelp)
print("Yelp Accuracy:", accuracyYelp)
accuracyAll = accuracy_score(y_testAll, y_predAll)
print("All:", accuracyAll)

Amazon Accuracy: 0.7
IMDB Accuracy: 0.62
Yelp Accuracy: 0.675
All: 0.7709090909090909


In [11]:
clf = MultinomialNB()
clf.fit(X_train_tfidfAmazon, y_trainAmazon).fit(X_train_tfidfAmazon, y_trainAmazon)
y_predAmazon = clf.predict(X_test_tfidfAmazon)
clf.fit(X_train_tfidfIMDB, y_trainIMDB)
y_predIMDB = clf.predict(X_test_tfidfIMDB)
clf.fit(X_train_tfidfYelp, y_trainYelp)
y_predYelp = clf.predict(X_test_tfidfYelp)
clf.fit(X_train_tfidfAll, y_trainAll)
y_predAll = clf.predict(X_test_tfidfAll)

In [12]:
accuracyAmazon = accuracy_score(y_testAmazon, y_predAmazon)
print("Amazon Accuracy:", accuracyAmazon)
accuracyIMDB = accuracy_score(y_testIMDB, y_predIMDB)
print("IMDB Accuracy:", accuracyIMDB)
accuracyYelp = accuracy_score(y_testYelp, y_predYelp)
print("Yelp Accuracy:", accuracyYelp)
accuracyAll = accuracy_score(y_testAll, y_predAll)
print("All:", accuracyAll)

Amazon Accuracy: 0.79
IMDB Accuracy: 0.7466666666666667
Yelp Accuracy: 0.765
All: 0.7654545454545455


In [13]:
clf_rf = RandomForestClassifier()
clf_rf.fit(X_train_tfidfAmazon, y_trainAmazon).fit(X_train_tfidfAmazon, y_trainAmazon)
y_predAmazon = clf_rf.predict(X_test_tfidfAmazon)
clf_rf.fit(X_train_tfidfIMDB, y_trainIMDB)
y_predIMDB = clf_rf.predict(X_test_tfidfIMDB)
clf_rf.fit(X_train_tfidfYelp, y_trainYelp)
y_predYelp = clf_rf.predict(X_test_tfidfYelp)
clf_rf.fit(X_train_tfidfAll, y_trainAll)
y_predAll = clf_rf.predict(X_test_tfidfAll)

In [14]:
accuracyAmazon = accuracy_score(y_testAmazon, y_predAmazon)
print("Amazon Accuracy:", accuracyAmazon)
accuracyIMDB = accuracy_score(y_testIMDB, y_predIMDB)
print("IMDB Accuracy:", accuracyIMDB)
accuracyYelp = accuracy_score(y_testYelp, y_predYelp)
print("Yelp Accuracy:", accuracyYelp)
accuracyAll = accuracy_score(y_testAll, y_predAll)
print("All:", accuracyAll)

Amazon Accuracy: 0.74
IMDB Accuracy: 0.7133333333333334
Yelp Accuracy: 0.755
All: 0.7672727272727272


In [15]:
clf_log = LogisticRegression()
clf_log.fit(X_train_tfidfAmazon, y_trainAmazon).fit(X_train_tfidfAmazon, y_trainAmazon)
y_predAmazon = clf_log.predict(X_test_tfidfAmazon)
clf_log.fit(X_train_tfidfIMDB, y_trainIMDB)
y_predIMDB = clf_log.predict(X_test_tfidfIMDB)
clf_log.fit(X_train_tfidfYelp, y_trainYelp)
y_predYelp = clf_log.predict(X_test_tfidfYelp)
clf_log.fit(X_train_tfidfAll, y_trainAll)
y_predAll = clf_log.predict(X_test_tfidfAll)

In [16]:
accuracyAmazon = accuracy_score(y_testAmazon, y_predAmazon)
print("Amazon Accuracy:", accuracyAmazon)
accuracyIMDB = accuracy_score(y_testIMDB, y_predIMDB)
print("IMDB Accuracy:", accuracyIMDB)
accuracyYelp = accuracy_score(y_testYelp, y_predYelp)
print("Yelp Accuracy:", accuracyYelp)
accuracyAll = accuracy_score(y_testAll, y_predAll)
print("All:", accuracyAll)

Amazon Accuracy: 0.77
IMDB Accuracy: 0.76
Yelp Accuracy: 0.755
All: 0.7727272727272727


In [17]:
MAX_NB_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100

In [18]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(X_trainAll)
X_train_seq = tokenizer.texts_to_sequences(X_trainAll)
X_test_seq = tokenizer.texts_to_sequences(X_testAll)

In [19]:
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, 128, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_pad, y_trainAll, epochs=5, batch_size=64, validation_data=(X_test_pad, y_testAll))
loss, accuracy = model.evaluate(X_test_pad, y_testAll)
print("Test Accuracy:", accuracy)

Epoch 1/5