In [51]:
# loading in datasets
import pandas as pd

book_reviews = pd.read_csv('book_reviews.csv')
movie_review = pd.read_csv('IMDB Dataset.csv')
restaurant_reviews = pd.read_csv('restaurant_reviews.csv')

test_set = pd.read_csv('sentiment-topic-final-test.tsv', sep='\t')

test_set.head(5)
print(len(book_reviews))
print(len(movie_review))
print(len(restaurant_reviews))


12000
50000
19896


In [52]:
# clean dataframes of unneeded features
print(book_reviews.columns)
print(movie_review.columns)
print(restaurant_reviews.columns)

# we are only interseted in the 'review texts' so we will remove 
# all other features

book_reviews = book_reviews.drop(columns=['Unnamed: 0', 'rating', 'summary'])
movie_review = movie_review.drop(columns=['sentiment'])
restaurant_reviews = restaurant_reviews.drop(columns=['Yelp URL', 'Rating', 'Date'])

book_reviews = book_reviews.rename(columns={'reviewText': 'text'})
movie_review = movie_review.rename(columns={'review': 'text'})
restaurant_reviews = restaurant_reviews.rename(columns={'Review Text': 'text'})


Index(['Unnamed: 0', 'rating', 'reviewText', 'summary'], dtype='object')
Index(['review', 'sentiment'], dtype='object')
Index(['Yelp URL', 'Rating', 'Date', 'Review Text'], dtype='object')


In [53]:
# remove all empty cells and add a collumn with the topic for each dataset
book_reviews.dropna(inplace=True)
movie_review.dropna(inplace=True)
restaurant_reviews.dropna(inplace=True)

book_reviews['topic'] = 'book'
movie_review['topic'] = 'movie'
restaurant_reviews['topic'] = 'restaurant'

book_reviews.head(5)

Unnamed: 0,text,topic
0,This book was the very first bookmobile book I...,book
1,"When I read the description for this book, I c...",book
2,I just had to edit this review. This book is a...,book
3,I don't normally buy 'mystery' novels because ...,book
4,"This isn't the kind of book I normally read, a...",book


In [54]:
# sample points from each dataset

book_reviews_sample = book_reviews.sample(n=12000, random_state=42)
movie_review_sample = movie_review.sample(n=12000, random_state=42)
restaurant_reviews_sample = restaurant_reviews.sample(n=12000, random_state=42)

train_set = pd.concat([book_reviews_sample, movie_review_sample, restaurant_reviews_sample])

# merge datasets together
train_set = train_set.sample(frac=1, random_state=42).reset_index(drop=True)

print(len(train_set))
print((train_set))
train_set.head(5)


36000
                                                    text       topic
0      I'm grateful for one thing and one thing only ...       movie
1      This 1984 version of the Dickens' classic `A C...       movie
2      It's been nearly 30 years, and I STILL hate ev...       movie
3      Not really crazy about the ice cream. Some fla...  restaurant
4      I got this when it was a Kindle freebie, and I...        book
...                                                  ...         ...
35995  I actually saw this movie at a cinema. At the ...       movie
35996  This was a free kindle book that I got a coupl...        book
35997  Dee crafts an exciting story of time travel an...        book
35998  WTH:  there are so many inconsistencies in thi...        book
35999  The only reason I remember this movie is becau...       movie

[36000 rows x 2 columns]


Unnamed: 0,text,topic
0,I'm grateful for one thing and one thing only ...,movie
1,This 1984 version of the Dickens' classic `A C...,movie
2,"It's been nearly 30 years, and I STILL hate ev...",movie
3,Not really crazy about the ice cream. Some fla...,restaurant
4,"I got this when it was a Kindle freebie, and I...",book


In [55]:
# preprosess the datasets (also the test set)
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder

train_set['text'] = train_set['text'].str.lower()

translator = str.maketrans('', '', string.punctuation)
train_set['text'] = train_set['text'].apply(lambda x: x.translate(translator))

stop_words = set(stopwords.words('english'))
train_set['text'] = train_set['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

train_set['text'] = train_set['text'].apply(word_tokenize)

stemmer = PorterStemmer()
train_set['text'] = train_set['text'].apply(lambda x: [stemmer.stem(word) for word in x])

encoder = LabelEncoder()
train_set['topic'] = encoder.fit_transform(train_set['topic'])

mapping = {'book': 0, 'movie': 1, 'restaurant': 2}

test_set['topic'] = test_set['topic'].replace(mapping)

test_set_processed = test_set.copy() 
test_set_processed['text'] = test_set_processed['text'].str.lower()

translator = str.maketrans('', '', string.punctuation)
test_set_processed['text'] = test_set_processed['text'].apply(lambda x: x.translate(translator))

stop_words = set(stopwords.words('english'))
test_set_processed['text'] = test_set_processed['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

test_set_processed['text'] = test_set_processed['text'].apply(word_tokenize)

stemmer = PorterStemmer()
test_set_processed['text'] = test_set_processed['text'].apply(lambda x: [stemmer.stem(word) for word in x])

print(train_set)

                                                    text  topic
0      [im, grate, one, thing, one, thing, woman, tho...      1
1      [1984, version, dicken, classic, christma, car...      1
2      [nearli, 30, year, still, hate, everyon, invol...      1
3      [realli, crazi, ice, cream, flavor, cloyingli,...      2
4      [got, kindl, freebi, realli, happi, stori, gre...      0
...                                                  ...    ...
35995  [actual, saw, movi, cinema, time, work, shift,...      1
35996  [free, kindl, book, got, coupl, month, ago, re...      0
35997  [dee, craft, excit, stori, time, travel, love,...      0
35998  [wth, mani, inconsist, stori, scene, make, cri...      0
35999  [reason, rememb, movi, still, biggest, wast, t...      1

[36000 rows x 2 columns]


In [87]:
# train SVM model on training data 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# for list comprehension
def my_tokenizer(text):
    return text

vectorizer = CountVectorizer(tokenizer=my_tokenizer, lowercase=False)
X_train = vectorizer.fit_transform(train_set['text'])
y_train = train_set['topic']
X_test = vectorizer.transform(test_set['text'])
y_test = test_set['topic']

svm = SVC(C=10, kernel='rbf',gamma=0.001)

# Split the training set into a training subset and a validation subset
X_train_sub, X_val, y_train_sub, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Fit the SVM model on the training subset
svm.fit(X_train_sub, y_train_sub)

# Predict on the validation subset and evaluate performance
y_pred_val = svm.predict(X_val)
print(classification_report(y_val, y_pred_val))

# Fit the SVM model on the full training set
svm.fit(X_train, y_train)

# Predict on the test set and evaluate performance
y_pred_test = svm.predict(X_test)
print(classification_report(y_test, y_pred_test))



              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2468
           1       0.99      0.98      0.99      2369
           2       1.00      0.99      1.00      2363

    accuracy                           0.99      7200
   macro avg       0.99      0.99      0.99      7200
weighted avg       0.99      0.99      0.99      7200

              precision    recall  f1-score   support

           0       0.20      1.00      0.33         2
           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00         3

    accuracy                           0.20        10
   macro avg       0.07      0.33      0.11        10
weighted avg       0.04      0.20      0.07        10



  _warn_prf(average, modifier, msg_start, len(result))


In [88]:
test_set['pred_topic'] = y_pred_test
test_set

Unnamed: 0,sentence id,text,sentiment,topic,pred_topic
0,0,It took eight years for Warner Brothers to rec...,negative,1,0
1,1,All the New York University students love this...,positive,2,0
2,2,This Italian place is really trendy but they h...,negative,2,0
3,3,"In conclusion, my review of this book would be...",positive,0,0
4,4,The story of this movie is focused on Carl Bra...,neutral,1,0
5,5,Chris O'Donnell stated that while filming for ...,neutral,1,0
6,6,My husband and I moved to Amsterdam 6 years ag...,positive,2,0
7,7,Dame Maggie Smith performed her role excellent...,positive,1,0
8,8,The new movie by Mr. Kruno was shot in New Yor...,neutral,1,0
9,9,"I always have loved English novels, but I just...",negative,0,0


In [6]:
!pip install imblearn



In [7]:
pip install -U scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from imblearn.under_sampling import RandomUnderSampler
from scipy.sparse import vstack

def my_tokenizer(text):
    return text

# vectorize the text data
vectorizer = CountVectorizer(tokenizer=my_tokenizer, lowercase=False)
X_train = vectorizer.fit_transform(train_set['text'])
y_train = train_set['topic']
X_test = vectorizer.transform(test_set['text'])
y_test = test_set['topic']

# split the train_set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# fit LDA on the training set
lda = LatentDirichletAllocation(n_components=3, max_iter=5, learning_method='online',
                                learning_offset=50.,random_state=0)
lda.fit(X_train)
X_train_lda = lda.transform(X_train)
X_val_lda = lda.transform(X_val)

rus = RandomUnderSampler()
X_train_lda_resampled, y_train_resampled = rus.fit_resample(X_train_lda, y_train)

# concatenate X_train_lda_resampled and X_val_lda
X_lda_combined = vstack([X_train_lda_resampled, lda.transform(X_val)])

# concatenate y_train_resampled and y_val
y_combined = pd.concat([pd.Series(y_train_resampled), y_val], axis=0)

# fit SVM on LDA features
svm = SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, 
            probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, 
            max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)
svm.fit(X_lda_combined, y_combined)

# evaluate performance on the validation set
y_val_pred = svm.predict(lda.transform(X_val))
print(classification_report(y_val, y_val_pred))

y_test_pred = svm.predict(lda.transform(X_test))
print("Classification report of SVM with LDA features")
print(classification_report(y_test, y_test_pred))


              precision    recall  f1-score   support

           0       0.77      0.85      0.81      2468
           1       0.83      0.74      0.78      2369
           2       1.00      0.99      0.99      2363

    accuracy                           0.86      7200
   macro avg       0.87      0.86      0.86      7200
weighted avg       0.86      0.86      0.86      7200

Classification report of SVM with LDA features
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         5
           2       0.00      0.00      0.00         3

    accuracy                           0.50        10
   macro avg       0.17      0.33      0.22        10
weighted avg       0.25      0.50      0.33        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
test_set['pred_topic'] = y_test_pred
test_set

In [59]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# tokenize the text data
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(train_set['text'])

# convert the text data to sequences
X_train_seq = tokenizer.texts_to_sequences(train_set['text'])
X_test_seq = tokenizer.texts_to_sequences(test_set['text'])

# pad the sequences
max_len = max([len(x) for x in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# create the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_len))
model.add(LSTM(units=128))
model.add(Dense(units=3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# convert the labels to one-hot encoded vectors
y_train = pd.get_dummies(train_set['topic']).values
y_test = pd.get_dummies(test_set['topic']).values

# split the train_set into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_pad, y_train, test_size=0.2, random_state=42)

# train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=5, batch_size=64)

# evaluate performance on the validation set
y_val_pred = model.predict(X_val)
y_val_pred = np.argmax(y_val_pred, axis=1)
y_val_true = np.argmax(y_val, axis=1)
print(classification_report(y_val_true, y_val_pred))

# evaluate performance on the test set
y_test_pred = model.predict(X_test_pad)
y_test_pred = np.argmax(y_test_pred, axis=1)
y_test_true = np.argmax(y_test, axis=1)
print("Classification report of LSTM")
print(classification_report(y_test_true, y_test_pred))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      2468
           1       0.33      1.00      0.50      2369
           2       0.00      0.00      0.00      2363

    accuracy                           0.33      7200
   macro avg       0.11      0.33      0.17      7200
weighted avg       0.11      0.33      0.16      7200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification report of LSTM
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         5
           2       0.00      0.00      0.00         3

    accuracy                           0.50        10
   macro avg       0.17      0.33      0.22        10
weighted avg       0.25      0.50      0.33        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [60]:
test_set['pred_topic'] = y_test_pred
test_set

Unnamed: 0,sentence id,text,sentiment,topic,pred_topic
0,0,It took eight years for Warner Brothers to rec...,negative,1,1
1,1,All the New York University students love this...,positive,2,1
2,2,This Italian place is really trendy but they h...,negative,2,1
3,3,"In conclusion, my review of this book would be...",positive,0,1
4,4,The story of this movie is focused on Carl Bra...,neutral,1,1
5,5,Chris O'Donnell stated that while filming for ...,neutral,1,1
6,6,My husband and I moved to Amsterdam 6 years ag...,positive,2,1
7,7,Dame Maggie Smith performed her role excellent...,positive,1,1
8,8,The new movie by Mr. Kruno was shot in New Yor...,neutral,1,1
9,9,"I always have loved English novels, but I just...",negative,0,1
