In [1]:
from google.colab import drive 

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!ls "/content/drive/MyDrive/AIAssignment2/imdb"

test.json  train.json  valid.json


In [4]:
import nltk
import spacy
import pandas as pd

nltk.download('stopwords') 
path_dir = '/content/drive/MyDrive/AIAssignment2/imdb'

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
df_reviews_train = pd.read_json(path_dir + "/train.json")

In [6]:
print(df_reviews_train.head())
print(df_reviews_train.iloc[0]['text'])

                                                text  label  \
0  Robin Williams does his best to combine comedy...      1   
1  I have to hand it to the creative team behind ...      1   
2  Webs starts in 'Chicago: Present Day' as four ...      0   
3  If you like horror movies with lots of blood a...      1   
4  I saw this film under the title of "Tied Up". ...      0   

                                                ents  \
0  [[Robin Williams, 0, 14, 0.5], [Donald Moffat,...   
1  [[American Pie, 53, 72, 0.477818846702575], [A...   
2  [[Richard Grieco, 66, 80, 0.5], [Richard Yearw...   
3  [[horror, 12, 23, 0.35198992490768405], [jump-...   
4                       [[the price, 445, 462, 0.5]]   

                                                 ann  
0  [[Q83338, 0, 14, 0.5], [Q1240192, 87, 100, 0.5...  
1  [[Q220713, 53, 65, 0.477818846702575], [Q22071...  
2  [[Q960809, 66, 80, 0.5], [Q3431337, 88, 104, 0...  
3  [[Q200092, 12, 18, 0.35198992490768405], [Q186...  
4        

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download stopwords and wordnet
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
def text_preprocessing(text):
    # tokenization, remove punctuation, and lemmatization
    words_lemma=[token.lemma_ for token in nlp(text) if not token.is_punct]
    # remove symbols, websites, email addresses
    words_clean = clean_text(words_lemma)
    # remove stopwords
    words_sp = [word.lower() for word in words_clean if word.lower() not in stopwords_lower]
    # combine token list into string  
    s_text = " ".join(words_sp)
    return s_text

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

# import first 10 reviews
texts = df_reviews_train['text'].to_list()
texts = texts[:10]

# initialize function and generate term-frequency-document matrix
vectorizer = CountVectorizer()
vectorizer.fit(texts)
vector = vectorizer.transform(texts)

print("Vocabulary size: {}".format(len(vectorizer.vocabulary_)))
#print out vocabulary
print(vectorizer.vocabulary_)
#print doc-word matrix
print(vector.toarray())

Vocabulary size: 1083
[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 1 0 ... 1 0 1]
 ...
 [0 0 0 ... 0 1 0]
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 2 0]]


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tf = TfidfVectorizer(stop_words='english')
vectorizer_tf.fit(texts)
vector_tf = vectorizer_tf.transform(texts)

print("Vocabulary size: {}".format(len(vectorizer_tf.vocabulary_)))
#print out vocabulary
print(vectorizer_tf.vocabulary_)
#print TF-IDF matrix
print(vector_tf.toarray())

Vocabulary size: 903
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.06034034 0.         0.         ... 0.         0.         0.        ]
 [0.         0.04148171 0.         ... 0.04148171 0.03526325 0.04148171]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.11543003 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [11]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize, word_tokenize

texts = df_reviews_train['text'].to_list()
# tokenize texts before feeding into the model training
data = []
# iterate through each sentence in the file
for t in texts:
    temp = []
    for i in sent_tokenize(t):
        # tokenize the sentence into words
        for j in word_tokenize(i):
            temp.append(j.lower())
    data.append(temp)

# CBOW
model = Word2Vec(data, window=5, min_count=1, workers=4)
model.save("word2vec.model")

In [12]:
import numpy as np

In [13]:
from gensim.models import KeyedVectors

# Access the index of a word
word_index = model.wv.key_to_index['word']
new_vector = np.array([])
# Access the word corresponding to an index
word = model.wv.index_to_key[word_index]

# Access the vector of a word
vector = model.wv.get_vector('word')

# Update the vector of a word
model.wv.set_vecattr('word', 'vector', new_vector)

In [14]:
review_embeddings = []
for review in texts:
    # Split the review into individual words
    words = review.split()
    # Get the word embeddings for each word in the review
    word_embeddings = [model.wv.get_vector(w) for w in words if w in model.wv.key_to_index]
    # Calculate the average word embedding for the review
    review_embedding = np.mean(word_embeddings, axis=0)
    review_embeddings.append(review_embedding)

# Convert the list of review embeddings to a numpy array
review_embeddings = np.array(review_embeddings)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  review_embeddings = np.array(review_embeddings)


In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Split the dataset into training and test sets
text_train, text_test, labels_train, labels_test = train_test_split(df_reviews_train['text'], df_reviews_train['label'], test_size=0.2, random_state=42)

from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
vectorizer = CountVectorizer()

# Fit the vectorizer to the training data and transform the training data
text_train_vec = vectorizer.fit_transform(text_train)

# Transform the test data using the fitted vectorizer
text_test_vec = vectorizer.transform(text_test)

# Train the GaussianNB model
nb_model = GaussianNB()
nb_model.fit(text_train_vec.toarray(), labels_train)

# Train the SVM model
svm_model = SVC(kernel='linear')
svm_model.fit(text_train_vec.toarray(), labels_train)

# Train the logistic regression model
lr_model = LogisticRegression()
lr_model.fit(text_train_vec.toarray(), labels_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
  # Use the trained models to make predictions on the test set
nb_preds = nb_model.predict(text_test_vec.toarray())
svm_preds = svm_model.predict(text_test_vec.toarray())
lr_preds = lr_model.predict(text_test_vec.toarray())

# Calculate the accuracy scores for each model
nb_accuracy = accuracy_score(labels_test, nb_preds)
svm_accuracy = accuracy_score(labels_test, svm_preds)
lr_accuracy = accuracy_score(labels_test, lr_preds)

print("GaussianNB accuracy:", nb_accuracy)
print("SVM accuracy:", svm_accuracy)
print("Logistic regression accuracy:", lr_accuracy)

GaussianNB accuracy: 0.8285714285714286
SVM accuracy: 0.9289285714285714
Logistic regression accuracy: 0.945
