In [None]:
import csv
from gensim.models import Word2Vec
!pip install html2text
import html2text as h2t
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd
import re
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split as tts

In [None]:
import nltk
# Download the two NLTK resources this notebook requests: the stop-word
# corpus and the punkt_tab tokenizer data.
for nltk_resource in ('stopwords', 'punkt_tab'):
  nltk.download(nltk_resource)

In [None]:
# Directory holding the three TSV files. "/content" is where the files live in
# our Google Colab session; change this single value when running elsewhere.
DATA_DIR = "/content"

# Each frame is loaded from the file whose name matches the variable:
#   labeled_train_data   <- labeledTrainData.tsv
#   unlabeled_train_data <- unlabeledTrainData.tsv
#   test_data            <- testData.tsv
# (Previously the last two paths were swapped, so the submission cell ended up
# predicting on the unlabeled training file instead of the test file.)
# csv.QUOTE_NONE is the named form of the value 3: quote characters in the
# files are kept as ordinary text instead of being interpreted by the parser.
labeled_train_data = pd.read_csv(f"{DATA_DIR}/labeledTrainData.tsv", header = 0, delimiter = "\t", quoting = csv.QUOTE_NONE)
unlabeled_train_data = pd.read_csv(f"{DATA_DIR}/unlabeledTrainData.tsv", header = 0, delimiter = "\t", quoting = csv.QUOTE_NONE)
test_data = pd.read_csv(f"{DATA_DIR}/testData.tsv", header = 0, delimiter = "\t", quoting = csv.QUOTE_NONE)

In [None]:
def review_to_token(review_raw):
  """Turn one raw review string into a list of lowercase word tokens.

  The text is first passed through html2text, then every character that is
  not an ASCII letter is replaced by a space, the result is lowercased, and
  finally it is split into tokens with word_tokenize.
  """
  plain_text = h2t.html2text(review_raw)
  letters_only = re.sub("[^a-zA-Z]", " ", plain_text).lower()
  return word_tokenize(letters_only)

# Cache for the default English stop-word set so the NLTK corpus is read once
# instead of once per review.
_ENGLISH_STOP_WORDS = None

def remove_stop_words(tokenized_review, stop_words=None):
  """Return the tokens of a review with stop words filtered out.

  Args:
    tokenized_review: iterable of word tokens.
    stop_words: optional collection of words to drop. When omitted, the
      NLTK English stop-word list is used (loaded on first use and reused
      on later calls).

  Returns:
    A new list holding, in their original order, the tokens that are not
    stop words.
  """
  global _ENGLISH_STOP_WORDS
  if stop_words is None:
    if _ENGLISH_STOP_WORDS is None:
      _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    stop_words = _ENGLISH_STOP_WORDS
  return [word for word in tokenized_review if word not in stop_words]

# Raw review strings from the labeled and unlabeled frames.
# (unlabeled_reviews is not referenced again in the cells visible here.)
labeled_reviews = labeled_train_data['review'].tolist()
unlabeled_reviews = unlabeled_train_data['review'].tolist()

# Tokenize every labeled review and strip its stop words, keeping the
# original review order.
cleaned_reviews = [
  remove_stop_words(review_to_token(raw_review))
  for raw_review in labeled_reviews
]

def word2vec_func(reviews, vector_size=100, window=10, min_count=1, sg=0):
  """Construct a gensim Word2Vec model from tokenized reviews.

  reviews is handed to Word2Vec as its first positional argument;
  vector_size, window, min_count and sg are forwarded unchanged as keyword
  arguments. The resulting model object is returned.
  """
  hyperparameters = {
    "vector_size": vector_size,
    "window": window,
    "min_count": min_count,
    "sg": sg,
  }
  return Word2Vec(reviews, **hyperparameters)

In [None]:
# Build the Word2Vec model from the cleaned labeled reviews, using
# word2vec_func's default hyperparameters.
w2v_model = word2vec_func(cleaned_reviews)

def vector_average(tokenized_review, w2v_model):
  """Average the word vectors of a tokenized review.

  Args:
    tokenized_review: iterable of word tokens.
    w2v_model: model whose `wv` attribute supports `word in wv`,
      `wv[word]` and exposes `vector_size` (gensim 4 style access through
      `.wv` rather than indexing the model directly).

  Returns:
    A 1-D array: the element-wise mean of the vectors of every token found
    in `w2v_model.wv`. If no token is found (empty review or only unknown
    words), a zero vector of length `wv.vector_size` is returned instead of
    the scalar NaN that np.mean yields for an empty list, so every review
    gets a vector of the same shape and the results can be stacked into one
    2-D feature matrix.
  """
  keyed_vectors = w2v_model.wv
  raw_vectors = [keyed_vectors[word] for word in tokenized_review if word in keyed_vectors]
  if not raw_vectors:
    return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
  return np.mean(raw_vectors, axis = 0)

# One averaged embedding per cleaned labeled review, in the same order as
# cleaned_reviews.
review_vectors = [
  vector_average(tokens, w2v_model)
  for tokens in cleaned_reviews
]

In [None]:
# Feature matrix (one averaged review vector per row) and sentiment labels.
x = np.array(review_vectors)
y = labeled_train_data['sentiment']

# Hold out 20% of the labeled reviews for evaluation; fixed seed for a
# repeatable split.
x_train, x_test, y_train, y_test = tts(
  x,
  y,
  test_size=0.2,
  random_state=42,
)

# Fit a 100-tree random forest on the training portion.
ran_forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(x_train, y_train)

# Score the classifier on the held-out portion and report its accuracy.
y_predict = ran_forest.predict(x_test)
accuracy = accuracy_score(y_test, y_predict)
print(accuracy)

In [None]:
# Clean the test reviews with the same tokenization and stop-word removal
# applied to the labeled reviews.
test_reviews = test_data['review'].tolist()
cleaned_test_reviews = [
  remove_stop_words(review_to_token(raw_review))
  for raw_review in test_reviews
]

# One averaged embedding per test review, stacked into a 2-D array.
test_vector = np.array([vector_average(tokens, w2v_model) for tokens in cleaned_test_reviews])

test_predictions = ran_forest.predict(test_vector)

final_df = pd.DataFrame({
    'id': test_data['id'],
    'sentiment': test_predictions
})
# The input was read with quoting disabled, so any double quotes around the
# ids are still part of the id strings (assumption: the TSV wraps ids in
# quotes -- confirm against the data). Writing with default quoting would
# wrap and double those quotes, so write with quoting disabled as well to
# emit the ids exactly as they were read.
final_df.to_csv("submission478.csv", index = False, quoting = csv.QUOTE_NONE)

# Preview the submission. A bare `final_df.head(5)` in the middle of a cell
# is evaluated and discarded, so print it explicitly.
print(final_df.head(5))

# Examining the counts of the sentiment type in each dataframe
print(final_df['sentiment'].value_counts())
print(labeled_train_data['sentiment'].value_counts())