In [1]:
# import libraries
import pandas as pd
import numpy as np
import nltk
import re
import logging
import time 
import matplotlib.pyplot as plt
import string
from PIL import Image
from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
from sklearn.cluster import KMeans 
from wordcloud import WordCloud, STOPWORDS
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import wordnet
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

## Load Data and Data Proprocessing

In [2]:
# load data
train = pd.read_csv("../data/labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
test = pd.read_csv("../data/testData.tsv", header=0, delimiter="\t", quoting=3)
df_train1=pd.read_csv("../data/imdb_master.csv",encoding="latin-1")

#To ensure sufficient data set, mix all 50,000 pieces of data in label and test set.

test.insert(1, 'sentiment', 0)
for i in range(len(test)):
    if int(re.sub("[^0-9]", " ", test["id"][i].split("_")[1])) <= 5:
        test['sentiment']=0
    elif int(re.sub("[^0-9]", " ", test["id"][i].split("_")[1])) > 5:
        test['sentiment']=1

# Divided into train set and test set according to 8: 2
test_set,train_set=train_test_split(test,test_size=0.6,random_state=1)
train_set=train.append(train_set)
                                 

#Processing the imported external data, eliminating irrelevant and unsupervised items and unifying the format.
df_train1=df_train1.drop(["type",'file'],axis=1)
df_train1.rename(columns={'label':'sentiment',
                          'Unnamed: 0':'id',
                          'review':'review'}, 
                 inplace=True)               
df_train1 = df_train1[df_train1.sentiment != 'unsup']
maping = {'pos': 1, 'neg': 0}
df_train1['sentiment'] = df_train1['sentiment'].map(maping)

train_set=train_set.reset_index(drop=True)
test_set=test_set.reset_index(drop=True)

## Train the Word2vec Model

In [3]:
#Data cleaning function
def review_to_wordlist( review, remove_stopwords=False ):
    
    #Use the BeautifulSoup library to get rid of the original text and remove symbols such as < br/>.
    review_text = BeautifulSoup(review).get_text()

    #Remove non-letters with regular expression
    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    #Convert to lower case, split into individual words
    words = review_text.lower().split()

    #Create set of stopwords and remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]
        
    #return the result.
    return(words)

#Load the punkt tokenizer and break the paragraph into different sentences, because Word2Vec needs a single sentence.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    
    #Define a function to split a comment into parsed sentences.
    raw_sentences = tokenizer.tokenize(review.strip())

    #Function to split a comment into parsed sentences.
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append( review_to_wordlist( raw_sentence, remove_stopwords ))

    #Returns a list of lists
    return sentences
sentences = []  

for review in train_set["review"]:
    sentences += review_to_sentences(review, tokenizer)
    

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',level=logging.INFO)

#Set values for various parameters
num_features = 300                   
min_word_count = 40                     
num_workers = 4       
context = 10                                                                                            
downsampling = 1e-3   

# Use Word2Vec algorithm
from gensim.models import word2vec
model = word2vec.Word2Vec(sentences, workers=num_workers, \
            vector_size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)


model.init_sims(replace=True)

# save the model
model_name = "300features_40minwords_10context"
model.save(model_name)

2022-04-21 22:27:09,584 : INFO : collecting all words and their counts
2022-04-21 22:27:09,587 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-04-21 22:27:09,617 : INFO : PROGRESS: at sentence #10000, processed 225908 words, keeping 17776 word types
2022-04-21 22:27:09,647 : INFO : PROGRESS: at sentence #20000, processed 452097 words, keeping 24953 word types
2022-04-21 22:27:09,679 : INFO : PROGRESS: at sentence #30000, processed 671984 words, keeping 30044 word types
2022-04-21 22:27:09,712 : INFO : PROGRESS: at sentence #40000, processed 898757 words, keeping 34358 word types
2022-04-21 22:27:09,747 : INFO : PROGRESS: at sentence #50000, processed 1122100 words, keeping 37824 word types
2022-04-21 22:27:09,781 : INFO : PROGRESS: at sentence #60000, processed 1341946 words, keeping 40782 word types
2022-04-21 22:27:09,820 : INFO : PROGRESS: at sentence #70000, processed 1566792 words, keeping 43383 word types
2022-04-21 22:27:09,854 : INFO : PROGRESS: 

In [4]:
#Using K-means clustering method
from sklearn.cluster import KMeans
import time

start = time.time() 

#Set k(num _ clusters) to 1/5 of the vocabulary, or an average of 5 words per cluster.
word_vectors = model.wv.vectors
num_clusters = int(word_vectors.shape[0] / 5)

#Initialize a k-means object and use it to extract the centroid. The incoming K-means must be of type int.
kmeans_clustering = KMeans( n_clusters = num_clusters )
idx = kmeans_clustering.fit_predict( word_vectors )

end = time.time()
elapsed = end - start
print("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  330.8599421977997 seconds.


In [5]:
#Create a word/index dictionary and map each vocabulary word to a cluster number.             
word_centroid_map = dict(zip( model.wv.index_to_key, idx ))
for cluster in range(0,10):

    print("\nCluster %d" % cluster)
    words = []
    for i in range(0,len(word_centroid_map.values())):
        if(list(word_centroid_map.values())[i] == cluster):
            words.append(list(word_centroid_map.keys())[i])
    print(words)


Cluster 0
['controlled', 'motivated', 'consumed', 'insulted', 'blinded']

Cluster 1
['bush', 'advertising', 'counter', 'campaign', 'ch', 'coverage', 'administration', 'newspapers']

Cluster 2
['disguised', 'posing', 'glorified', 'monstrous']

Cluster 3
['monroe', 'marilyn', 'lana']

Cluster 4
['window', 'cutting', 'lights', 'doors', 'walls', 'dust', 'buildings', 'pan', 'rooms', 'trees', 'equipment', 'dirt', 'tube', 'windows', 'furniture', 'pans', 'lens', 'zoom', 'roads', 'boxes', 'corners', 'chairs', 'filter']

Cluster 5
['bloom', 'heath', 'liam', 'ledger', 'turturro', 'leonardo', 'hayden', 'cathy', 'dicaprio', 'orlando', 'neeson']

Cluster 6
['it', 'this']

Cluster 7
['self', 'indulgent', 'conscious', 'absorbed', 'righteous', 'proclaimed', 'indulgence', 'consciously']

Cluster 8
['pitt', 'sutherland', 'jamie', 'macy', 'jeffrey', 'bridges', 'nicholas', 'paxton', 'pierce', 'reliable', 'busey', 'pegg', 'arkin', 'dafoe', 'bon', 'dominic', 'wahlberg', 'bernsen', 'keitel', 'pertwee', 'syke

In [6]:
clean_train_reviews = []
for review in train_set["review"]:
    clean_train_reviews.append( review_to_wordlist( review, remove_stopwords=True ))

clean_test_reviews = []
for review in test_set["review"]:
    clean_test_reviews.append( review_to_wordlist( review, remove_stopwords=True ))

#The centroid bag function converts comments into centroid bags    
def create_bag_of_centroids( wordlist, word_centroid_map ):

    #The number of clusters is equal to the highest cluster index in the word/centroid graph.
    num_centroids = max( word_centroid_map.values() ) + 1

    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )

    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            bag_of_centroids[index] += 1

    # Return the "bag of centroids"
    return bag_of_centroids

#Assign an array (for speed) to the training set package of the centroid in advance.
train_centroids = np.zeros( (train_set["review"].size, num_clusters), \
    dtype="float32" )

# Transform the training set reviews into bags of centroids
counter = 0
for review in clean_train_reviews:
    train_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1

# Repeat for test reviews 
test_centroids = np.zeros(( test_set["review"].size, num_clusters), \
    dtype="float32" )

counter = 0
for review in clean_test_reviews:
    test_centroids[counter] = create_bag_of_centroids( review, \
        word_centroid_map )
    counter += 1

## Random Forest Algorithm

In [7]:
#Fitting random forest and extracting prediction
forest = RandomForestClassifier(n_estimators = 100)

# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
forest = forest.fit(train_centroids,train_set["sentiment"])
result = forest.predict(test_centroids)

output = pd.DataFrame(data={"id":test_set["id"], "sentiment":test_set["sentiment"],"sentiment_new":result})
output.to_csv( "BagOfCentroids.csv", index=False, quoting=3 )

Fitting a random forest to labeled training data...


In [8]:
#judge the result
count=0
model = pd.read_csv("BagOfCentroids.csv");
for i in range(model.shape[0]):
    if model["sentiment"][i]==model["sentiment_new"][i]:
        count=count+1
        
print(count/model.shape[0])

0.8799
