In [11]:
import numpy as np
import gensim
import os
import time
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import re
from bs4 import BeautifulSoup

In [12]:
def generate_data(subset):
    start_time = time.time()
    # Making sure subset is either train or test
    assert subset == 'train' or subset == 'test'
    
    # These arrays are going to store our 
    pos_reviews = []
    neg_reviews = []
    
    pos_path = subset+'/pos'
    neg_path = subset+'/neg'
    
    
    # Generating both positive and negative arrays with its respective labels
    for folder in os.listdir(subset):
        
        if folder == 'pos':
            for file in os.listdir(pos_path):
                file_path = pos_path+'/'+file
                f = open(file_path,'r')
                    
                # Verifying if it could read the file properly
                if(f):
                    pos_reviews.append(f.read())
                    f.close()
                    
            pos_label_array = np.ones((len(pos_reviews)))
        
        else:
            for file in os.listdir(neg_path):
                file_path = neg_path+'/'+file
                f = open(file_path,'r')
                
                # Verifying if it could read the file properly
                if(f):
                    neg_reviews.append(f.read())
                    f.close()
                    
            neg_label_array = np.zeros((len(neg_reviews)))
    
    # Joining arrays
    joint_reviews = np.concatenate((pos_reviews,neg_reviews))
    joint_labels = np.concatenate((pos_label_array,neg_label_array))
    
    # Making sure both have the same lenght
    assert len(joint_reviews) == len(joint_labels)
    print(f'Time took: {time.time() - start_time}s')
    return joint_reviews, joint_labels

In [13]:
X,Y = generate_data('train')

Time took: 4.416279315948486s


In [14]:
len(X),len(Y)

(20313, 20313)

In [41]:
def preprocessing_data(reviews_array,remove_sw= None):
    treated_array = []
    sw = stopwords.words('english')
    
    for review in reviews_array:
        
        #Removing extra whitespaces, lowering all cases in string and removing punctuation and stopwords
        review = review.lower()
        review = BeautifulSoup(review).get_text()
        review_no_punct = re.sub(r"[^\w\s]",'',review)
        review_tokens = word_tokenize(review_no_punct)
        
        if remove_sw:
            final_review = [word for word in review_tokens if word not in sw]
        else:
            final_review = review_tokens
            
        treated_array.append(" ".join(final_review).strip())
    
    assert len(reviews_array) == len(treated_array)
    return treated_array

In [46]:
X_treated_w_sw = preprocessing_data(X[:50])
X_treated_wout_sw = preprocessing_data(X[:50], remove_sw = True)

In [47]:
X_treated[0]

'clint eastwood has definitely produced better movies than this but this one does not embarrass him dirty harry catches everyones attention and unless one wants to watch romance there is no reason why you wont like him he is cool because he is dirty is great because he kills without much thinking is perfect because he gets the bullet right through your heart and a hero because he doesnt carefrom what i have seen in movies in which eastwood acts the character of the lead role always captivates the audience in white hunter black heart he is the crazy director in in the line of fire he is the old un while here is the almost jobless with his job that is to say he makes work for himself doesnt care one damn about his superiors who practically send him out for a vacationbased on a rape victim this movie is promising for all the no nonsense movie watchers the movie has nothing that goes away from he central plot however what makes it slightly inferior to the better movies of eastwood is that 

In [49]:
X_treated_wout_sw[0]

'clint eastwood definitely produced better movies one embarrass dirty harry catches everyones attention unless one wants watch romance reason wont like cool dirty great kills without much thinking perfect gets bullet right heart hero doesnt carefrom seen movies eastwood acts character lead role always captivates audience white hunter black heart crazy director line fire old un almost jobless job say makes work doesnt care one damn superiors practically send vacationbased rape victim movie promising nonsense movie watchers movie nothing goes away central plot however makes slightly inferior better movies eastwood though character lead role captivating plot far obvious beginning movie going make sit place without moving also many people far dirtier dirty harry'

In [48]:
X[0]

'Clint Eastwood has definitely produced better movies than this, but this one does not embarrass him. Dirty Harry catches everyone\'s attention and unless one wants to watch romance, there is no reason why you won\'t like him. He is cool because he is dirty, is great because he kills without much thinking, is perfect because he gets the bullet right through your heart and a hero because he doesn\'t care.<br /><br />From what I have seen in movies in which Eastwood acts, the character of the lead role always captivates the audience. In White Hunter Black heart, he is the crazy director, in "in the Line of Fire" he is the "Old \'un" while here is the "almost" jobless with his job, that is to say he makes work for himself, doesn\'t care one damn about his superiors who practically send him out for a vacation.<br /><br />Based on a rape victim, this movie is promising for all the "no non-sense" movie watchers. The movie has nothing that goes away from he central plot. However, what makes i