In [1]:
import pandas as pd
import numpy as np
import os



In [2]:
class Get_IMDB_data():
    
    def __init__(self,path, subset_size = None, random_state = None):
        self.path = path
        self.subset_size = subset_size
        self.random_state = random_state
        
    
    def get_data(self):
        path_neg_train = self.path+'/train/neg'
        path_pos_train = self.path+'/train/pos'
        
        train_data = pd.concat([self.__get_data(path_neg_train,"neg",self.subset_size),
                                self.__get_data(path_pos_train,"pos",self.subset_size)], axis = 0)
        
        path_neg_test = self.path+'/test/neg'
        path_pos_test = self.path+'/test/pos'
        
        test_data = pd.concat([self.__get_data(path_neg_test,"neg",self.subset_size),
                                self.__get_data(path_pos_test,"pos",self.subset_size)], axis = 0)
        
        return train_data,test_data
        
        
    
    def __get_data(self,path,label, subset_size = None):
        '''
        input:
            path: the path to the txt files
            label: the label for the txt files
        return:
            a pandas data frame with the txt and the corresponding label. 
        '''
        if self.random_state is not None:
            np.random.seed(self.random_state) 
            
        files = os.listdir(path)
        
        if subset_size is not None:
            if subset_size> len(files):
                raise Exception("Subset_size must be be smaller or equal to the number of text file in the directory")
            files = np.random.choice(files, subset_size, replace = False)

        data = pd.DataFrame(None, columns = ["text", "label"])

        for file in files:
            row = {}
            with open(path+"/"+file,encoding='utf8') as f:
                row["text"] = f.read()
                row["label"] = label
                data = data.append(row, ignore_index=True)

        return data

In [3]:
get_IMDB_data = Get_IMDB_data("data/aclImdb", 100,0)
train,test = get_IMDB_data.get_data()
train

Unnamed: 0,txt,label
0,So there's an old security guard and a guy who...,neg
1,I went into a Video Store and looked around to...,neg
2,REALLY? REALLY???? I know if you make a politi...,neg
3,"An annoying experience. Improvised dialogue, h...",neg
4,"There is so much that is wrong with this film,...",neg
...,...,...
95,After repeated listenings to the CD soundtrack...,pos
96,A CRY IN THE DARK <br /><br />A CRY IN THE DAR...,pos
97,"It's hard for me to assign the ""fair"" number o...",pos
98,"When John Singleton is on, he's *on*!! And thi...",pos


In [6]:
train.to_csv("data/training_subdata.csv",index=False)
test.to_csv("data/test_subdata.csv",index=False)