In [1]:
import numpy as np
import pandas as pd

In [2]:
FNAME = '../../datos/imdb_tokenized.csv'

In [3]:
from collections import defaultdict
def parse_dataset(fname):
    """Read a comma-separated dataset file into a DataFrame of strings.

    Every non-blank line is split on its first three commas into the fields
    ``id``, ``dataset``, ``response`` and ``tokens``. Everything after the
    third comma is stored verbatim in ``tokens``: further commas, quotes and
    the line's trailing newline are all kept. The trailing newline is
    deliberately preserved so rows round-trip unchanged through
    ``dataset_to_csv``. No header row is expected; every non-blank line
    becomes a data row.

    Args:
        fname: Path of the file to read.

    Returns:
        A ``pd.DataFrame`` with the columns ``id``, ``dataset``, ``response``
        and ``tokens`` (in that order), one row per non-blank line. A file
        without data rows yields an empty frame that still has the four
        columns.

    Raises:
        ValueError: If a non-blank line has fewer than four comma-separated
            fields.
    """
    columns = ('id', 'dataset', 'response', 'tokens')
    # Pre-create every column so an empty file still produces the full schema.
    data = {name: [] for name in columns}
    with open(fname, 'r') as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                # Blank lines (typically a trailing one) carry no record.
                continue
            fields = line.split(',', 3)
            if len(fields) != len(columns):
                raise ValueError(
                    '{}: line {} has {} field(s), expected {}'.format(
                        fname, lineno, len(fields), len(columns)))
            for name, value in zip(columns, fields):
                data[name].append(value)
    return pd.DataFrame.from_dict(data)

def dataset_to_csv(dataset, fname):
    """Write a dataset frame to ``fname``, one comma-joined record per line.

    For every row the ``id``, ``dataset``, ``response`` and ``tokens`` values
    are joined with commas, in that order. No header is written and no
    quoting or escaping is applied: the values are emitted as they are.

    A record that does not already end in a newline gets one appended, so
    consecutive rows can never run together on the same line. Records that
    already end in a newline (as produced by ``parse_dataset``) are written
    unchanged.

    Args:
        dataset: ``pd.DataFrame`` with at least the columns ``id``,
            ``dataset``, ``response`` and ``tokens``.
        fname: Path of the output file; it is created or overwritten.

    Raises:
        KeyError: If ``dataset`` has rows but lacks one of the four columns.
    """
    columns = ('id', 'dataset', 'response', 'tokens')
    with open(fname, 'w') as f:
        if len(dataset) == 0:
            # Nothing to write; the file is still created (or truncated).
            return
        # Walking the columns in lockstep avoids building one Series per row,
        # which is what iterrows() does.
        for fields in zip(*(dataset[name] for name in columns)):
            # str() keeps string values untouched and lets numeric ids through.
            line = ','.join(str(value) for value in fields)
            if not line.endswith('\n'):
                line += '\n'
            f.write(line)


In [4]:
# Parse the full tokenized file into a DataFrame and preview its first rows.
df = parse_dataset(fname=FNAME)
df.head(n=5)

Unnamed: 0,id,dataset,response,tokens
0,0,test,neg,"277,174,578,7118,48,3325,49,3,17,16,225,1113,7..."
1,1,test,neg,"11,6,36,454,4,137,1,2146,4,221,109,26,1,167,38..."
2,2,test,neg,"93,4,33,10,722,147,5212,15645,37,35019,490,47,..."
3,3,test,neg,"25,60,1,6161,80,872,691,286,413,2,252,2423,232..."
4,4,test,neg,"9603,1271,104,6,25,3,3706,656,16,97,67,26,635,..."


In [5]:
# Partition the parsed rows by the value of their 'dataset' column.
train = df.loc[df['dataset'].eq('train')]
test = df.loc[df['dataset'].eq('test')]
train.head(n=5)

Unnamed: 0,id,dataset,response,tokens
25000,25000,train,neg,"71,4,3,127,37,48,7464,1352,16,3,5214,511,49,15..."
25001,25001,train,neg,"3785,23670,511,14,3,3347,169,8894,12062,1481,6..."
25002,25002,train,neg,"11,19,3187,143,10,80,23,265,59,3811,21,32,93,3..."
25003,25003,train,neg,"726,286,10,122,11,6,425,5,30,36,24,529,22,19,1..."
25004,25004,train,neg,"53,10,13,121,59,737,522,75,329,5,1,738,5,69,91..."


In [6]:
# Draw a random subset of each split and stack them into a single dataset.
n_samples = 2500
# Seeded generator: re-running this cell (or the whole notebook) selects the
# same rows. Sharing one generator keeps the two draws independent of each other.
sample_rng = np.random.RandomState(42)
train_sample = train.sample(n_samples, random_state=sample_rng)
test_sample = test.sample(n_samples, random_state=sample_rng)
train_test_sample = pd.concat([train_sample, test_sample])
train_test_sample.head()

Unnamed: 0,id,dataset,response,tokens
27377,27377,train,neg,"10,184,23,255,1,1117,4,24696,5891,11585,2,2547..."
46618,46618,train,pos,"4,147,744,130,159,1740,10,629,979,15,72234,1,7..."
35528,35528,train,neg,"339,12,3,168,4,197,109,26,442,21,1,820,12,31,4..."
43056,43056,train,pos,"10,122,12,20,141,455,5675,11,9,19,7639,42,32,2..."
45890,45890,train,pos,"10,7513,9,3,1679,48718,2,11,19,13,568,103,73,7..."


In [7]:
# Persist the combined sample to disk.
OUT_NAME = '../data/train_test_sample.csv'
dataset_to_csv(dataset=train_test_sample, fname=OUT_NAME)

In [8]:
ls

Datasets.ipynb                        HyperparameterTuning-StopWords.ipynb
[34mExperimentos_4[m[m/                       Metodo de Potencias.ipynb
[34mExperimentos_5[m[m/                       train_test_sample.csv
Hyperparameter Tuning.ipynb


In [9]:
# Hold-out split drawn entirely from the training rows: a training sample plus
# a disjoint validation sample, written together to a single file.
n_samples_train = 3000
n_samples_val = 1000
# Seeded generator so re-running this cell selects the same rows.
sample_rng = np.random.RandomState(42)
train_sample = train.sample(n_samples_train, random_state=sample_rng)
# Validation rows are taken only from the training rows not picked above.
val_sample = (
    train.loc[~train.index.isin(train_sample.index)]
    .sample(n_samples_val, random_state=sample_rng)
)
# The validation rows are what gets written next to the training sample
# (previously val_sample was computed but a test_sample left over from an
# earlier cell was written instead).
# NOTE(review): the hold-out rows are relabelled 'test' on the assumption that
# whatever reads this file separates fit/evaluation rows by the 'dataset'
# column - confirm against the consumer.
train_test_sample = pd.concat([train_sample, val_sample.assign(dataset='test')])

OUT_NAME = '../data/train_test_sample.csv'
dataset_to_csv(train_test_sample, OUT_NAME)

In [10]:
ls

Datasets.ipynb                        HyperparameterTuning-StopWords.ipynb
[34mExperimentos_4[m[m/                       Metodo de Potencias.ipynb
[34mExperimentos_5[m[m/                       train_test_sample.csv
Hyperparameter Tuning.ipynb


In [12]:
# Build one dataset file per training-set size (1000, 2000, ..., 25000 rows).
# Seeded generator so re-running this cell regenerates identical files.
sample_rng = np.random.RandomState(42)
# The test sample is drawn once, outside the loop, so every training size is
# paired with exactly the same test rows and the resulting files are comparable.
test_sample = test.sample(15000, random_state=sample_rng)

for i in range(1000, 25001, 1000):
    train_sample = train.sample(i, random_state=sample_rng)
    train_test_sample = pd.concat([train_sample, test_sample])
    OUT_NAME = '../data/train_test_sample_' + str(i) + '.csv'
    dataset_to_csv(train_test_sample, OUT_NAME)