Import modules

In [19]:
import sys
import pandas as pd

sys.path.append("..")

from src.text_chunk import TextChunk

Import data

In [4]:
# The two pair files are read as headerless CSVs, so the column names are supplied here.
pair_columns = ['A', 'B', 'different_author']
train = pd.read_csv('../data/train_chunks.csv', names=pair_columns, header=None)
val = pd.read_csv('../data/validation_chunks.csv', names=pair_columns, header=None)

# The first column of the imposters file becomes the row index.
imposters = pd.read_csv('../data/imposters/imposters.csv', index_col=0)

Get the feature names and write a DataFrame that maps feature index -> feature name to csv

In [8]:
# Build a TextChunk from the first 'A' text only to read its feature_names
# (presumably the names are the same for every chunk -- TODO confirm).
first_text = train.loc[0, 'A']
feature_names = TextChunk(first_text).feature_names

# Persist the names; the index column written by to_csv gives each feature's position.
feature_names_frame = pd.DataFrame(data=feature_names, columns=['feature'])
feature_names_frame.to_csv('../data/features/feature_names.csv')

In [16]:
# Number of entries in feature_names; the assert pins the expected count of 938.
number_of_features = len(feature_names)
assert number_of_features == 938

# One positional column name per feature: A_* / B_* for the two sides of a pair,
# I_* for imposters.
a_columns_names = list(map('A_{}'.format, range(number_of_features)))
b_columns_names = list(map('B_{}'.format, range(number_of_features)))
imposters_column_names = list(map('I_{}'.format, range(number_of_features)))

Transform train texts to features and write as csv

In [17]:
# Vectorize both text columns into separate Series instead of overwriting
# train['A'] / train['B']. Keeping the raw texts in `train` makes this cell safe to
# re-run (TextChunk always receives the text, never an already-computed vector) and
# keeps the earlier cell that builds a TextChunk from the first 'A' text working.
train_a_vectors = train['A'].apply(lambda text: TextChunk(text).to_vector())
train_b_vectors = train['B'].apply(lambda text: TextChunk(text).to_vector())

# One column per feature for each side of the pair, followed by the label column.
# The explicit index keeps the feature rows aligned with train['different_author']
# in the column-wise concat.
train_vectors = pd.concat([pd.DataFrame(train_a_vectors.tolist(), columns=a_columns_names, index=train.index),
                           pd.DataFrame(train_b_vectors.tolist(), columns=b_columns_names, index=train.index),
                           train['different_author']],
                          axis=1)

train_vectors.to_csv('../data/train_features.csv')

Transform validation texts to features and write as csv

In [18]:
# Vectorize both text columns into separate Series instead of overwriting
# val['A'] / val['B']. Keeping the raw texts in `val` makes this cell safe to re-run:
# TextChunk always receives the text, never an already-computed vector.
val_a_vectors = val['A'].apply(lambda text: TextChunk(text).to_vector())
val_b_vectors = val['B'].apply(lambda text: TextChunk(text).to_vector())

# One column per feature for each side of the pair, followed by the label column.
# The explicit index keeps the feature rows aligned with val['different_author']
# in the column-wise concat.
val_vectors = pd.concat([pd.DataFrame(val_a_vectors.tolist(), columns=a_columns_names, index=val.index),
                         pd.DataFrame(val_b_vectors.tolist(), columns=b_columns_names, index=val.index),
                         val['different_author']],
                        axis=1)

val_vectors.to_csv('../data/validation_features.csv')

Clean imposters data:
    - some imposters contained numbers instead of text; drop them by index
    - remove imposters whose text is not longer than 500 characters (len <= 500)

In [20]:
# Index labels of the imposters whose text held numbers instead of prose.
corrupted_imposters_indices = [6872, 6876]

# Reassign rather than drop in place, and ignore labels that are already gone, so
# re-running this cell does not raise a KeyError after the rows were removed once.
imposters = imposters.drop(corrupted_imposters_indices, errors='ignore')

# Keep only texts strictly longer than 500 characters. `.str.len()` yields NaN for
# missing or non-string entries, so such rows are filtered out instead of making
# len() raise a TypeError.
imposters = imposters[imposters['text'].str.len() > 500]

Choose 1000 random imposters

In [22]:
# Draw 1000 rows; the fixed random_state makes the draw repeatable for the same input frame.
random_imposters = imposters.sample(random_state=123, n=1000)
assert len(random_imposters) == 1000

Transform the 1000 random imposters to vectors and write as csv

In [28]:
# Vectorize every sampled text; the vectors are stored in a 'features' column.
random_imposters['features'] = random_imposters['text'].apply(
    lambda chunk_text: TextChunk(chunk_text).to_vector()
)

# One positional column name per feature, prefixed with I_ for imposters.
imposters_column_names = list(map('I_{}'.format, range(number_of_features)))

# Spread each vector over one column per feature. The frame is built from a plain
# list, so it gets a default integer index rather than the sampled row labels.
feature_rows = random_imposters['features'].tolist()
imposters_vectors = pd.DataFrame(feature_rows, columns=imposters_column_names)

assert imposters_vectors.shape == (1000, 938)

imposters_vectors.to_csv('../data/imposters/imposters_subset_features.csv')