# Create dataset

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# /content/drive/MyDrive/... paths used in the rest of this notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split

# Load the source CSV from the mounted Drive folder into `df`.
# Later cells read the columns 'title', 'abstract', 'description', 'id',
# 'category' and 'category num' from this frame.
path = '/content/drive/MyDrive/Masterthesis/data/destilled_format.csv'
df = pd.read_csv(path)
# Bare expression: shows the frame when it is the last statement of a notebook cell.
df

# Create abstract and description dataset

In [None]:
# Name your new dataset! Every file produced below is written into a folder
# of this name under the Masterthesis/data directory on Drive.
data_set_name = 'data_2'
data_path = '/content/drive/MyDrive/Masterthesis/data/' + data_set_name + '/'

# makedirs(exist_ok=True) creates any missing parent folders as well and is a
# no-op when the folder already exists, so no separate existence check is needed.
os.makedirs(data_path, exist_ok=True)

In [None]:
def create_dataset(df, labels, data_path, idx=None, max_words=500):
  """Build the abstract + description dataset and write it to 'data.csv'.

  The text of each row is the abstract followed by the description, joined
  with a single space and cut off after ``max_words`` whitespace-separated
  words. Missing cells (NaN/None) contribute no words.

  Args:
    df: DataFrame with the columns 'abstract', 'description', 'id' and
      'category'.
    labels: Numeric class labels, one per row of ``df`` and in the same
      order; stored in column '0'.
    data_path: Output folder including its trailing separator; the file
      name 'data.csv' is appended to it directly.
    idx: Optional selector applied as ``data[idx]`` before writing (for
      example a boolean row mask). ``None`` keeps the full frame.
    max_words: Maximum number of words kept in each text.

  The written file has the columns '0' (label), '1' (text), 'id' and
  'full_category' and no index column.
  """
  def words(value):
    # A missing cell would otherwise be stringified to the literal word 'nan'.
    if pd.isna(value):
      return []
    # split() without arguments splits on whitespace runs and never yields
    # empty tokens, so leading blanks do not consume part of the word budget.
    return str(value).split()

  # Concatenate the word lists (not the strings) so the abstract's last word
  # and the description's first word stay separate words.
  texts = [
    ' '.join((words(abstract) + words(description))[:max_words])
    for abstract, description in zip(df['abstract'], df['description'])
  ]

  data = pd.DataFrame()
  data['0'] = labels
  data['1'] = texts
  data['id'] = df['id'].values
  data['full_category'] = df['category'].values

  # Identity test on purpose: `idx != None` is evaluated element-wise for
  # arrays and Series, which makes the `if` itself raise.
  if idx is not None:
    data = data[idx]

  # Write to data folder
  data.to_csv(data_path + 'data.csv', index=False)

In [None]:
def create_dataset_2(df, labels, data_path, idx=None, max_words=256):
  """Build the title + abstract dataset and write it to 'data.csv'.

  The text of each row is the title followed by the abstract, joined with a
  single space and cut off after ``max_words`` whitespace-separated words.
  Missing cells (NaN/None) contribute no words.

  Args:
    df: DataFrame with the columns 'title', 'abstract', 'id' and 'category'.
    labels: Numeric class labels, one per row of ``df`` and in the same
      order; stored in column '0'.
    data_path: Output folder including its trailing separator; the file
      name 'data.csv' is appended to it directly.
    idx: Optional selector applied as ``data[idx]`` before writing (for
      example a boolean row mask). ``None`` keeps the full frame.
    max_words: Maximum number of words kept in each text.

  The written file has the columns '0' (label), '1' (text), 'id' and
  'full_category' and no index column.
  """
  def words(value):
    # A missing cell would otherwise be stringified to the literal word 'nan'.
    if pd.isna(value):
      return []
    # split() without arguments splits on whitespace runs and never yields
    # empty tokens, so leading blanks do not consume part of the word budget.
    return str(value).split()

  # Concatenate the word lists (not the strings) so the title's last word and
  # the abstract's first word stay separate words.
  texts = [
    ' '.join((words(title) + words(abstract))[:max_words])
    for title, abstract in zip(df['title'], df['abstract'])
  ]

  data = pd.DataFrame()
  data['0'] = labels
  data['1'] = texts
  data['id'] = df['id'].values
  data['full_category'] = df['category'].values

  # Identity test on purpose: `idx != None` is evaluated element-wise for
  # arrays and Series, which makes the `if` itself raise.
  if idx is not None:
    data = data[idx]

  # Write to data folder
  data.to_csv(data_path + 'data.csv', index=False)

# Dataset 1_2

In [None]:
# Encode the 'category num' column as consecutive integer class ids.
# `index` holds the original values, positioned by the id assigned to them.
labels, index = pd.factorize(df['category num'])
labels = pd.Series(labels)

# Write the title + abstract dataset for these labels into data_path.
create_dataset_2(df, labels, data_path, idx=None)

# Leaf nodes

In [None]:
# Leaf-level labels: strip every non-digit character from the category string.
# regex=True is passed explicitly because pandas >= 2.0 defaults to
# regex=False, which would treat r'\D+' as a literal substring and strip nothing.
labels = df['category'].str.replace(r'\D+', '', regex=True)

# Remove rows whose label occurs fewer than 5 times.
counts = labels.value_counts()
i = ~labels.isin(counts[counts < 5].index)
labels = labels[i]
# NOTE: this rebinds the notebook-global `df`; cells run afterwards see only
# the filtered rows.
df = df[i]

# Create new label numbers: consecutive integers in order of first appearance.
labels, index = labels.factorize()
labels = pd.Series(labels)

# Save label number assignment (row position = new label number,
# value = original digit string).
pd.Series(index).to_csv(data_path + 'label_assignments.csv')

# Create the dataset!
create_dataset(df, labels, data_path, idx=None)

# Disabled alternative: label by the first character of the category, minus one.
#create_dataset(df, pd.to_numeric(df['category'].str[0])-1, data_path, idx = None)

In [None]:
# Coarse labels: the first character of each category string, re-encoded as
# consecutive integer class ids. `index` holds the original characters,
# positioned by the id assigned to them.
labels, index = pd.factorize(df['category'].str[0])
labels = pd.Series(labels)

# Write the abstract + description dataset for these labels into data_path.
create_dataset(df, labels, data_path, idx=None)

# The rest

In [None]:
def create_train_test(data_path):
  """Split 'data.csv' into stratified train and test files.

  Reads ``data_path + 'data.csv'``, holds out 10% of the rows as the test
  set (stratified on the label column '0', fixed random_state 42) and writes
  'train.csv' and 'test.csv' into the same folder without an index column.

  Args:
    data_path: Folder containing 'data.csv', including its trailing
      separator; file names are appended to it directly.
  """
  full = pd.read_csv(data_path + 'data.csv')
  target = full['0']

  # The label splits returned alongside the frames are not needed: the label
  # column is already part of each frame.
  train_part, test_part, *_ = train_test_split(
    full, target, test_size=0.1, stratify=target, random_state=42
  )

  for file_name, part in (('train.csv', train_part), ('test.csv', test_part)):
    part.to_csv(data_path + file_name, index=False)

In [None]:
# Split the data.csv stored under data_path into train.csv and test.csv.
create_train_test(data_path)

# Save for LOTClass

In [None]:
# Export the train/test split as headerless one-column text files into the
# LOTClass datasets folder for this dataset name.
lotclass_path = '/content/drive/MyDrive/Masterthesis/LOTClass/datasets/'+ data_set_name + '/'

# makedirs(exist_ok=True) creates any missing parent folders as well and is a
# no-op when the folder already exists.
os.makedirs(lotclass_path, exist_ok=True)

train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')

# Column '1' is the text, column '0' the numeric label. No labels file is
# written for the training split.
# NOTE(review): to_csv applies CSV quoting, so a text containing a comma or a
# double quote is written wrapped in double quotes - confirm the consumer of
# these .txt files expects that.
train['1'].to_csv(lotclass_path + 'train.txt', header=False, index=False)
test['1'].to_csv(lotclass_path + 'test.txt', header=False, index=False)
test['0'].to_csv(lotclass_path + 'test_labels.txt', header=False, index=False)

# Save for MixText

In [None]:
# Export the train/test split as headerless CSV files into the MixText data
# folder for this dataset name.
mixtext_path = '/content/drive/MyDrive/Masterthesis/MixText/data/' + data_set_name + '/'

# makedirs(exist_ok=True) creates any missing parent folders as well and is a
# no-op when the folder already exists.
os.makedirs(mixtext_path, exist_ok=True)

train = pd.read_csv(data_path + 'train.csv')
test = pd.read_csv(data_path + 'test.csv')

# Add one to label to fit MixText standard
train['0'] = train['0'] + 1
test['0'] = test['0'] + 1

# Rearrange to: '0' = label, '1' = copy of the label, '2' = text, followed by
# 'id' and 'full_category'. The text must be moved to '2' before '1' is
# overwritten with the label.
train['2'] = train['1']
train['1'] = train['0']
train = train[['0', '1', '2', 'id', 'full_category']]

test['2'] = test['1']
test['1'] = test['0']
test = test[['0', '1', '2', 'id', 'full_category']]

# Written without index and without a header row.
test.to_csv(mixtext_path + 'test.csv', index=False, header=False)
train.to_csv(mixtext_path + 'train.csv', index=False, header=False)