In [82]:
import pandas as pd
import sys
sys.path.append("../src/data_processing")
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import yaml
import pyarrow as pa
import pyarrow.parquet as pq

from preprocess import remove_html_tags, stopwords_removal, lower_case

In [83]:
# Read the data-processing settings once; later cells pull the dataset
# location and file names out of this mapping.
with open("../src/data_processing/config.yaml", "r") as config_file:
    config = yaml.safe_load(config_file)

In [84]:
# Everything needed to locate the CSVs lives under the `dataset` key.
dataset_cfg = config['dataset']
dataset_path, file_names = dataset_cfg['path'], dataset_cfg['filename']

In [85]:
# file_names[0] is loaded as the train split and file_names[1] as the test split.
# The path is formed by plain string concatenation, so `dataset_path` must
# already end with a path separator.
train_csv = dataset_path + file_names[0]
test_csv = dataset_path + file_names[1]

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [86]:
# Sanity check: (rows, columns) for the train and test frames, in that order.
tuple(split.shape for split in (df_train, df_test))

((30000, 2), (20000, 2))

Saving Unprocessed Text

In [87]:
# Persist the untouched frames before any cleaning happens. These go to
# *_raw.parquet because the "Saving Processed Text" cell further down writes
# train.parquet / test.parquet in this same directory; reusing those names
# here would let the unprocessed snapshot be silently overwritten.
raw_snapshots = (
    (df_train, "Writing Train Parquet", '../dataset/processed/train_raw.parquet'),
    (df_test, "Writing Test Parquet", '../dataset/processed/test_raw.parquet'),
)

for frame, desc, out_path in raw_snapshots:
    # The bar advances only once, after the whole file has been written;
    # it marks completion rather than reporting incremental progress.
    with tqdm(total=len(frame), desc=desc) as pbar:
        table = pa.Table.from_pandas(frame)
        pq.write_table(table, out_path)
        pbar.update(len(frame))

Writing Train Parquet: 100%|██████████| 30000/30000 [00:00<00:00, 120609.27it/s]
Writing Test Parquet: 100%|██████████| 20000/20000 [00:00<00:00, 126111.86it/s]


In [88]:
# Peek at the first five raw training rows before any cleaning is applied.
df_train.head(n=5)

Unnamed: 0,review,sentiment
0,SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...,negative
1,"If you want mindless action, hot chicks and a ...",positive
2,"""The Woman in Black"" is easily one of the cree...",positive
3,I can barely find the words to describe how mu...,negative
4,What's in here ?! Let me tell you. It's the pr...,negative


In [89]:
# Peek at the first five raw test rows before any cleaning is applied.
df_test.head(n=5)

Unnamed: 0,review,sentiment
0,Steven Rea plays a forensic scientist thrust o...,positive
1,As the first of the TV specials offered on the...,positive
2,There may something poetically right in seeing...,negative
3,all i can say about this film is to read the b...,negative
4,I thought it was a pretty good movie and shoul...,positive


Removing HTML Tags

In [90]:
# Run the project's `remove_html_tags` helper over every review in both splits.
# NOTE(review): presumably this strips markup such as the `<br />` seen in the
# preview above -- confirm against the preprocess module.
for frame in (df_train, df_test):
    frame['review'] = frame['review'].apply(remove_html_tags)

Converting to Lowercase

In [91]:
# Apply the project's `lower_case` helper to the review text of both splits,
# overwriting the `review` column in place.
for frame in (df_train, df_test):
    frame['review'] = frame['review'].apply(lower_case)

Removing Stopwords

In [92]:
# Apply the project's `stopwords_removal` helper to the review text of both
# splits, overwriting the `review` column in place.
for frame in (df_train, df_test):
    frame['review'] = frame['review'].apply(stopwords_removal)

Saving Processed Text

In [93]:
# Write the cleaned train/test frames to their Parquet files.
processed_outputs = (
    (df_train, "Writing Train Parquet", '../dataset/processed/train.parquet'),
    (df_test, "Writing Test Parquet", '../dataset/processed/test.parquet'),
)

for frame, desc, out_path in processed_outputs:
    # The bar is bumped to 100% in a single step once the write has returned.
    with tqdm(total=len(frame), desc=desc) as pbar:
        table = pa.Table.from_pandas(frame)
        pq.write_table(table, out_path)
        pbar.update(len(frame))

Writing Train Parquet: 100%|██████████| 30000/30000 [00:00<00:00, 82227.66it/s]
Writing Test Parquet: 100%|██████████| 20000/20000 [00:00<00:00, 96822.87it/s]
