# Environment and Imports

In [1]:
import pandas as pd

In [2]:
import codecs

In [3]:
import os

In [4]:
os.chdir("..")

In [5]:
import json

# Load Data

## Load ro_sent

In [15]:
RO_SENT_TRAIN_PATH = "data/ro_sent_train.csv"

In [16]:
RO_SENT_TEST_PATH = "data/ro_sent_test.csv"

In [17]:
def load_ro_sent_train(path):
    return pd.read_csv(path, encoding="utf-8", index_col="index")

In [18]:
def load_ro_sent_test(path):
    return pd.read_csv(path, encoding="utf-8",  index_col="Unnamed: 0")

In [139]:
ro_sent_train = load_ro_sent_train(RO_SENT_TRAIN_PATH)

In [43]:
ro_sent_test = load_ro_sent_test(RO_SENT_TEST_PATH)

## Load LaRoSeDa

In [12]:
LAROSEDA_TRAIN_PATH = "data/laroseda_train.json"

In [13]:
LAROSEDA_TEST_PATH = "data/laroseda_test.json"

In [14]:
def load_laroseda(path):
    with codecs.open(LAROSEDA_TRAIN_PATH, "r", encoding="utf-8") as infile:
        laroseda_json = json.load(infile)
    return pd.DataFrame.from_dict(laroseda_json["reviews"]).set_index("index")

In [95]:
laroseda_train = load_laroseda(LAROSEDA_TRAIN_PATH)

In [96]:
laroseda_test = load_laroseda(LAROSEDA_TEST_PATH)

# Explore and Combine

In [105]:
ro_sent_train.head()

Unnamed: 0_level_0,text,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,acest document mi-a deschis cu adevarat ochii ...,1
1,tine mancarea rece. ce altceva ii mai trebuie?...,1
2,excelent\nrecomand!,1
3,"ca un rocker imbatranit, acest film mentioneaz...",1
4,"ei bine, a facut o groaza veche si foarte intu...",1


In [98]:
laroseda_train.head()

Unnamed: 0_level_0,title,content,starRating
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11262,Foarte slab,ca aspect este foarte frumoasa dar cine vrea s...,1
3890,Foarte multumit,se incarca repede si tine 3 incarcari complete...,5
9413,Țeapa de zile mari!!!,in primul rand nu este de stica dupa cum spune...,1
9350,Nu merita cumparate,nu merita cumparate... sunt create cu limitare...,1
7126,Recomand,un ceas excelent. face cam tot ce ai nevoie ca...,5


In [99]:
# sanitize title
laroseda_train["title"] = laroseda_train["title"].str.replace('[^ \w+]', '', regex=True)

In [100]:
laroseda_train.head()

Unnamed: 0_level_0,title,content,starRating
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11262,Foarte slab,ca aspect este foarte frumoasa dar cine vrea s...,1
3890,Foarte multumit,se incarca repede si tine 3 incarcari complete...,5
9413,Țeapa de zile mari,in primul rand nu este de stica dupa cum spune...,1
9350,Nu merita cumparate,nu merita cumparate... sunt create cu limitare...,1
7126,Recomand,un ceas excelent. face cam tot ce ai nevoie ca...,5


In [101]:
# add title as sentence
laroseda_train["content"] = laroseda_train["title"] + ". " + laroseda_train["content"]  

In [102]:
laroseda_train.head()

Unnamed: 0_level_0,title,content,starRating
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11262,Foarte slab,Foarte slab. ca aspect este foarte frumoasa da...,1
3890,Foarte multumit,Foarte multumit. se incarca repede si tine 3 i...,5
9413,Țeapa de zile mari,Țeapa de zile mari. in primul rand nu este de ...,1
9350,Nu merita cumparate,Nu merita cumparate. nu merita cumparate... su...,1
7126,Recomand,Recomand. un ceas excelent. face cam tot ce ai...,5


In [103]:
laroseda_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12000 entries, 11262 to 12216
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       12000 non-null  object
 1   content     12000 non-null  object
 2   starRating  12000 non-null  object
dtypes: object(3)
memory usage: 375.0+ KB


In [107]:
laroseda_train["starRating"] = laroseda_train["starRating"].astype(float)

In [108]:
ro_sent_train["label"] = ro_sent_train["label"].astype(float)

In [109]:
# scale star rating
laroseda_train["starRating"] = (laroseda_train["starRating"] - 1) / 4

In [110]:
laroseda_train.describe()

Unnamed: 0,starRating
count,12000.0
mean,0.510896
std,0.458164
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [115]:
laroseda_train = laroseda_train.drop("title", axis=1)

In [116]:
laroseda_train.head()

Unnamed: 0_level_0,content,starRating
index,Unnamed: 1_level_1,Unnamed: 2_level_1
11262,Foarte slab. ca aspect este foarte frumoasa da...,0.0
3890,Foarte multumit. se incarca repede si tine 3 i...,1.0
9413,Țeapa de zile mari. in primul rand nu este de ...,0.0
9350,Nu merita cumparate. nu merita cumparate... su...,0.0
7126,Recomand. un ceas excelent. face cam tot ce ai...,1.0


In [118]:
laroseda_train.columns = ro_sent_train.columns

In [120]:
laroseda_train.head()

Unnamed: 0_level_0,text,label
index,Unnamed: 1_level_1,Unnamed: 2_level_1
11262,Foarte slab. ca aspect este foarte frumoasa da...,0.0
3890,Foarte multumit. se incarca repede si tine 3 i...,1.0
9413,Țeapa de zile mari. in primul rand nu este de ...,0.0
9350,Nu merita cumparate. nu merita cumparate... su...,0.0
7126,Recomand. un ceas excelent. face cam tot ce ai...,1.0


In [121]:
laroseda_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12000 entries, 11262 to 12216
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    12000 non-null  object 
 1   label   12000 non-null  float64
dtypes: float64(1), object(1)
memory usage: 539.3+ KB


In [122]:
ro_sent_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17941 entries, 0 to 6846
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    17651 non-null  object 
 1   label   17941 non-null  float64
dtypes: float64(1), object(1)
memory usage: 420.5+ KB


In [125]:
pd.concat([laroseda_train, ro_sent_train]).info()

<class 'pandas.core.frame.DataFrame'>
Index: 29941 entries, 11262 to 6846
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   text    29651 non-null  object 
 1   label   29941 non-null  float64
dtypes: float64(1), object(1)
memory usage: 701.7+ KB


# Final combine method

In [29]:
def adapt_and_combine(ro_sent_df, laroseda_df):
    # sanitize title
    laroseda_df["title"] = laroseda_df["title"].str.replace('[^ \w+]', '', regex=True)
    
    # add title as sentence
    laroseda_df["content"] = laroseda_df["title"] + ". " + laroseda_df["content"]
    
    # convert label types
    laroseda_df["starRating"] = laroseda_df["starRating"].astype(float)
    ro_sent_df["label"] = ro_sent_df["label"].astype(float)
    
    # rescale star rating
    laroseda_df["starRating"] = (laroseda_df["starRating"] - 1) / 4
    
    # drop columns
    laroseda_df = laroseda_df.drop("title", axis=1)
    
    # rename columns
    laroseda_df.columns = ro_sent_df.columns
    
    # add column for dataset id
    ro_sent_df["dataset"] = "ro_sent"
    laroseda_df["dataset"] = "laroseda"
    
    
    # concat
    df_concat = pd.concat([laroseda_df, ro_sent_df])
    
    # remove nulls and return
    return df_concat.dropna()

In [30]:
# reload data
laroseda_train = load_laroseda(LAROSEDA_TRAIN_PATH)
laroseda_test = load_laroseda(LAROSEDA_TEST_PATH)
ro_sent_test = load_ro_sent_test(RO_SENT_TEST_PATH)
ro_sent_train = load_ro_sent_train(RO_SENT_TRAIN_PATH)

In [31]:
final_train = adapt_and_combine(ro_sent_train, laroseda_train)

In [32]:
final_test = adapt_and_combine(ro_sent_test, laroseda_test)

In [33]:
final_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29651 entries, 11262 to 6846
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   text     29651 non-null  object 
 1   label    29651 non-null  float64
 2   dataset  29651 non-null  object 
dtypes: float64(1), object(2)
memory usage: 926.6+ KB


In [34]:
final_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23005 entries, 11262 to 4827
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   text     23005 non-null  object 
 1   label    23005 non-null  float64
 2   dataset  23005 non-null  object 
dtypes: float64(1), object(2)
memory usage: 718.9+ KB


In [35]:
FINAL_TEST_SAVE_PATH = "data/test.csv"

In [36]:
FINAL_TRAIN_SAVE_PATH = "data/train.csv"

In [37]:
final_test.to_csv(FINAL_TEST_SAVE_PATH)

In [38]:
final_train.to_csv(FINAL_TRAIN_SAVE_PATH)