In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
# Load the raw reviews, drop every row that has a missing value,
# and renumber the remaining rows from 0.
df = (
    pd.read_csv("data/Dataset1/Books_rating.csv")
    .dropna()
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,829814000,Wonderful Worship in Smaller Churches,19.4,AZ0IOBU20TBOP,Rev. Pamela Tinnin,8/10,5.0,991440000,Outstanding Resource for Small Church Pastors,"I just finished the book, &quot;Wonderful Wors..."
1,829814000,Wonderful Worship in Smaller Churches,19.4,A373VVEU6Z9M0N,Dr. Terry W. Dorsett,1/1,5.0,1291766400,Small Churches CAN Have Wonderful Worship,Many small churches feel like they can not hav...
2,829814000,Wonderful Worship in Smaller Churches,19.4,AGKGOH65VTRR4,"Cynthia L. Lajoy ""Cindy La Joy""",1/1,5.0,1248307200,Not Just for Pastors!,I just finished reading this amazing book and ...
3,829814000,Wonderful Worship in Smaller Churches,19.4,A3OQWLU31BU1Y,Maxwell Grant,1/1,5.0,1222560000,Small church pastor? This is the book on worship,I hadn't been a small church pastor very long ...
4,595344550,Whispers of the Wicked Saints,10.95,A3Q12RK71N74LB,Book Reader,7/11,1.0,1117065600,not good,I bought this book because I read some glowing...


In [6]:
# Sentiment data: keep only the review score and the review text.
sentiment_columns = ['review/score', 'review/text']
df1 = df[sentiment_columns]
df1.head()

Unnamed: 0,review/score,review/text
0,5.0,"I just finished the book, &quot;Wonderful Wors..."
1,5.0,Many small churches feel like they can not hav...
2,5.0,I just finished reading this amazing book and ...
3,5.0,I hadn't been a small church pastor very long ...
4,1.0,I bought this book because I read some glowing...


In [13]:
# Boolean masks selecting the 1-star and the 5-star reviews.
one_star_mask = df1['review/score'] == 1.0
five_star_mask = df1['review/score'] == 5.0

df_one_star_reviews = df1[one_star_mask]    # 1 star reviews only
df_five_star_reviews = df1[five_star_mask]  # 5 star reviews only

# Create train, val and test splits from each dataset separately, then combine the
# per-dataset splits into one single train, val and test set.
def split_combine_df(list_df, train_size, test_size, val_size, random_state=42):
    """Split each dataframe into disjoint train/val/test parts and concatenate them.

    Every dataframe in ``list_df`` contributes an equal share of each split: the
    requested sizes are divided by ``len(list_df)`` and truncated to an integer.

    Parameters
    ----------
    list_df : list of pandas.DataFrame
        Source dataframes to draw rows from.
    train_size, test_size, val_size : int
        Total number of rows wanted in the combined train, test and val sets.
    random_state : int, default 42
        Seed passed to both ``train_test_split`` calls so the splits are reproducible.

    Returns
    -------
    list of pandas.DataFrame
        ``[train_df, val_df, test_df]``, each with a fresh 0-based index.

    Raises
    ------
    ValueError
        If ``list_df`` is empty. Errors raised by ``train_test_split`` for sizes a
        dataframe cannot satisfy are propagated unchanged.
    """
    if not list_df:
        raise ValueError("list_df must contain at least one dataframe")

    n_sources = len(list_df)
    per_source_train = int(train_size / n_sources)
    per_source_val = int(val_size / n_sources)
    per_source_test = int(test_size / n_sources)

    train_df_list = []
    val_df_list = []
    test_df_list = []

    for df in list_df:
        # First split: set the test rows aside and keep exactly as many remaining
        # rows as train and val need together.
        train_val_df, test_df = train_test_split(
            df,
            test_size=per_source_test,
            train_size=per_source_train + per_source_val,
            random_state=random_state,
            shuffle=True,
        )
        # Second split: carve val out of the train+val remainder (NOT out of the
        # original dataframe), so train, val and test never share a row.
        train_df, val_df = train_test_split(
            train_val_df,
            test_size=per_source_val,
            train_size=per_source_train,
            random_state=random_state,
            shuffle=True,
        )
        train_df_list.append(train_df)
        val_df_list.append(val_df)
        test_df_list.append(test_df)

    # Stack the per-source pieces and renumber rows so each set has a clean index.
    train_df = pd.concat(train_df_list).reset_index(drop=True)
    val_df = pd.concat(val_df_list).reset_index(drop=True)
    test_df = pd.concat(test_df_list).reset_index(drop=True)

    return [train_df, val_df, test_df]

# Build the sentiment splits from the 1-star and 5-star reviews;
# the result is ordered [train, val, test].
sentiment_list_df = split_combine_df(
    [df_one_star_reviews, df_five_star_reviews],
    train_size=4000,
    test_size=1000,
    val_size=1000,
)
print(*(split.shape for split in sentiment_list_df))

(4000, 2) (1000, 2) (1000, 2)


In [14]:
# Write the sentiment splits to tab-separated files, in [train, val, test] order.
sentiment_csv_paths = [
    'data/Dataset1/train_sentiment.csv',
    'data/Dataset1/val_sentiment.csv',
    'data/Dataset1/test_sentiment.csv',
]
for split_df, csv_path in zip(sentiment_list_df, sentiment_csv_paths):
    split_df.to_csv(csv_path, sep='\t')

In [15]:
# Summary data: keep only the review summary and the review text.
summary_columns = ['review/summary', 'review/text']
df2 = df[summary_columns]
df2.head()

Unnamed: 0,review/summary,review/text
0,Outstanding Resource for Small Church Pastors,"I just finished the book, &quot;Wonderful Wors..."
1,Small Churches CAN Have Wonderful Worship,Many small churches feel like they can not hav...
2,Not Just for Pastors!,I just finished reading this amazing book and ...
3,Small church pastor? This is the book on worship,I hadn't been a small church pastor very long ...
4,not good,I bought this book because I read some glowing...


In [16]:
# Remove rows whose summary has fewer than 5 space-separated words, as such short
# summaries likely carry less semantic significance.
# The word counts are computed for the whole column at once and applied as a boolean
# mask: this avoids a slow per-row Python loop and does not rely on the index labels
# matching the row positions.
# NOTE: a non-string summary yields a NaN count and is dropped by the mask.
summary_word_counts = df2['review/summary'].str.split(' ').str.len()
df2 = df2[summary_word_counts >= 5].reset_index(drop=True)
df2.head()

Unnamed: 0,review/summary,review/text
0,Outstanding Resource for Small Church Pastors,"I just finished the book, &quot;Wonderful Wors..."
1,Small Churches CAN Have Wonderful Worship,Many small churches feel like they can not hav...
2,Small church pastor? This is the book on worship,I hadn't been a small church pastor very long ...
3,Five stars it not enough,I thought this book was brilliant. The plot wa...
4,Whispers of the Wicked Saints,This was a easy to read book that made me want...


In [None]:
# Build the summary splits from the filtered summaries;
# the result is ordered [train, val, test].
summary_list_df = split_combine_df(
    [df2],
    train_size=4000,
    test_size=1000,
    val_size=1000,
)
print(*(split.shape for split in summary_list_df))

(4000, 2) (1000, 2) (1000, 2)


In [18]:
# Write the summary splits to tab-separated files, in [train, val, test] order.
summary_csv_paths = [
    'data/Dataset1/train_summary.csv',
    'data/Dataset1/val_summary.csv',
    'data/Dataset1/test_summary.csv',
]
for split_df, csv_path in zip(summary_list_df, summary_csv_paths):
    split_df.to_csv(csv_path, sep='\t')