In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [34]:
df = pd.read_csv("data/Dataset1/Books_rating.csv")
df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [35]:
df1 = df[['review/score', 'review/text']]
df1.head()

Unnamed: 0,review/score,review/text
0,4.0,This is only for Julie Strain fans. It's a col...
1,5.0,I don't care much for Dr. Seuss but after read...
2,5.0,"If people become the books they read and if ""t..."
3,4.0,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,4.0,Philip Nel - Dr. Seuss: American IconThis is b...


In [36]:
df_one_star_reviews = df1.loc[df1['review/score'] == 1.0].copy() # extract 1 star reviews
df_five_star_reviews = df1.loc[df1['review/score'] == 5.0].copy() # extract 5 star reviews

# map 1.0 star to 0 for NEGATIVE, 5.0 star to 1 for POSITIVE
df_one_star_reviews['review/score'] = df_one_star_reviews['review/score'].map({1.0: 0, 5.0: 1})
df_five_star_reviews['review/score'] = df_five_star_reviews['review/score'].map({1.0: 0, 5.0: 1})

# create train, val, test splits from separate datasets and combine each split into 1 single train, val or test set
# this function accepts a list of dataframes and the integer number of train, test, val in the final set
def combine_df(list_df, train_size, test_size, val_size):
    
    train_df_list = []
    val_df_list = []
    test_df_list = []
    
    for df in list_df:
        train_df, test_df = train_test_split(df, test_size=int(test_size/len(list_df)), train_size=int((train_size+val_size)/len(list_df)), 
                                             random_state=42, shuffle=True)
        train_df, val_df = train_test_split(df, test_size=int(val_size/len(list_df)), train_size=int(train_size/len(list_df)), 
                                            random_state=42, shuffle=True)
        train_df_list.append(train_df)
        val_df_list.append(val_df)
        test_df_list.append(test_df)
        
    train_df = pd.concat(train_df_list)
    train_df = train_df.reset_index(drop=True)
    val_df = pd.concat(val_df_list)
    val_df = val_df.reset_index(drop=True)
    test_df = pd.concat(test_df_list)
    test_df = test_df.reset_index(drop=True)
    
    return [train_df, val_df, test_df]

sentiment_list_df = combine_df([df_one_star_reviews, df_five_star_reviews], 4000, 1000, 1000) 
print(sentiment_list_df[0].shape, sentiment_list_df[1].shape, sentiment_list_df[2].shape)

(4000, 2) (1000, 2) (1000, 2)


In [37]:
# save the data to csv
sentiment_list_df[0].to_csv('data/Dataset1/train_sentiment.csv', sep='\t')
sentiment_list_df[1].to_csv('data/Dataset1/val_sentiment.csv', sep='\t')
sentiment_list_df[2].to_csv('data/Dataset1/test_sentiment.csv', sep='\t')

In [38]:
df2 = df[['review/summary', 'review/text']]
df2.head()

Unnamed: 0,review/summary,review/text
0,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [39]:
# remove rows with summaries of less than 5 words as they more likely carry less semantic significance
to_drop = []
data_size = df2.shape[0]
for i in range(data_size):
    if len(df2.iloc[i, 0].split(' ')) < 5:
       to_drop.append(i)

df2 = df2.drop(to_drop)
df2 = df2.reset_index(drop=True)
df2.head()

AttributeError: 'float' object has no attribute 'split'

In [None]:
summary_list_df = split_combine_df([df2], 4000, 1000, 1000)
print(summary_list_df[0].shape, summary_list_df[1].shape, summary_list_df[2].shape)

(4000, 2) (1000, 2) (1000, 2)


In [18]:
# save the data to csv
summary_list_df[0].to_csv('data/Dataset1/train_summary.csv', sep='\t')
summary_list_df[1].to_csv('data/Dataset1/val_summary.csv', sep='\t')
summary_list_df[2].to_csv('data/Dataset1/test_summary.csv', sep='\t')