## Dataset: review - author

In [1]:
import pandas as pd

In [2]:
# Load the raw review table, keep only the reviewer ID, movie ID and
# 'target' columns, and drop every row with a missing value in any of them.
raw_reviews_path = '../../../Data/appdata/rotten/users_output.csv'
users_output_df = (
    pd.read_csv(raw_reviews_path, index_col=0)
    .loc[:, ['userID', 'movieID', 'target']]
    .dropna()
)
users_output_df.head()

Unnamed: 0,userID,movieID,target
0,u/Andrew_L._Urban,m/0814255,A fantasy adventure that fuses Greek mythology...
1,u/Louise_Keller,m/0814255,"Uma Thurman as Medusa, the gorgon with a coiff..."
2,u/nan,m/0814255,With a top-notch cast and dazzling special eff...
3,u/Ben_McEachen,m/0814255,Whether audiences will get behind The Lightnin...
4,u/Ethan_Alter,m/0814255,What's really lacking in The Lightning Thief i...


In [3]:
# Sanity check: number of missing values remaining in each column.
users_output_df.isnull().sum()

userID     0
movieID    0
target     0
dtype: int64

In [4]:
# Count the reviews written by each reviewer, then report how many reviewers
# have at least 2000 of them.
# NOTE(review): the placeholder ID 'u/nan' is not filtered out before this
# point, so it is presumably included in this count -- confirm.
occurrences = users_output_df['userID'].value_counts()
len(occurrences[occurrences >= 2000])

81

In [5]:
# Keep only the reviews written by reviewers with at least 2000 reviews, and
# discard the placeholder reviewer ID 'u/nan'.
prolific_users = occurrences[occurrences >= 2_000].index
reviewer_ids = users_output_df['userID']
keep_rows = reviewer_ids.isin(prolific_users) & (reviewer_ids != 'u/nan')
users_output_df = users_output_df[keep_rows]
users_output_df.head()

Unnamed: 0,userID,movieID,target
6,u/Nick_Schager,m/0814255,Harry Potter knockoffs don't come more transpa...
11,u/Roger_Moore,m/0814255,"For what it is and for whom it is intended, it..."
16,u/MaryAnn_Johanson,m/0814255,"[P]erfectly inoffensive, occasionally clever....."
17,u/Mark_Dujsik,m/0814255,"Admirably, the movie isn't bogged down in the ..."
18,u/Nell_Minow,m/0814255,The main thing this movie seems to be missing ...


In [6]:
# List the distinct reviewer IDs that remain in the filtered table.
print(users_output_df['userID'].unique())

['u/Nick_Schager' 'u/Roger_Moore' 'u/MaryAnn_Johanson' 'u/Mark_Dujsik'
 'u/Nell_Minow' 'u/Dustin_Putman' 'u/Jeffrey_M._Anderson'
 'u/Kenneth_Turan' 'u/Moira_MacDonald' 'u/Stephen_Whitty' 'u/Jeff_Vice'
 'u/Wesley_Morris' 'u/Cole_Smithey' 'u/Michael_Phillips' 'u/Matt_Pais'
 'u/Roger_Ebert' 'u/Rich_Cline' 'u/Matthew_Turner' 'u/Walter_Chaw'
 'u/Tom_Long' 'u/Brian_Orndorf' 'u/Jim_Judy' 'u/Lou_Lumenick'
 'u/Frank_Swietek' 'u/Chris_Hewitt' 'u/Sara_Michelle_Fetters'
 'u/Peter_Bradshaw' 'u/David_Nusair' 'u/Fred_Topel' 'u/Robin_Clifford'
 'u/A.O._Scott' 'u/Susan_Granger' 'u/Robert_Roten' 'u/John_Beifuss'
 'u/Cynthia_Fuchs' 'u/Michael_Dequina' 'u/Peter_Canavese' 'u/Kevin_Carr'
 'u/Richard_Propes' 'u/Harvey_S._Karten' 'u/Emanuel_Levy'
 'u/Frederic_and_Mary_Ann_Brussat' 'u/Claudia_Puig' 'u/Laura_Clifford'
 'u/Richard_Roeper' 'u/Kyle_Smith' 'u/Ty_Burr' 'u/Josh_Larsen'
 'u/J._R._Jones' 'u/Peter_Rainer' 'u/Mick_LaSalle' 'u/Peter_Howell'
 'u/Tim_Brayton' 'u/Peter_Travers' 'u/Josh_Bell' 'u/Ken_Hanke'
 '

In [7]:
# Build per-reviewer train / test / eval splits.
#
# Each reviewer contributes the same number of reviews, drawn WITHOUT
# replacement: sampling with replacement can pick the same row several times,
# which duplicates examples and lets a training review reappear in the test or
# eval split (data leakage).
SEED = 42  # fixed seed so a fresh "Restart & Run All" regenerates the same splits

train_size = 1600
test_size = 200
eval_size = 200
samples_per_user = train_size + test_size + eval_size
test_end = train_size + test_size  # index where the test slice ends and eval begins

# Parallel lists: review text, movie ID and reviewer label for each split.
train_set, movies_train_set, train_label = [], [], []
test_set, movies_test_set, test_label = [], [], []
eval_set, movies_eval_set, eval_label = [], [], []

# A single groupby pass replaces one full-table boolean scan per reviewer;
# sort=False keeps reviewers in order of first appearance, like .unique().
for u, user_df in users_output_df.groupby('userID', sort=False):
    # .sample raises ValueError if a reviewer has fewer than samples_per_user
    # rows, so an inconsistent threshold fails loudly instead of silently
    # duplicating reviews.
    sample_df = user_df.sample(n=samples_per_user, replace=False, random_state=SEED)
    movies = sample_df['movieID'].tolist()
    examples = sample_df['target'].tolist()

    train_set.extend(examples[:train_size])
    test_set.extend(examples[train_size:test_end])
    eval_set.extend(examples[test_end:])

    train_label.extend([u] * train_size)
    test_label.extend([u] * test_size)
    eval_label.extend([u] * eval_size)

    movies_train_set.extend(movies[:train_size])
    movies_test_set.extend(movies[train_size:test_end])
    movies_eval_set.extend(movies[test_end:])

In [8]:
# The three parallel training lists are expected to have the same length.
tuple(len(part) for part in (train_set, train_label, movies_train_set))

(128000, 128000, 128000)

In [9]:
# Preview the first five training review texts.
train_set[0:5]

["Remember: CPR doesn't properly work on women unless their cleavage is visible.",
 "[The film's] intellectualized sexuality stirs neither the head nor the nether regions.",
 'One-note insider navel-gazing with no aspirations except to excite its adoring base.',
 'Deliver Us From Evil proceeds with a sober clarity that lends credence to its devastating case.',
 "An illuminating examination of [a] still-ongoing conflict, albeit one unlikely to sway those who've already chosen a side in this polarizing debate."]

In [10]:
# Preview the reviewer labels that go with the first five training texts.
train_label[0:5]

['u/Nick_Schager',
 'u/Nick_Schager',
 'u/Nick_Schager',
 'u/Nick_Schager',
 'u/Nick_Schager']

In [11]:
# Preview the movie IDs that go with the first five training texts.
movies_train_set[0:5]

['m/cave',
 'm/mister_foe',
 'm/1196557-my_name_is_bruce',
 'm/deliver_us_from_evil',
 'm/a_whale_of_a_tale_2018']

In [12]:
# The three parallel test lists are expected to have the same length.
tuple(len(part) for part in (test_set, test_label, movies_test_set))

(16000, 16000, 16000)

In [13]:
# The three parallel eval lists are expected to have the same length.
tuple(len(part) for part in (eval_set, eval_label, movies_eval_set))

(16000, 16000, 16000)

In [14]:
# Number of distinct reviewers kept after filtering (the number of classes).
# nunique() ignores NaN by default; missing values were already dropped when
# the table was loaded, so this equals the length of .unique().
users_output_df['userID'].nunique()

80

In [15]:
# Authorship task: each row pairs a review text with the ID of its reviewer.
def _text_label_frame(texts, labels):
    """Return a DataFrame with columns 'text' and 'label' built from two parallel lists."""
    return pd.DataFrame({'text': texts, 'label': labels})


train_df = _text_label_frame(train_set, train_label)
test_df = _text_label_frame(test_set, test_label)
eval_df = _text_label_frame(eval_set, eval_label)

In [16]:
# Show the first five rows of the authorship training frame (head() defaults to 5).
train_df.head()

Unnamed: 0,text,label
0,Remember: CPR doesn't properly work on women u...,u/Nick_Schager
1,[The film's] intellectualized sexuality stirs ...,u/Nick_Schager
2,One-note insider navel-gazing with no aspirati...,u/Nick_Schager
3,Deliver Us From Evil proceeds with a sober cla...,u/Nick_Schager
4,An illuminating examination of [a] still-ongoi...,u/Nick_Schager


In [17]:
# (rows, columns) of the train, test and eval authorship frames.
tuple(frame.shape for frame in (train_df, test_df, eval_df))

((128000, 2), (16000, 2), (16000, 2))

In [18]:
# Write the three authorship splits to CSV in the data directory.
# to_csv is called with its defaults, so the row index is written as the
# first (unnamed) column.
dt_dir = '../../../Data/appdata/rotten/'
authorship_outputs = [
    (train_df, 'author80_train.csv'),
    (test_df, 'author80_test.csv'),
    (eval_df, 'author80_eval.csv'),
]
for split_df, file_name in authorship_outputs:
    split_df.to_csv(dt_dir + file_name)

In [19]:
# Personalized dataset task: same texts and reviewer labels as before, plus
# the ID of the reviewed movie.
# NOTE: this rebinds train_df / test_df / eval_df to the three-column frames.
def _text_movie_label_frame(texts, movie_ids, labels):
    """Return a DataFrame with columns 'text', 'movieID' and 'label' built from three parallel lists."""
    return pd.DataFrame({'text': texts, 'movieID': movie_ids, 'label': labels})


train_df = _text_movie_label_frame(train_set, movies_train_set, train_label)
test_df = _text_movie_label_frame(test_set, movies_test_set, test_label)
eval_df = _text_movie_label_frame(eval_set, movies_eval_set, eval_label)

In [20]:
# (rows, columns) of the train, test and eval personalized frames.
tuple(frame.shape for frame in (train_df, test_df, eval_df))

((128000, 3), (16000, 3), (16000, 3))

In [21]:
# Show the first five rows of the personalized training frame (head() defaults to 5).
train_df.head()

Unnamed: 0,text,movieID,label
0,Remember: CPR doesn't properly work on women u...,m/cave,u/Nick_Schager
1,[The film's] intellectualized sexuality stirs ...,m/mister_foe,u/Nick_Schager
2,One-note insider navel-gazing with no aspirati...,m/1196557-my_name_is_bruce,u/Nick_Schager
3,Deliver Us From Evil proceeds with a sober cla...,m/deliver_us_from_evil,u/Nick_Schager
4,An illuminating examination of [a] still-ongoi...,m/a_whale_of_a_tale_2018,u/Nick_Schager


In [22]:
# Write the three personalized splits to CSV in the data directory.
# to_csv is called with its defaults, so the row index is written as the
# first (unnamed) column.
dt_dir = '../../../Data/appdata/rotten/'
personalized_outputs = [
    (train_df, 'rotten_reviews80_train.csv'),
    (test_df, 'rotten_reviews80_test.csv'),
    (eval_df, 'rotten_reviews80_eval.csv'),
]
for split_df, file_name in personalized_outputs:
    split_df.to_csv(dt_dir + file_name)