In [73]:
 #importing the necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [75]:
def load_reviews(csv_path):
  """Read one review CSV and return a frame with `rating` and `text_review`.

  The file's columns are renamed to rating/title/review, rows with a missing
  value in any column are dropped, and title and review are joined with a
  single space into `text_review`. The title and review columns are then
  removed. The row index left over after `dropna` is kept (it is not reset).

  Raises ValueError (from pandas) if the file does not have exactly three
  columns.
  """
  # NOTE(review): read_csv treats the first row as a header by default, and that
  # header is then overwritten by the rename below. If the CSV files have no
  # header row, the first review of each file is silently consumed -- confirm
  # against the raw files and pass header=None if that is the case.
  return (
      pd.read_csv(csv_path)
      .set_axis(['rating', 'title', 'review'], axis=1)
      .dropna()  # drop rows with a null in any column
      .assign(text_review=lambda reviews: reviews['title'] + ' ' + reviews['review'])
      .drop(columns=['title', 'review'])
  )


# Both splits go through the identical pipeline, so the steps live in one helper
# instead of being repeated (and mutated in place) once per split.
train_df = load_reviews('train.csv')
test_df = load_reviews('test.csv')

In [77]:
# Row count per rating label in the full (cleaned) training frame.
train_df['rating'].value_counts() #the two classes are nearly 1:1

rating
2    1799912
1    1799880
Name: count, dtype: int64

In [79]:
# Stratified down-sampling that keeps the 1:1 class ratio intact:
#   1. draw 0.75% (frac=0.0075) of the rows of each rating class separately,
#   2. concatenate the two draws and shuffle them so the classes are interleaved.
# Note: df_class_1 holds the rating==2 rows and df_class_2 the rating==1 rows.

is_rating_2 = train_df['rating'] == 2
df_class_1 = train_df[is_rating_2].sample(frac=0.0075, random_state=42)

is_rating_1 = train_df['rating'] == 1
df_class_2 = train_df[is_rating_1].sample(frac=0.0075, random_state=42)

# sample(frac=1) returns every row in a new order; the old index is then discarded.
train_combined = pd.concat([df_class_1, df_class_2])
train_df_sampled = train_combined.sample(frac=1, random_state=42).reset_index(drop=True)

train_df_sampled.head()

Unnamed: 0,rating,text_review
0,2,Earcuff Product arived so quickly and I love i...
1,2,Hearts in Atlantis This is Stephen King at his...
2,2,The Faces of Gettysburg-Awesome! This is a won...
3,1,would have been better is not for marketing Th...
4,2,"wonderful film!!!! A wonderful film, just one ..."


In [81]:
# Share of each rating label in the sampled training frame (normalize=True gives fractions).
train_df_sampled['rating'].value_counts(normalize=True) #classes are balanced, matching the near 1:1 ratio of the full training data

rating
2    0.5
1    0.5
Name: proportion, dtype: float64

In [83]:
# Draw 0.75% (frac=0.0075) of each rating class of test_df separately, then
# concatenate the two draws and shuffle them so the classes are interleaved.
# Note: df_class_1 holds the rating==2 rows and df_class_2 the rating==1 rows.

is_rating_2 = test_df['rating'] == 2
df_class_1 = test_df[is_rating_2].sample(frac=0.0075, random_state=42)

is_rating_1 = test_df['rating'] == 1
df_class_2 = test_df[is_rating_1].sample(frac=0.0075, random_state=42)

# sample(frac=1) returns every row in a new order; the old index is then discarded.
test_combined = pd.concat([df_class_1, df_class_2])
test_df_sampled = test_combined.sample(frac=1, random_state=42).reset_index(drop=True)

test_df_sampled.head()

Unnamed: 0,rating,text_review
0,1,Beware..junk Cheaply made and rushed onto the ...
1,2,Great Insight My boss (self-made restaurant ow...
2,1,Design flaw renders unit unusable Before you b...
3,2,Good resource This is a good resource that ser...
4,1,Very Disappointed Considering Speedo's reputat...


In [85]:
# Share of each rating label in the sampled test frame (normalize=True gives fractions).
test_df_sampled['rating'].value_counts(normalize=True) #classes are balanced 50/50 in the sample

rating
1    0.5
2    0.5
Name: proportion, dtype: float64

In [87]:
# Number of rows in the sampled test frame.
print(len(test_df_sampled))

3000


In [89]:
from datasets import Dataset

# Wrap the sampled pandas frames as Hugging Face `Dataset` objects so the
# following cells can process them with `Dataset.map`.
dataset=Dataset.from_pandas(train_df_sampled) #training sample
dataset_test=Dataset.from_pandas(test_df_sampled) #test sample

In [91]:
import re
def remove_punc(batch):
  """Strip punctuation from every entry of batch['text_review'].

  Any character that is neither a regex word character (letters, digits,
  underscore) nor whitespace is deleted, so underscores are kept. The
  'text_review' entry of `batch` is overwritten with the cleaned list and
  the same `batch` object is returned.
  """
  non_word_or_space = re.compile(r'[^\w\s]')
  cleaned_reviews = [non_word_or_space.sub('', review) for review in batch['text_review']]
  batch['text_review'] = cleaned_reviews
  return batch

# Strip punctuation from both splits. With batched=True, remove_punc receives a
# batch whose 'text_review' entry is a list of strings rather than a single row.
# Both names are rebound to the mapped result, so re-running this cell operates
# on its own previous output.
dataset=dataset.map(remove_punc,batched=True)
dataset_test=dataset_test.map(remove_punc,batched=True)

Map:   0%|          | 0/26998 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [93]:
from transformers import AutoTokenizer

# `model` holds only the checkpoint name string here, not a model object.
model='bert-base-uncased' #checkpoint whose pre-trained BERT tokenizer is loaded

tokenizer=AutoTokenizer.from_pretrained(model)


In [95]:
def tokenize_text(batch):
  """Tokenize batch['text_review'] with the notebook-level `tokenizer`.

  Every review is truncated to at most 512 tokens and padded up to exactly
  512 (padding='max_length'), so all encoded rows have the same length.
  Returns whatever the tokenizer call returns for the batch.
  """
  encoding_options = dict(padding='max_length', max_length=512, truncation=True)
  return tokenizer(batch['text_review'], **encoding_options)

# Tokenize both splits; tokenize_text is called on batches of rows rather than
# one row at a time. Both names are rebound to the mapped result.
dataset=dataset.map(tokenize_text,batched=True) #batch tokenization
dataset_test=dataset_test.map(tokenize_text,batched=True)

Map:   0%|          | 0/26998 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [97]:
# Persist the cleaned and tokenized splits to directories relative to the
# current working directory.
dataset.save_to_disk('./dataset_train') #saving the preprocessed dataset to disk 
dataset_test.save_to_disk('./dataset_test')

Saving the dataset (0/1 shards):   0%|          | 0/26998 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [99]:
# Vocabulary size reported by the loaded tokenizer.
print(tokenizer.vocab_size)

30522
