### **Import Libraries**

In [96]:
from google.colab import drive
import os
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk import download
import re
download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


### **Reading the Data**

In [84]:
path = '/content/drive/MyDrive/datasets/archive/train.ft.txt'
i = 0
reviews = []
labels = []
with open(path, 'r') as file:
  for line in file:
    if line[0:10] == '__label__2':
      labels.append(5)
    else:
      labels.append(1)
    reviews.append(line[10:])

In [90]:
reviews[205]

' smoothing serum: The product is wonderful for my hair which is very curly and tends to be frizzy. Smooths it and gets rid of allof the friz. Use it once a day, even in damp weather\n'

In [89]:
labels[205]

5

In [86]:
print(len(reviews))
print(len(labels))

3600000
3600000


In [103]:
path = '/content/drive/MyDrive/datasets/archive/test.ft.txt'
i = 0
test_reviews = []
test_labels = []
with open(path, 'r') as file:
  for line in file:
    if line[0:10] == '__label__2':
      test_labels.append(5)
    else:
      test_labels.append(1)
    test_reviews.append(line[10:])

In [104]:
print(len(test_reviews))
print(len(test_labels))

400000
400000


### **Preprocessing**

In [94]:
class Preprocessor:
  '''
    Removes punctuations, numbers, whitespaces
    Converts sentences into lower case
  '''

  def __init__(self, numbers=True, punct=True, stopwords=True, empty_sentence=True):
    self.numbers = numbers
    self.punct = punct
    self.stopwords = stopwords
    self.empty_sentence = empty_sentence

  def apply(self, sentence):
    sentence = sentence.lower()
    if self.numbers:
      sentence = Preprocessor.remove_numbers(sentence)

    if self.punct: 
      sentence = Preprocessor.remove_punct(sentence)

    if self.stopwords: 
      sentence = Preprocessor.remove_stopwords(sentence)

    if self.empty_sentence: 
      sentence = Preprocessor.empty_sentence(sentence)

    if sentence!=None:
        sentence = re.sub(r'\s+', ' ', sentence)

    return sentence
    
  @staticmethod
  def remove_punct(sentence):
    sentence = re.sub(r'[^\w\s]', '', sentence)
    return sentence
  
  @staticmethod
  def remove_numbers(sentence):
    sentence = re.sub(r'[0-9]', '', sentence)
    return sentence

  @staticmethod
  def remove_stopwords(sentence):
    sentence_clean = ' '.join( [ word for word in sentence.split() if word.lower() not in set( stopwords.words('english') ) ] )
    return sentence_clean

  
  @staticmethod
  def empty_sentence(sentence):
    words=sentence.split()
    if (not all(elem == "" for elem in sentence)) and len(sentence)>2:
        return sentence
    else:
        return None

In [95]:
processor = Preprocessor()
clean_sentence = processor.apply(reviews[205])
print(f'{clean_sentence}')

smoothing serum product wonderful hair curly tends frizzy smooths gets rid allof friz use day even damp weather


In [100]:
cleaned_sents = []
for sent in tqdm(reviews[:30000]):
  cleaned_sents.append(processor.apply(sent))

100%|██████████| 30000/30000 [04:50<00:00, 103.16it/s]


In [102]:
cleaned_sents[101]

'great book historical romance lovers engaging count life tess girl young age washed shore isle may memory life stays old caretakers isle caretakers dead young man colin macpherson washes shore colin takes tess back castle helps uncover past oppositesbut know say opposites attract book one many avon true romance series read every book series know like book love rest great book girls still romance novel call trashy romance novel great novel fokr getting romance novels'

In [105]:
cleaned_test_sents = []
for sent in tqdm(test_reviews[:20000]):
  cleaned_test_sents.append(processor.apply(sent))

100%|██████████| 20000/20000 [03:11<00:00, 104.32it/s]


In [106]:
train_data = pd.DataFrame({'review': cleaned_sents, 'sentiment': labels[: 30000]})

In [107]:
test_data = pd.DataFrame({'review': cleaned_test_sents, 'sentiment': test_labels[:20000]})

In [109]:
train_data.head(15)

Unnamed: 0,review,sentiment
0,stuning even nongamer sound track beautiful pa...,5
1,best soundtrack ever anything im reading lot r...,5
2,amazing soundtrack favorite music time hands i...,5
3,excellent soundtrack truly like soundtrack enj...,5
4,remember pull jaw floor hearing youve played g...,5
5,absolute masterpiece quite sure actually takin...,5
6,buyer beware selfpublished book want know whyr...,1
7,glorious story loved whisper wicked saints sto...,5
8,five star book finished reading whisper wicked...,5
9,whispers wicked saints easy read book made wan...,5


In [110]:
test_data.head(10)

Unnamed: 0,review,sentiment
0,great cd lovely pat one great voices generatio...,5
1,one best game music soundtracks game didnt rea...,5
2,batteries died within year bought charger jul ...,1
3,works fine maha energy better check maha energ...,5
4,great nonaudiophile reviewed quite bit combo p...,5
5,dvd player crapped one year also began incorre...,1
6,incorrect disc love style couple years dvd giv...,1
7,dvd menu select problems cannot scroll dvd men...,1
8,unique weird orientalia exotic tales orient dr...,5
9,ultimate guide firstlyi enjoyed format tone bo...,1


### **Importing cleaned data in CSV** 

In [112]:
train_data.to_csv('/content/drive/MyDrive/data/train_preprocessed.csv')
#test_data.to_csv('test_preprocessed.csv')