<a href="https://colab.research.google.com/github/01PrathamS/DCAI_Practice/blob/main/dcai_lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn import clone

from sklearn.naive_bayes import MultinomialNB

In [3]:
train = pd.read_csv('reviews_train.csv')
test = pd.read_csv('reviews_test.csv')

train.head()

Unnamed: 0,review,label
0,Based on all the negative comments about Taste...,good
1,I still have not received this. Obviously I c...,bad
2,</tr>The magazine is not worth the cost of sub...,good
3,This magazine is basically ads. Kindve worthle...,bad
4,"The only thing I've recieved, so far, is the b...",bad


## Training a Baseline Model

In [5]:
sgd_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

In [7]:
sgd_clf.fit(train['review'], train['label'])

## Evaluating model accuracy

In [9]:
def evaluate(clf):
  pred = clf.predict(test['review'])
  acc = metrics.accuracy_score(test['label'], pred)
  print(f'Accuracy: {100*acc:.1f}%')

In [10]:
evaluate(sgd_clf)

Accuracy: 76.2%


## Trying a Different Model

In [13]:
nb_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [14]:
nb_clf.fit(train['review'], train['label'])

evaluate(nb_clf)

Accuracy: 85.3%


## Taking a closer look at the data

In [16]:
for i in range(10, 21):
  print(train.iloc[i].to_dict())

{'review': "</div>It's not the fault of the magazine, I just didn't realize that it was all about the decor of the very wealthy...was hoping for a somewhat broader demographic of homes.<li>replaceWith", 'label': 'good'}
{'review': '<li>dispatchEventBest magazine for current and easy recipes</div>', 'label': 'bad'}
{'review': '<li>onEmptiedBoth my husband and I really enjoy this magazine and read it monthly. We ready it cover to cover each month.<th>verbose</th>', 'label': 'bad'}
{'review': 'This magazine is filled with amazing recipes.  Every month is an inspiration.</div>', 'label': 'bad'}
{'review': 'I love this magazine. I read it cover to cover, very informative.\nto many words required for feedback, filler words for feed back', 'label': 'good'}
{'review': 'She loves this magazine.  She is a 10 year old avid reader  who I have purchased this for since she was 8 or so.  it is an occasion, I guess, when it arrives in the mail.', 'label': 'good'}
{'review': "Magazine is quite nice jus

In [17]:
train.shape

(6666, 2)

### It looks like there's some HTML tags in the dataset, HTML wasn't correctly parsed in all cases.

In [23]:
def is_bad_data(review: str) -> bool:
  html_tags = html_tags = [
    '<br>',
    '<tr>',
    '</tr>',
    '<body>',
    '</body>',
    '<a>',
    '<abbr>',
    '<address>',
    '<article>',
    '<aside>',
    '<audio>',
    '<b>',
    '<blockquote>',
    '<canvas>',
    '<center>',
    '<code>',
    '<div>',
    '<em>',
    '<fieldset>',
    '<figure>',
    '<footer>',
    '<h1>',
    '<h2>',
    '<h3>',
    '<h4>',
    '<h5>',
    '<h6>',
    '<head>',
    '<hr>',
    '<html>',
    '<img>',
    '<input>',
    '<li>',
    '<link>',
    '<ol>',
    '<p>',
    '<pre>',
    '<script>',
    '<section>',
    '<select>',
    '<style>',
    '<table>',
    '<tbody>',
    '<td>',
    '<th>',
    '<title>',
    '<ul>',
    '<var>',
    '<video>',
]
  for tag in html_tags:
    if tag in review:
      return True
  return False

clean_train = train[~train['review'].apply(is_bad_data)]
clean_train.shape

(5761, 2)

In [24]:
sgd_clf_clean = clone(sgd_clf)

In [25]:
_ = sgd_clf_clean.fit(clean_train['review'], clean_train['label'])

In [26]:
evaluate(sgd_clf_clean)

Accuracy: 84.6%


## Advanced: Training a Transformer model with HuggingFace

In [28]:
!pip install torch transformers datasets
!pip install accelerate -U

Collecting transformers
  Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Download

In [29]:
import numpy as np
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import datasets
from datasets import Dataset, DatasetDict, ClassLabel

In [30]:
label_map = {'bad':0, 'good':1}
dataset_train = Dataset.from_dict({"label": train["label"].map(label_map), "text": train["review"].values})
dataset_test = Dataset.from_dict({"label": test["label"].map(label_map), "text": test["review"].values})

In [35]:
model_name = 'distilbert-base-uncased'

max_training_steps = 10

model_folder = 'test_trainer'

In [42]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

train_tokenized_dataset = dataset_train.map(tokenize_function, batched=True)
train_tokenized_dataset = train_tokenized_dataset.cast_column("label", ClassLabel(names = ["0", "1"]))

test_tokenized_dataset = dataset_test.map(tokenize_function, batched=True)
test_tokenized_dataset = test_tokenized_dataset.cast_column("label", ClassLabel(names = ["0", "1"]))

training_args = TrainingArguments(max_steps=max_training_steps, output_dir=model_folder)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
)

In [42]:
trainer.train()

In [42]:
pred_probs = trainer.predict(test_tokenized_dataset).predictions
pred_classes = np.argmax(pred_probs, axis=1)
print(f"Error rate of predictions: {np.mean(pred_classes != test_tokenized_dataset['label'])}")