<a href="https://colab.research.google.com/github/Adjouro/DTSA-5798-Final-Project/blob/main/DTSA5798FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1: Install the packages needed

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Install ktrain
try:
  import ktrain
  from ktrain import text as ktext
except:
  !pip install ktrain
  os.kill(os.getpid(), 9)

# Import TensorFlow
try:
  import tensorflow as tf
except:
  !pip install tensorflow
  os.kill(os.getpid(), 9)

# Section 2: Mount Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Section 3: Check if GPU is available

In [None]:
#change the runtime to use the GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Thu Dec 14 18:09:23 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Section 4: Load the data from the provided URL

In [None]:
# Load the news-category training data (JSON) straight into a DataFrame.
# The link is a signed URL: its query string carries `Expires=1702684800`,
# `Signature=...` and `Key-Pair-Id=...`.
# NOTE(review): if `Expires` is a Unix timestamp (presumably 2023-12-16 UTC),
# this link has lapsed and the cell will fail on a fresh run. Replace it with
# a stable copy of the file before re-running the notebook.
url = 'https://d3c33hcgiwev3.cloudfront.net/y__VHyJ0SgCIcxd9tNhazg_695452701c3c435691238d1b1c0016f1_news_category_trainingdata.json?Expires=1702684800&Signature=joXkovXj-uL3~~ARx0osVhMpdHGsas~IPsWKZBwMlGRQUyhUIy3OB5gx~vFHHGMJ2bjRWpt-a4rTO8wDGfLc75EdTYG9g~tWcDE7Fmd33svrD5cPoNFLGh-rc1HCW5GQI7XKD8SkxlBdtNgUNpXgGjw8dYpSh-KBSbFkQHB2U4Q_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A'
df = pd.read_json(url)

In [None]:
# Preview the first rows of the raw data (category, headline, authors, link,
# short_description, date).
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26


# Section 5: Preprocess the data

In [None]:
# Build the model input: headline and short_description joined by one space.
# Missing values are replaced with '' first, so a single absent field does not
# turn the whole combined text into NaN.
df['combined_text'] = df['headline'].fillna('') + ' ' + df['short_description'].fillna('')

# Binary target: 1 when the category contains 'WELLNESS' or 'HEALTHY LIVING',
# 0 for every other article. This is a vectorised substring match (regex
# alternation); na=False maps a missing category to 0 instead of raising.
df['is_HW'] = df['category'].str.contains('WELLNESS|HEALTHY LIVING', na=False).astype(int)

In [None]:
# Preview the data again to check the new combined_text and is_HW columns.
df.head()

Unnamed: 0,category,headline,authors,link,short_description,date,combined_text,is_HW
0,CRIME,There Were 2 Mass Shootings In Texas Last Week...,Melissa Jeltsen,https://www.huffingtonpost.com/entry/texas-ama...,She left her husband. He killed their children...,2018-05-26,There Were 2 Mass Shootings In Texas Last Week...,0
1,ENTERTAINMENT,Will Smith Joins Diplo And Nicky Jam For The 2...,Andy McDonald,https://www.huffingtonpost.com/entry/will-smit...,Of course it has a song.,2018-05-26,Will Smith Joins Diplo And Nicky Jam For The 2...,0
2,ENTERTAINMENT,Hugh Grant Marries For The First Time At Age 57,Ron Dicker,https://www.huffingtonpost.com/entry/hugh-gran...,The actor and his longtime girlfriend Anna Ebe...,2018-05-26,Hugh Grant Marries For The First Time At Age 5...,0
3,ENTERTAINMENT,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,Ron Dicker,https://www.huffingtonpost.com/entry/jim-carre...,The actor gives Dems an ass-kicking for not fi...,2018-05-26,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,0
4,ENTERTAINMENT,Julianna Margulies Uses Donald Trump Poop Bags...,Ron Dicker,https://www.huffingtonpost.com/entry/julianna-...,"The ""Dietland"" actress said using the bags is ...",2018-05-26,Julianna Margulies Uses Donald Trump Poop Bags...,0


# Section 6: Sample the data to balance classes

In [None]:
# Draw the same number of rows from each class (fixed seed), then stack them:
# positives (is_HW == 1) first, negatives (is_HW == 0) second.
sample_amount = 2000
HW, not_HW = (
    df.loc[df['is_HW'] == label].sample(n=sample_amount, random_state=1)
    for label in (1, 0)
)
review_sample = pd.concat([HW, not_HW])

In [None]:
# Summary statistics of the balanced sample; with equal class sizes the mean
# of is_HW should be 0.5.
review_sample.describe()

Unnamed: 0,is_HW
count,4000.0
mean,0.5
std,0.500063
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


# Section 7: Install the transformers package and import necessary libraries

In [None]:
try:
  from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
except:
  !pip install transformers
  os.kill(os.getpid(), 9)

# Section 8: Select one of the models and load the tokenizer and model

In [None]:
# distilbert-base-uncased was chosen as a compromise between size and performance.
# Loads its tokenizer and a 2-label TensorFlow sequence-classification model.
# NOTE(review): `model` is reassigned in Section 10 by
# ktext.text_classifier('bert', ...), and `tokenizer` is not referenced again in
# the visible notebook, so the model that is trained appears to be ktrain's
# 'bert' classifier rather than this DistilBERT one. Confirm which is intended.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

# Section 9: Tokenize and preprocess the input texts for ktrain

In [None]:
# Section 9: Tokenize and preprocess the input texts for ktrain
# Splits review_sample into training and validation sets (20% validation) and
# applies ktrain's 'bert' preprocessing to the combined_text column.
# random_state pins the train/validation split so reruns are reproducible.
(train_data, val_data, preproc) = ktext.texts_from_df(train_df=review_sample,
                                                      text_column='combined_text',
                                                      label_columns=['is_HW'],
                                                      maxlen=512,
                                                      preprocess_mode='bert',
                                                      val_pct=0.2, # 20% for validation
                                                      ngram_range=1,
                                                      random_state=1)


['not_is_HW', 'is_HW']
        not_is_HW  is_HW
171757        1.0    0.0
145157        0.0    1.0
125776        0.0    1.0
168102        0.0    1.0
183337        0.0    1.0
['not_is_HW', 'is_HW']
        not_is_HW  is_HW
177167        0.0    1.0
43059         1.0    0.0
156466        0.0    1.0
186220        0.0    1.0
29335         1.0    0.0
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


# Section 10: Create a ktrain learner with a BERT model

In [None]:
# Build ktrain's 'bert' text classifier from the preprocessed training data.
BATCH_SIZE = 8
model = ktext.text_classifier(name='bert', train_data=train_data, preproc=preproc)

# Wrap the model and its training/validation data in a ktrain Learner.
learner = ktrain.get_learner(
    model,
    train_data=train_data,
    val_data=val_data,
    batch_size=BATCH_SIZE,
)


Is Multi-Label? False
maxlen is 512




done.


# Section 11: Train the model using ktrain's Learner object

In [None]:
# Section 11: Train the model using ktrain's Learner object
# One-cycle policy with a maximum learning rate of 5e-5, run for 3 epochs.
MAX_LR = 5e-5
EPOCHS = 3
learner.fit_onecycle(lr=MAX_LR, epochs=EPOCHS)




begin training using onecycle policy with max lr of 5e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7a8c1c599810>

# Section 12: Evaluate the model and print the validation report using ktrain's Learner object

In [None]:
# Human-readable class names for the validation report.
# Presumably the order matches ['not_is_HW', 'is_HW'] printed during preprocessing; confirm.
REPORT_CLASS_NAMES = ['NOT HEALTHY LIVING OR WELLNESS', 'HEALTHY LIVING OR WELLNESS']
validation = learner.validate(val_data=val_data, class_names=REPORT_CLASS_NAMES)

                                precision    recall  f1-score   support

NOT HEALTHY LIVING OR WELLNESS       0.90      0.89      0.90       388
    HEALTHY LIVING OR WELLNESS       0.90      0.91      0.90       412

                      accuracy                           0.90       800
                     macro avg       0.90      0.90      0.90       800
                  weighted avg       0.90      0.90      0.90       800

