In [1]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

In [3]:
SEED = 2505

In [4]:
def set_seeds(seed):
  '''Set seeds for reproducibility'''
  np.random.seed(seed)
  random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed) # multi-GPU

In [5]:
set_seeds(seed=SEED)

In [6]:
# set device
cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")

torch.set_default_tensor_type("torch.FloatTensor")
if device.type == 'cuda':
  torch.set_default_tensor_type("torch.cuda.FloatTensor")

print(device)

cpu


In [7]:
# load data
url = "https://raw.githubusercontent.com/GokuMohandas/MadeWithML/main/datasets/news.csv"
df = pd.read_csv(url, header=0)
df = df.sample(frac=1).reset_index(drop=True)
df.head()

Unnamed: 0,title,category
0,Yanks star swats sex-tortion bid,Sports
1,"Mideast, N. Africa terror warning set",World
2,Landed U.S.-Russian ISS Crew Savor Smell of Ea...,Sci/Tech
3,Pinochet's bad news birthday,World
4,"Olympics: Soldier Shot Dead, Policeman Held",Sports


In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
STOPWORDS = stopwords.words('english')
print(STOPWORDS[:5])

['i', 'me', 'my', 'myself', 'we']


In [10]:
porter = PorterStemmer()

In [11]:
def preprocess(text, stopwords=STOPWORDS):
  ''' Conditional preprocessing on our text unique to our task'''

  # lower
  text = text.lower()

  # remove stopwords
  pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
  text = pattern.sub("", text)

  # remove words in parenthesis
  text = re.sub(r"\([^)]*\)", "", text)

  # spacing and filters
  text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)  # separate punctuation tied to words
  text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
  text = re.sub(" +", " ", text)  # remove multiple spaces
  text = text.strip()

  return text


In [12]:
# sample
text = "Great week for the NYSE!"
preprocess(text)

'great week nyse'

In [13]:
# apply to dataframe
preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)
print(f'{df.title.values[1]} \n\n{preprocessed_df.title.values[1]}')




## Split data

In [14]:
import collections
from sklearn.model_selection import train_test_split

In [15]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [16]:
def train_val_test_split(X, y, TRAIN_SIZE):
  ''' Split data into data splits'''
  X_train, X_, y_train, y_ = train_test_split(X, y, train_size = TRAIN_SIZE, stratify=y)
  X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
  return X_train, X_val, X_test, y_train, y_val, y_test

In [17]:
# splitting 
X = preprocessed_df['title'].values
y = preprocessed_df['category'].values

In [18]:
# create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(X=X, y=y, 
                                                                      TRAIN_SIZE=TRAIN_SIZE)
print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[0]} → {y_train[0]}")

X_train: (84000,), y_train: (84000,)
X_val: (18000,), y_val: (18000,)
X_test: (18000,), y_test: (18000,)
Sample point: pope beatifies emperor nun three others → World


## Label Encoding

In [19]:
import itertools

In [20]:
from pandas.io import json
class LabelEncoder:
  ''' Label encoder for tag labels '''
  
  def __init__(self, class_to_index={}) -> None:
      self.class_to_index = class_to_index
      self.index_to_class = {v: k for k, v in self.class_to_index.items()}
      self.classes = list(self.class_to_index.keys())

  def __len__(self):
    return len(self.class_to_index)

  def __str__(self):
    return f'<LabelEncoder(num_class={len(self)})>'

  def fit(self, y):
    classes = np.unique(y)
    for i, class_ in enumerate(classes):
      self.class_to_index[class_] = i
    self.index_to_class = {v: k for k, v in self.class_to_index.items()}
    self.classes = list(self.class_to_index.keys())
    return self

  def encode(self, y):
    encoded = np.zeros((len(y)), dtype=int)
    for i, item in enumerate(y):
      encoded[i] = self.class_to_index[item]
    return encoded
  
  def decode(self, y):
    classes = []
    for _, item in enumerate(y):
      classes.append(self.index_to_class[item])
    return classes

  def save(self, fp):
    with open(fp, 'w'):
      contents = {'class_to_index': self.class_to_index}
      json.dumps(contents, fp, indent=4, sort_keys=False)

  @classmethod
  def load(cls, fp):
    with open(fp, 'r'):
      kwargs = json.load(fp=fp)
    return cls(**kwargs)

In [21]:
# encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
NUM_CLASSES = len(label_encoder)
label_encoder.class_to_index

{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}

In [22]:
NUM_CLASSES

4

In [23]:
# convert labels to tokens
print(f'y_train[0]: {y_train[0]}')
y_train = label_encoder.encode(y_train)
y_val = label_encoder.encode(y_val)
y_test = label_encoder.encode(y_test)
print (f"y_train[0]: {y_train[0]}")

y_train[0]: World
y_train[0]: 3


In [24]:
# class weights
counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i , count in enumerate(counts)}
print (f"counts: {counts}\nweights: {class_weights}")

counts: [21000 21000 21000 21000]
weights: {0: 4.761904761904762e-05, 1: 4.761904761904762e-05, 2: 4.761904761904762e-05, 3: 4.761904761904762e-05}


In [25]:
print(label_encoder)

<LabelEncoder(num_class=4)>


## Tokenizer

In [26]:
import json
from collections import Counter
from more_itertools import take

In [27]:
class Tokenizer:
  def __init__(self, char_level, num_tokens=None,
               pad_token="<PAD>", oov_token="<UNK>",
               token_to_index=None) -> None:
    self.char_level = char_level
    self.separator = "" if self.char_level else " "
    if num_tokens: num_tokens -= 2 # pad + unk tokens
    self.num_tokens = num_tokens
    self.pad_token = pad_token
    self.oov_token = oov_token
    if not token_to_index:
      token_to_index = {pad_token: 0, oov_token: 1}
    self.token_to_index = token_to_index
    self.index_to_token = {v: k for k,v in self.token_to_index.items()}

  def __len__(self):
    return len(self.token_to_index)

  def __str__(self) -> str:
    return f'Tokenizer(num_tokens={len(self)})'

  def fit_on_texts(self, texts):
    if not self.char_level:
      texts = [text.split(" ") for text in texts]

    all_tokens = [token for text in texts for token in text] 
    counts = Counter(all_tokens).most_common(self.num_tokens)
    # print(counts)
    # print(len(counts))

    
    self.min_token_freq = counts[-1][1]
    
    for token, count in counts:
      index = len(self)
      # print(index)
      # print(self.token_to_index)
      # print(self.index_to_token)
      self.token_to_index[token] = index
      self.index_to_token[index] = token
    return self
  
  def texts_to_sequences(self, texts):
    sequences = []
    for text in texts:
      if not self.char_level:
        text = text.split(" ")
      sequence = []
      for token in text:
        sequence.append(self.token_to_index.get(
            token, self.token_t0_index[self.oov_token]
        ))
      sequences.append(np.asarray(sequence))
    return sequences


In [28]:
tokenizer = Tokenizer(char_level=False, num_tokens=200)
tokenizer.fit_on_texts(texts=X_train)

<__main__.Tokenizer at 0x7f2b41e99b50>

# Git Push

In [29]:
!git init

Initialized empty Git repository in /content/.git/


In [30]:
# Add your email to the global config-file
!git config --global user.email "olutomilayodolapo@gmail.com"

# Add your email to the global config-file
!git config --global user.name "AmazingGrace-D"

In [31]:
!git add "/content/drive/MyDrive/Colab Notebooks/CNN text.ipynb"

fatal: pathspec '/content/drive/MyDrive/Colab Notebooks/CNN text.ipynb' did not match any files


In [32]:
!git commit -m "CNN for NLP"

On branch master

Initial commit

Untracked files:
	[31m.config/[m
	[31msample_data/[m

nothing added to commit but untracked files present


In [None]:
!git remote add origin https://AmazingGrace-D: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@github.com/AmazingGrace-D/NLP-architectures.git

In [None]:
!git push -u origin master