In [1]:
import pandas as pd
import re

In [2]:
# Read the raw SMS export and keep only the message bodies (the `text` column).
dataset = pd.read_csv('/kaggle/input/sms-data/SMS-Data.csv').loc[:, 'text']

In [3]:
# Preview the first few raw messages of the `text` column.
dataset.head()

0    Rs.95.15 on Zomato charged via Simpl.\r\n--\r\...
1    Hi! Update your email id through WhatsApp: htt...
2    Lucknow ya Kolkata ? - watch it LIVE with Vi c...
3    Mohd,\nCheck the incredible Acko insurance pol...
4    Hi! You can now get your Vi prepaid invoice em...
Name: text, dtype: object

In [4]:
!pip install spacy
!python -m spacy download en_core_web_sm

[0mCollecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[0m[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [5]:
# Normalise every SMS to a single-spaced string.
# One pass replaces each run of whitespace and/or hyphens with exactly one space; this is
# equivalent to the previous three substitutions (\s -> ' ', '-' -> ' ', \s+ -> ' ').
# str() is applied first, so non-string cells are cleaned as their text representation.
# Leading/trailing spaces are intentionally not stripped (same as before).
dataset_arr = [re.sub(r'[\s-]+', ' ', str(sms)) for sms in dataset]

In [6]:
# Inspect the first cleaned message to confirm line breaks and hyphens were flattened.
dataset_arr[0]

'Rs.95.15 on Zomato charged via Simpl. Food, groceries, commute, or medicines. Buy Now, Pay Later via Simpl. Know More: https://click.getsimpl.com/vyhm/5b611f85 Simpl Pay'

In [7]:
import spacy
from spacy import displacy

# Baseline: run the stock English pipeline over the first 20 cleaned messages
# and render the entities it predicts inline.
nlp = spacy.load('en_core_web_sm')
for sample_idx in range(20):
    displacy.render(nlp(dataset_arr[sample_idx]), style="ent", jupyter=True)



In [8]:
# Number of cleaned messages available.
len(dataset_arr)

100243

In [9]:
import json
import os
    
# Load the hand-labelled NER annotations.
# The encoding is given explicitly so decoding does not depend on the platform's default locale.
with open('/kaggle/input/sms-ner-dataset/annotations.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Alternative annotation file (currently disabled):
# with open('/kaggle/input/sms-ner-dataset-new/annotations_new.json', 'r') as file:
#     data = json.load(file)

In [10]:
from sklearn.model_selection import train_test_split

# Unwrap the annotation records and freeze each (text, {"entities": ...}) record as a tuple.
data = list(map(tuple, data['annotations']))

# Hold out 20% of the annotated messages; the fixed seed keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = 41)

In [11]:
# Peek at one training example: (text, {'entities': [[start, end, label], ...]}).
train_data[0]

('b"Last few hours to apply for BYJU\'S Aptitude Test Win up to 100% Scholarship Get an all India percentile & more Apply Now http://bit.ly/BNAT Scholarship 5"',
 {'entities': [[30, 36, 'TITLE'], [37, 50, 'PURPOSE'], [61, 77, 'PURPOSE']]})

In [12]:
# Normalise every training annotation so that `entities` is always a *list* of
# (start, end, label) tuples, the shape that is later unpacked span by span.
for example in train_data:
    annot = example[1]
    spans = [tuple(ent) for ent in annot['entities']]
    # Messages without any labelled span keep the zero-width PERSON placeholder, but it is
    # wrapped in a list: a bare tuple would be iterated element by element downstream and
    # fail when the integer 0 is unpacked into (start, end, label).
    annot['entities'] = spans if spans else [(0, 0, 'PERSON')]

In [13]:
# Normalise every test annotation so that `entities` is always a *list* of
# (start, end, label) tuples, the shape that is later unpacked span by span.
for example in test_data:
    annot = example[1]
    spans = [tuple(ent) for ent in annot['entities']]
    # Messages without any labelled span keep the zero-width PERSON placeholder, but it is
    # wrapped in a list: a bare tuple would be iterated element by element downstream and
    # fail when the integer 0 is unpacked into (start, end, label).
    annot['entities'] = spans if spans else [(0, 0, 'PERSON')]

In [14]:
# Sanity-check the sizes of the train and test splits.
print(len(train_data), len(test_data))

88 22


In [15]:
import os
from tqdm import tqdm
import spacy
from spacy.tokens import DocBin

def make_doc_for_data(data, max_chars=512, nlp=None):
    """Convert annotated examples into a spaCy ``DocBin``.

    Parameters
    ----------
    data : iterable of (text, annot)
        ``annot["entities"]`` must be an iterable of ``(start, end, label)``
        character offsets into ``text``.
    max_chars : int, optional
        Examples whose text is longer than this many characters are dropped
        silently. Defaults to 512, the cut-off that used to be hard-coded
        (presumably chosen with the transformer input size in mind - TODO confirm).
    nlp : optional
        Pipeline whose tokenizer is used to build the docs. When omitted,
        ``en_core_web_sm`` is loaded, as before. Pass an already loaded pipeline
        to avoid reloading the model on every call.

    Returns
    -------
    DocBin
        One doc per kept example, with ``doc.ents`` set to the spans that could
        be aligned to token boundaries.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    db = DocBin() # container that is serialised to a .spacy file by the caller

    for text, annot in tqdm(data):
        if len(text) > max_chars:
            continue
        doc = nlp.make_doc(text) # tokenise only; no pipeline components are run
        ents = []
        for start, end, label in annot["entities"]:
            # "contract" shrinks the character range to whole tokens; the result is
            # None when no complete token lies inside the range.
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents # attach the aligned spans as the gold entities
        db.add(doc)

    return db

# Serialise each split to the .spacy file that `spacy train` is later pointed at.
for split, target in ((train_data, "train.spacy"), (test_data, "test.spacy")):
    make_doc_for_data(split).to_disk(target)

100%|██████████| 88/88 [00:00<00:00, 1837.22it/s]


Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity


100%|██████████| 22/22 [00:00<00:00, 1198.79it/s]

Skipping entity





In [16]:
!pip install spacy_transformers
!python -m spacy init fill-config /kaggle/input/sms-ner-dataset/base_config.cfg config.cfg

Collecting spacy_transformers
  Downloading spacy_transformers-1.2.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.7/191.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting spacy-alignments<1.0.0,>=0.7.2
  Downloading spacy_alignments-0.9.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
Installing collected packages: spacy-alignments, spacy_transformers
Successfully installed spacy-alignments-0.9.0 spacy_transformers-1.2.3
[0m[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [17]:
# Train the pipeline described by config.cfg on GPU 0; results are written to ./output.
# NOTE(review): --paths.dev points at test.spacy, so the development set used during
# training is the same data as the "test" split - there is no untouched hold-out set.
# Consider carving out a separate dev split.
!python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./test.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output[0m
[38;5;4mℹ Saving to output directory: output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-05-09 05:28:31,575] [INFO] Set up nlp object from config
[2023-05-09 05:28:31,588] [INFO] Pipeline: ['transformer', 'ner']
[2023-05-09 05:28:31,593] [INFO] Created vocabulary
[2023-05-09 05:28:31,595] [INFO] Finished initializing nlp object
Downloading (…)lve/main/config.json: 100%|█████| 481/481 [00:00<00:00, 59.0kB/s]
Downloading (…)olve/main/vocab.json: 100%|███| 899k/899k [00:00<00:00, 1.30MB/s]
Downloading (…)olve/main/merges.txt: 100%|████| 456k/456k [00:00<00:00, 890kB/s]
Downloading (…)/main/tokenizer.json: 100%|█| 1.36M/1.36M [00:00<00:00, 7.82MB/s]
Downloading pytorch_model.bin: 100%|█████████| 501M/501M [00:06<00:00, 83.4MB/s]
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'l

In [18]:
# Imported for its side effects only (the name is not used below) - presumably so the
# transformer component is available when the trained pipeline is loaded; TODO confirm
# whether this explicit import is still required.
import spacy_transformers
nlp1 = spacy.load('/kaggle/working/output/model-best') # load the best checkpoint written by training
doc = nlp1(dataset_arr[1000]) # run the fine-tuned pipeline on one cleaned SMS

spacy.displacy.render(doc, style="ent", jupyter=True) # highlight the predicted entities inline

In [19]:
# Spot-check the fine-tuned model on ten more cleaned messages (indices 1505-1514).
for sample_idx in range(1505, 1515):
    spacy.displacy.render(nlp1(dataset_arr[sample_idx]), style="ent", jupyter=True)



In [20]:
# Package model-best for download / future use.
# root_dir + base_dir restrict the archive to output/model-best. Previously the whole
# /kaggle/working/ tree was zipped, i.e. also train.spacy, test.spacy, config.cfg and the
# output.zip file that is being created inside that same directory.
import shutil
shutil.make_archive('output', 'zip', root_dir='/kaggle/working/output', base_dir='model-best')

'/kaggle/working/output.zip'