In [1]:
import re
def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

In [2]:
print(get_tokens_with_entities("I come from [Kathmandu valley,](location) [Nepal](location)"))
# [('I', 'O'), ('come', 'O'), ('from', 'O'), ('Kathmandu', 'B-location'), ('valley,', 'I-location'), ('Nepal', 'B-location')]

print(get_tokens_with_entities("[Technos](brand) [39 Inch](display_size) Curved Smart [LED](display_type) TV E39DU2000 With Wallmount"))
# [('Technos', 'B-brand'), ('39', 'B-display_size'), ('Inch', 'I-display_size'), ('Curved', 'O'), ('Smart', 'O'), ('LED', 'B-display_type'), ('TV', 'O'), ('E39DU2000', 'O'), ('With', 'O'), ('Wallmount', 'O')]

[('I', 'O'), ('come', 'O'), ('from', 'O'), ('Kathmandu', 'B-location'), ('valley,', 'I-location'), ('Nepal', 'B-location')]
[('Technos', 'B-brand'), ('39', 'B-display_size'), ('Inch', 'I-display_size'), ('Curved', 'O'), ('Smart', 'O'), ('LED', 'B-display_type'), ('TV', 'O'), ('E39DU2000', 'O'), ('With', 'O'), ('Wallmount', 'O')]


In [3]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# note that I purposefully misspell Kathmandu to Kathamanduu
sample_input = "I have experience in [Java](skill) [python](skill) [Web development](skill)"
tokens, entities = list(zip(*get_tokens_with_entities(sample_input)))
tokenized_input = tokenizer(tokens, is_split_into_words=True)
print("Original tokens           : ", tokens)
print("After subword tokenization: ", tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']))
# Original tokens           :  ('I', 'come', 'from', 'Kathmanduu', 'valley,', 'Nepal')
# After subword tokenization:  ['[CLS]', 'i', 'come', 'from', 'kathmandu', '##u', 'valley', ',', 'nepal', '[SEP]']

Original tokens           :  ('I', 'have', 'experience', 'in', 'Java', 'python', 'Web', 'development')
After subword tokenization:  ['[CLS]', 'i', 'have', 'experience', 'in', 'java', 'python', 'web', 'development', '[SEP]']


In [4]:
class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        from datasets import Dataset, Features, Value, ClassLabel, Sequence
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=dm.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

In [5]:
raw_text = """ Skills: [Python](skill) [Java](skill) [C++,](skill)
Skills: [SQL,](skill) [Excel,](skill) [Data Visualization"](skill)
Skills: [Python,](skill) [Java,](skill) [C++](skill)
Proficient in [JavaScript,](skill) [HTML,](skill) [CSS"](skill)
Experience with [SQL](skill) and [database management"](skill)
Skilled in [data analysis](skill) using [R](skill) and [Excel"](skill)
Familiar with [machine learning](skill) algorithms: SVM, Random Forest,
Proficiency in [MATLAB](skill) and [signal processing](skill)
Experience with AWS services: [EC2,](skill) [S3,](skill) [Lambda](skill)
Skilled in [web development](skill) using [PHP](skill) [MySQL,](skill) and [Laravel"](skill)
Knowledge of Agile methodologies: Scrum, Kanban
Experience in [UI/UX design](skill) using Adobe Creative Suite
Proficient with Microsoft Office Suite: [Word,](skill) [Excel,](skill) [PowerPoint](skill)
Skilled in front-end development: [HTML](skill) [CSS,](skill) [JavaScript"](skill)
Experience in [mobile app development](skill) using [React Native"](skill)
Proficient in [data visualization](skill) with [Tableau](skill) and [Power BI](skill)
Familiarity with version control systems: [Git,](skill) [SVN,] (skill)
Experience with backend development frameworks: [Django,](skill) [Flask"](skill)
Skilled in data analysis and visualization using Python libraries: [Pandas,](skill) [Matplotlib,](skill) [Seaborn"](skill)
Proficiency in object-oriented programming languages: [C#](skill), [C++](skill), [Java"](skill)
Experience with cloud platforms: [AWS, ](skill) [Azure,](skill) [Google Cloud"](skill)
Skilled in front-end frameworks: [React, ](skill) [Angular, ](skill) [Vue.js"](skill)
Over [5 years](experience) of experience in [Python](skill) programming and [data analysis"](skill)
 Extensive experience in front-end development using HTML, CSS, and JavaScript
[10+ years](experience)  of experience in [cloud computing](skill) and managing scalable infrastructure
 Over [8 years](experience) of experience in responsive design and creating visually appealing interfaces
 Proven track record of successful branding initiatives with over [6 years](experience) of experience
Extensive expertise in visual design principles such as color theory and typography with over [10 years](experience) of experience
[5+ years](experience) of experience in social media marketing and running effective campaigns
 Experienced in content management systems and publishing workflows with a focus on delivering high-quality content
 Over [5 years](experience)  of experience in software development
[10+ years](experience)  of experience in project management
 Extensive experience in data analysis and reporting
 [6 years](experience)  of experience in sales and business development
 Over [8 years](experience) of experience in customer service
 [3+ years](experience)  of experience in financial analysis
 Experienced in marketing strategy with [7 years](experience) of experience
 [10 years](experience)  of experience in healthcare administration
 Over [6 years](experience) of experience in human resources
 [4+ years](experience) of experience in graphic design
 Extensive experience in software testing and quality assurance
 [8 years](experience) of experience in supply chain management
 [5+ years](experience) of experience in digital marketing
"""

dm = NERDataMaker(raw_text.split("\n"))
print(f"total examples = {len(dm)}")
print(dm[0:3])

# total examples = 35
# [{'id': 0, 'ner_tags': [0], 'tokens': ['']}, {'id': 1, 'ner_tags': [2, 3, 0], 'tokens': ['40"', 'LED', 'TV']}, {'id': 2, 'ner_tags': [0, 2, 0, 0, 3, 0], 'tokens': ['Specifications:', '16″', 'HD', 'READY', 'LED', 'TV.']}]


total examples = 44
[{'id': 0, 'ner_tags': [0, 0, 2, 2, 2], 'tokens': ['', 'Skills:', 'Python', 'Java', 'C++,']}, {'id': 1, 'ner_tags': [0, 2, 2, 2, 4], 'tokens': ['Skills:', 'SQL,', 'Excel,', 'Data', 'Visualization"']}, {'id': 2, 'ner_tags': [0, 2, 2, 2], 'tokens': ['Skills:', 'Python,', 'Java,', 'C++']}]


In [6]:
val_text=""" madhu has [Python](skill) and [3 years](experience)
John has [JavaScript](skill) and [5 years](experience) Emma has [Data Analysis](skill) and [8 years](experience) 
David has [C++](skill) and [10 years](experience) Sophia has [Graphic Design](skill) and [2 years](experience)
Daniel has [Financial Modeling](skill) and [6 years](experience) Olivia has [Communication](skill) and [4 years](experience)
James has [Python](skill) and [7 years](experience) Isabella has [Java](skill) and [9 years](experience) 
William has [UI/UX Design](skill) and [1 year](experience) Ethan has [Java](skill) and [6 years](experience)  
"""
vm = NERDataMaker(val_text.split("\n"))

In [None]:
# pip install torch

In [7]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream

In [9]:
# pip install --upgrade accelerate

Collecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
                                              0.0/227.6 kB ? eta -:--:--
     ------------                            71.7/227.6 kB 1.3 MB/s eta 0:00:01
     -------------------------              153.6/227.6 kB 1.5 MB/s eta 0:00:01
     -------------------------------------  225.3/227.6 kB 2.0 MB/s eta 0:00:01
     -------------------------------------- 227.6/227.6 kB 1.4 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3
Note: you may need to restart the kernel to use updated packages.


In [12]:
# pip install transformers==4.28.0

Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
                                              0.0/7.0 MB ? eta -:--:--
                                              0.0/7.0 MB 1.9 MB/s eta 0:00:04
     -                                        0.2/7.0 MB 2.1 MB/s eta 0:00:04
     --                                       0.5/7.0 MB 3.7 MB/s eta 0:00:02
     ----                                     0.8/7.0 MB 4.5 MB/s eta 0:00:02
     ------                                   1.1/7.0 MB 5.3 MB/s eta 0:00:02
     --------                                 1.4/7.0 MB 5.4 MB/s eta 0:00:02
     ----------                               1.8/7.0 MB 5.7 MB/s eta 0:00:01
     ------------                             2.1/7.0 MB 5.9 MB/s eta 0:00:01
     --------------                           2.5/7.0 MB 6.1 MB/s eta 0:00:01
     ----------------                         2.8/7.0 MB 6.2 MB/s eta 0:00:01
     -----------------                        3.1/7.0 MB 

In [8]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    weight_decay=0.01,
)

train_ds = dm.as_hf_dataset(tokenizer=tokenizer)
valid_ds = vm.as_hf_dataset(tokenizer=tokenizer)

Map:   0%|          | 0/44 [00:00<?, ? examples/s]

Map:   0%|          | 0/7 [00:00<?, ? examples/s]

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=valid_ds, # eval on training set! ONLY for DEMO!!
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.398803
2,No log,1.282101
3,No log,1.226805
4,No log,1.169654
5,No log,1.065957
6,No log,0.946667
7,No log,0.831624
8,No log,0.738544
9,No log,0.671709
10,No log,0.617956


In [11]:
from transformers import pipeline
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")# pass device=0 if using gpu
pipe("""
LOGESH T V

+91 96000 54585 | logeshtv.2003@gmail.com | linkedin.com/in/logeshtv/

 

 

Aspiring full stack developer with an experience in freelancing and internship
for the same. Have hands on experience designing, developing and implementing
applications and solutions using a range of technologies and programming languages.

Seeking to leverage a broad development and hands-on expertise as a full stack

developer.
PERSONAL INFO

Father's Name - Mr .M Vasu
Date Of Birth -12/10/2003
Address -No. 97, meladhanur village,
kodiyum post, tindivanam
taluk, villupuram, pin -604207
EDUCATION

e SRM valliammai engineering college
B.Tech - Artificial Intelligence And
Data Science
MAY 2024 | CHENNAI | cgpa - 8.664

e Spring field matriculation Hr. Sec.

school
MAY 2020 | CHENNAI | mark - 79.16%

e Melmaruvathur adhiparasakthi high
school
MAY 2018 | MELMARUVATHLUR |
mark- 83.4%

TECHNICAL SKILLS

JavaScript, React, Redux, PHP,
Express, NodeJS, RESTAPI's

FULL STACK
DEVELOPER :

DATA SCIENTIST: Data analysis and visualisation ,
Machine learing , Deep learning,
websrapping(BeautifulSoup),
openCV

DATABASE : SQL and MongoDB

ANDROID AND IOS
DEVELOPMENT: _ React Native

PROGRAMMING python, c+#/c,,R.
LANGUAGE :

FAMILIAR : Flutter, Flask, DOCKER

STRENGTHS

e Adaptability

e Creativity

e Team work

e Problem-solving

HOBBIES

« Writing

e Film Direction

« Editing
LANGUAGES KNOWN

e Tamil
e English

 

EXPERIENCE

Software Developer (FREELANCER) :

e freelancer at PeoplePerHour with a rating of 4.5.

e worked on more than 10+ big project.

e recently worked on a project called myreklam, With this other company
can able to post their job and recruit people. which is done using MERN
stack. Works like comment section, google map integration, front end
works and some more works done.
project - https://myreklam.fr/

INTERNSHIP - DEVTOWN :

* POSITION - FULL(MERN) STACK DEVELOPER.
Gained much knowledge while working on real-world problems.
During this internship, | completed many projects which will be listed below in
the project section.

2021-2023

 

JULY 2021- SEPTEMBER 2021

certificate -https://cert.devtown.in/verify/Z12NGHV

INTERNSHIP - WEB STUDENT :
POSITION - FRONT-END DEVELOPER.
During the internship, HR promoted me to team leader, and after that, |
managed the whole team.

while managing | gained knowledge not only about the project and also gained
much knowledge about teamwork and leadership.

PROJECTS

AUGUST 2021- NOVEMBER 21

ZOMATO CLONE :
* completed the whole web application using react, tailwind for front end.
* express and nodejs for server and backend, mongoDB for database.
* amazon S3 bucket for image and video file pipeline. And docker is also used

PROJECT - https://github.com/logeshloki585/Zomato-Master-Repo
IMDB MOVIE ANALYSIS THROUGH WEB SCRAPPING :
* project is done using python libraries such as numpy, pandas. And matplotlib is
used for data visualization .
e For web scrapping library called beautifulSoup is used.
PROJECT - https://colab research. google.com/drive/1uTMf7wZIIf1dpyVRDxq7ZAdjmfa2kvnN?
usp=sharing

INTELLIGENT SURVEILLANCE SYSTEM :

e Project is build using the python library Opencv .

e And implemented the face detection in the web page using the python frame
work tkinter and flask.

* And integrated with QR code scanner for fetching training data from the
database to find the person.

BOOKMYSHOW, NETFLIX AND YOUTUBE CLONE - (FRONT END) :

e front end is build using react .
e And movie data are fetched from the moviesdb api using axios
* projects are mentioned in porfolio

LINKS

   

LINKEDIN - linkedin.com/in/logeshtv/
GITHUB - —_github.com/logeshloki585
PEOPLEPERHOUR - peopleperhour.com/logesh-t_v

 

PORFOLIO - logeshloki585.vercel.app/
""")
# pipe()

[{'entity_group': 'experience',
  'score': 0.88725305,
  'word': '3 years',
  'start': 48,
  'end': 55},
 {'entity_group': 'skill',
  'score': 0.98414165,
  'word': 'java',
  'start': 80,
  'end': 84},
 {'entity_group': 'skill',
  'score': 0.73956484,
  'word': '##script',
  'start': 84,
  'end': 90},
 {'entity_group': 'skill',
  'score': 0.9879808,
  'word': 'react',
  'start': 92,
  'end': 97},
 {'entity_group': 'skill',
  'score': 0.98215955,
  'word': 'node',
  'start': 99,
  'end': 103},
 {'entity_group': 'skill',
  'score': 0.7793602,
  'word': 'j',
  'start': 104,
  'end': 105},
 {'entity_group': 'skill',
  'score': 0.4953148,
  'word': 'computer',
  'start': 142,
  'end': 150}]

In [None]:
data analyst: [Python, R, SQL, Tableau, Machine Learning],
    full stack developer: [Python, JavaScript, React, SQL, Git],
    front end developer: [HTML, CSS, JavaScript, React, UI/UX],
    data scientist: [Python, R, SQL, Machine Learning, Statistics],
    data engineer: [Python, SQL, ETL, Big Data, Data Modeling],
    software engineer: [Java, C++, Python, JavaScript, Git],
    business analyst: [Data Analysis, Business Intelligence, SQL, Tableau],
    product manager: [Product Development, Agile Methodology, Market Research, Strategy],
    UI/UX designer: [User Research, Wireframing, Prototyping, Usability Testing],
    project manager: [Project Planning, Risk Management, Team Leadership, Communication],
    network administrator: [Network Troubleshooting, LAN/WAN Configuration, Network Security, Cisco Routing and Switching],
    systems administrator: [Server Administration, Active Directory Management, Virtualization, Backup and Recovery],
    cybersecurity analyst: [Threat Detection and Response, Vulnerability Assessment, Incident Handling and Forensics, Security Frameworks],
    cloud architect: [Cloud Platforms, Infrastructure as Code, Microservices Architecture, Containerization],
    data architect: [Data Modeling, Database Design, Data Integration, Data Governance],
    machine learning engineer: [Machine Learning Algorithms, Deep Learning, Data Preprocessing, Model Evaluation],
    devops engineer: [Continuous Integration/Deployment, Container Orchestration, Infrastructure Automation, Monitoring and Logging],
    software tester: [Test Planning, Test Automation, Defect Tracking, Regression Testing]