# Install libraries

In [1]:
!python -m pip install -q spacy
!python -m pip install -q PyAudio
!python -m pip install -q SpeechRecognition

In [2]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding
import json
import warnings
warnings.filterwarnings('ignore')

# Reading source data and setting up the format

In [3]:
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    """
    Reading Json file,
    """
    try:
        training_data = []
        lines=[]
        with open(dataturks_JSON_FilePath, 'r') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                #only a single point in text annotation.
                point = annotation['points'][0]
                labels = annotation['label']
                # handle both list of labels or a single label.
                if not isinstance(labels, list):
                    labels = [labels]

                for label in labels:
                    #dataturks indices are both inclusive [start, end] but spacy is not [start, end)
                    entities.append((point['start'], point['end'] + 1 ,label))


            training_data.append((text, {"entities" : entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None

In [4]:
TRAIN_DATA = convert_dataturks_to_spacy("final30desc.json");

In [5]:
model=None
output_dir=Path("resp")
n_iter=50

In [6]:
if model is not None:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")

Created blank 'en' model


In [7]:
if "ner" not in nlp.pipe_names:
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
else:
    ner = nlp.get_pipe("ner")

In [8]:
len(TRAIN_DATA)


30

# Train model

In [9]:
TRAIN_DATA = convert_dataturks_to_spacy("final30desc.json");
import tqdm
# nlp.vocab.vectors.name = 'spacy_trained_vectors'
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    optimizer=nlp.begin_training()
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
            # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for text, annotations in (TRAIN_DATA):
            nlp.update(
                [text],
                [annotations],
                sgd=optimizer,
                drop=0.35,
                losses=losses
                       )
        print(losses)          

{'ner': 4337.350130517929}
{'ner': 634.5651251800346}
{'ner': 722.3177680964583}
{'ner': 1098.111472971154}
{'ner': 1103.0673997725794}
{'ner': 951.726580743998}
{'ner': 695.5923958139002}
{'ner': 492.57405676343285}
{'ner': 441.078263142913}
{'ner': 511.7309279446822}
{'ner': 412.61449447974996}
{'ner': 326.4716422115573}
{'ner': 323.0927967987617}
{'ner': 279.431397249477}
{'ner': 725.0329143817407}
{'ner': 263.9552966312307}
{'ner': 233.40600342917986}
{'ner': 211.47649688207224}
{'ner': 215.14370833206914}
{'ner': 244.27733398551783}
{'ner': 229.1636679008736}
{'ner': 196.41548275848214}
{'ner': 308.49059007814174}
{'ner': 176.0630811613709}
{'ner': 186.39994627263647}
{'ner': 183.6657222818459}
{'ner': 169.6387428519817}
{'ner': 159.2624000140167}
{'ner': 187.54218967456578}
{'ner': 149.28536506110962}
{'ner': 136.85936932860093}
{'ner': 110.86366427554984}
{'ner': 111.1474109547951}
{'ner': 117.8600479395765}
{'ner': 153.02023119369665}
{'ner': 148.65238275054656}
{'ner': 152.689

In [10]:
    # save model to output directory
if output_dir is not None:
     output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)


Saved model to resp


# Testing model

In [11]:
wikitext=nlp(" Reference Code- Inf_EXTERNAL_10038992_21Role Designation- Technology AnalystTechnical & Professional requirements- Basic Qualifications- - Experience Range- 3-5 years. At least 2 years of experience and excellent understanding of Machine learning techniques and algorithm such as Neural Networks, Naive Bayes, SVM, Decision Forests, etc.,o NLP, text analytics technologies.,o Common data science toolkits, such as R, Python Data Science Libraries, MatLab, etc. Excellence in at least one of these is highly desirable,o Data visualization tools, such as D3.js, GGplot, etc.,o Query languages such as SQL, Hive.,Good applied statistics skills, such as distributions, statistical testing, regression, etc.,At least 5 years of hands on experience with more than one programming language (Python / Scala/ Java/SQL),Role and responsibilities- - ,You will be responsible for delivering high-value next-generation products on aggressive deadlines and will be required to write high-quality, highly optimized/high-performance and maintainable code that your fellow developers love ,You will be a core member of a team that does whatever it takes to delight customers, take an iterative and result oriented approach to software development. In this position you will provide best-fit architectural solutions for multi-product, multi-project, multi-industry portfolios providing technology consultation and assisting in defining scope and sizing of work ,You will be the anchor in Proof of Concept developments and support opportunity identification and pursuit processes and evangelize Infosys brand ,You will collaborate with some of the best talent in the industry to create and implement innovative high quality solutions, lead and participate in sales and pursuits focused on our clients' business needs ,You will be part of a learning culture, where teamwork and collaboration are encouraged, excellence is rewarded, and diversity is respected and valued ,The role involves high end technology and hence would require you to be proficient in coding as well,Location- Bangalore Job Locations- Bangalore,BangaloreResponsibilites- Ensure effective Design, Development & Validation of activities in line with client needs and architectural requirements.,Ensure continual knowledge management.,Adherence to the organizational guidelines and processesSkills- R, Python, Machine Learning Company Description-Infosys is a leading provider of next generation consulting,technology and outsourcing solutions.We are dedicated to helping organizations,build tomorrows enterprise and advance the way the world works Thats why Forbes ranks us 19th among the top 100 most innovative companies. Our employees partner with clients to transform their business - one conversation; one idea; one insight at a time.While we are at it, some things remain unchanged- the unwavering ethics,transparency and respect behind everything we do. We will always be a company powered by intellect and driven by values.So, if your passion is to build solutions that  really make a difference to enterprises,the community and your world, Infosys is the right place for you.")

In [12]:
nlp3=spacy.load(output_dir)

In [13]:
for word in wikitext.ents:
    print(word.text,word.label_)

3-5 years Experience
2 years Experience
Machine learning Skills
NLP Skills
Python Skills
Data Science Skills
MatLab Skills
Data visualization Skills
SQL Skills
Hive Skills
5 years Experience
Python Skills
Scala/ Skills
Java Skills
Bangalore Location
Bangalore Location
Python Skills
Machine Learning Skills


# Nice View for visulization

In [14]:
from spacy import displacy

In [15]:
displacy.render(wikitext, style="ent",jupyter="True")

## Connect with Speech Recognition system

In [16]:
import pyaudio
p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    info = p.get_device_info_by_index(i)
    print(info['index'], info['name']) # Please change your audio input if channel is not working 

0 USB2.0 High-Speed True HD Audio: Audio (hw:0,0)
1 ReSpeaker 4 Mic Array (UAC1.0): USB Audio (hw:1,0)
2 HDA Intel PCH: ALC1150 Analog (hw:2,0)
3 HDA Intel PCH: ALC1150 Digital (hw:2,1)
4 HDA Intel PCH: ALC1150 Alt Analog (hw:2,2)
5 HDA NVidia: HDMI 0 (hw:3,3)
6 HDA NVidia: HDMI 1 (hw:3,7)
7 HDA NVidia: HDMI 2 (hw:3,8)
8 HDA NVidia: HDMI 3 (hw:3,9)
9 sysdefault
10 front
11 surround40
12 iec958
13 spdif
14 pulse
15 dmix
16 default
17 /dev/dsp


In [17]:
import speech_recognition as sr

r = sr.Recognizer()
with sr.Microphone(device_index=1) as source: # select device_index which is working microphone 
    print ('Say Something!')
    audio = r.listen(source)
    print ('Done!')
    
text = r.recognize_google(audio)
print (text)

Say Something!
Done!
hello I am testing now


In [18]:
import speech_recognition as sr

r = sr.Recognizer()
with sr.Microphone() as source:
    print("Speak Anything :")
    audio = r.listen(source)
    try:
        text = r.recognize_google(audio)
        print("You said : {}".format(text))
        displacy.render(nlp(text), style="ent",jupyter="True")
    except:
        print("Sorry could not recognize what you said")
        


Speak Anything :
You said : I have 3 years of experience
