In [23]:
import re
import pandas as pd
import glob
import os
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

In [24]:
def clean_text(text):
    # Remove punctuations and unnecessary chars
    text1 = re.sub(R'\t', '', text)
    text1 = re.sub(r'[^A-Za-z0-9\s]', '', text)
    text1 = ' '.join(text1.split())
    text1 = text1.lower()
    return text1

def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

def stem_text(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return str(stemmed_tokens)

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return str(lemmatized_tokens)

In [25]:
folder = 'Data/text_files/'

new_file = open('clean2019.txt', 'w+')
read_from = open('Data/text_files/2019 Licensing Policy.txt', 'r', encoding='utf8')
lines = read_from.readlines()

lines = [x[:-1] for x in lines]
p = clean_text(str(lines))
new_file.write(p)

print(p)
new_file.close()
read_from.close()

patent portfolio audit and licensing policy confidential 2 patent portfolio audit and licensing policy confidential aminata pouye 01 content portfolio audit 02 licensing opportunities 03 reasons for a licensing policy 3 patent portfolio audit and licensing policy confidential aminata pouye 01 reasons for a licensing policy 4 patent portfolio audit and licensing policy confidential aminata pouye a why a licensing policy 12 patents represent an important investment notably financially for companies and are considered as intangible asset that are of importance in evaluating the value and strength of a company furthermore generally speaking less than 10 of a patent portfolio contribute towards revenues of a company thus there is a need to find returns on investments roi opportunities reasons for a licensing policy 5 patent portfolio audit and licensing policy confidential aminata pouye a why a licensing policy 22 except for protecting the companys products and by extension its revenues a p

In [26]:
tokens = tokenize_text(p)
stems = stem_text(tokens)
lemms = lemmatize_text(tokens)

print(stems)

['patent', 'portfolio', 'audit', 'and', 'licens', 'polici', 'confidenti', '2', 'patent', 'portfolio', 'audit', 'and', 'licens', 'polici', 'confidenti', 'aminata', 'pouy', '01', 'content', 'portfolio', 'audit', '02', 'licens', 'opportun', '03', 'reason', 'for', 'a', 'licens', 'polici', '3', 'patent', 'portfolio', 'audit', 'and', 'licens', 'polici', 'confidenti', 'aminata', 'pouy', '01', 'reason', 'for', 'a', 'licens', 'polici', '4', 'patent', 'portfolio', 'audit', 'and', 'licens', 'polici', 'confidenti', 'aminata', 'pouy', 'a', 'whi', 'a', 'licens', 'polici', '12', 'patent', 'repres', 'an', 'import', 'invest', 'notabl', 'financi', 'for', 'compani', 'and', 'are', 'consid', 'as', 'intang', 'asset', 'that', 'are', 'of', 'import', 'in', 'evalu', 'the', 'valu', 'and', 'strength', 'of', 'a', 'compani', 'furthermor', 'gener', 'speak', 'less', 'than', '10', 'of', 'a', 'patent', 'portfolio', 'contribut', 'toward', 'revenu', 'of', 'a', 'compani', 'thu', 'there', 'is', 'a', 'need', 'to', 'find', '

In [27]:
txt_list = glob.glob(r"Data/text_files/*")

if os.path.exists("all_text.txt"):
  os.remove("all_text.txt")
all_text = open('all_text.txt', 'a', encoding='utf8')

for item in txt_list:
    f = open(item, 'r', encoding='utf8')
    l = f.readlines()
    
    l = [x[:-1] for x in l]
    clt = clean_text(str(l))
    
    all_text.write(clt)

all_text.close()


In [28]:
f = open('all_text.txt', 'r')
l = f.readlines()

if os.path.exists("all_stems.txt"):
  os.remove("all_stems.txt")
all_stems = open('all_stems.txt', 'a', encoding='utf8')

tokens = tokenize_text(str(l))

stems = stem_text(tokens = tokens)
all_stems.write(stems)

all_stems.close()

In [29]:
if os.path.exists("all_lemms.txt"):
  os.remove("all_lemms.txt")
all_lemms = open('all_lemms.txt', 'a', encoding='utf8')

lemms = lemmatize_text(tokens = tokens)
all_lemms.write(lemms)

all_lemms.close()

In [43]:
import sqlite3

conn = sqlite3.connect('proc_data.db')
cursor = conn.cursor()

cursor.execute(
    '''CREATE TABLE IF NOT EXISTS pos_tags(
    id INTEGER PRIMARY KEY,
    token TEXT,
    tag TEXT
    )
''')

cursor.execute(
    '''CREATE TABLE IF NOT EXISTS ner_entities(
    id INTEGER PRIMARY KEY,
    token TEXT,
    entity TEXT
    )
''')

conn.commit()
conn.close()

In [41]:
import spacy
nlp = spacy.load("en_core_web_sm")

f = open('all_text.txt', 'r')
l = f.readlines()
l = [x[:-1] for x in l]

doc = nlp(str(l))

f.close()

In [44]:
# Running multiple times will overwrite 
conn = sqlite3.connect('proc_data.db')
cursor = conn.cursor()

pos_tags = [(token.text, token.pos_) for token in doc]
for word, tag in pos_tags:
    cursor.execute("INSERT INTO pos_tags (token, tag) VALUES (?, ?)", (word, tag))
    # print(word, tag)

**Below cell is to perform Named Entity Recognition for all the clean text, and still in progress**

In [31]:
for entity in doc.ents:
    print(f"Ent: {entity.text}, Label: {entity.label}")

Ent: 2, Label: 397
Ent: 01, Label: 397
Ent: 02, Label: 397
Ent: 03, Label: 397
Ent: 3, Label: 397
Ent: 01, Label: 397
Ent: 4, Label: 397
Ent: 12, Label: 397
Ent: less than 10, Label: 397
Ent: 5, Label: 397
Ent: 22, Label: 397
Ent: third, Label: 396
Ent: 6, Label: 397
Ent: 02, Label: 397
Ent: 7, Label: 397
Ent: 8, Label: 397
Ent: 13, Label: 397
Ent: 9, Label: 397
Ent: 23, Label: 397
Ent: 10, Label: 397
Ent: 33, Label: 397
Ent: one, Label: 397
Ent: 11, Label: 397
Ent: 12, Label: 397
Ent: 12, Label: 397
Ent: 22, Label: 397
Ent: 13, Label: 397
Ent: 14, Label: 397
Ent: 15, Label: 397
Ent: us, Label: 384
Ent: cdas, Label: 380
Ent: peggi, Label: 384
Ent: four, Label: 397
Ent: ci, Label: 383
Ent: ci, Label: 380
Ent: ci, Label: 384
Ent: as to third, Label: 397
Ent: 50, Label: 397
Ent: third, Label: 396
Ent: 1, Label: 397
Ent: 2, Label: 397
Ent: ci, Label: 383
Ent: 3, Label: 397
Ent: third, Label: 396
Ent: 1, Label: 397
Ent: first, Label: 396
Ent: 2, Label: 397
Ent: 1, Label: 397
Ent: 2, Label: 

In [45]:
cursor.execute("SELECT * FROM pos_tags")
pos_tags = cursor.fetchall()

for row in pos_tags:
    print(row)

conn.close()

(1, '[', 'X')
(2, "'", 'PUNCT')
(3, 'patent', 'NOUN')
(4, 'portfolio', 'NOUN')
(5, 'audit', 'NOUN')
(6, 'and', 'CCONJ')
(7, 'licensing', 'NOUN')
(8, 'policy', 'NOUN')
(9, 'confidential', 'ADJ')
(10, '2', 'NUM')
(11, 'patent', 'NOUN')
(12, 'portfolio', 'NOUN')
(13, 'audit', 'NOUN')
(14, 'and', 'CCONJ')
(15, 'licensing', 'NOUN')
(16, 'policy', 'NOUN')
(17, 'confidential', 'ADJ')
(18, 'aminata', 'PROPN')
(19, 'pouye', 'PROPN')
(20, '01', 'NUM')
(21, 'content', 'NOUN')
(22, 'portfolio', 'NOUN')
(23, 'audit', 'NOUN')
(24, '02', 'NUM')
(25, 'licensing', 'NOUN')
(26, 'opportunities', 'NOUN')
(27, '03', 'NUM')
(28, 'reasons', 'NOUN')
(29, 'for', 'ADP')
(30, 'a', 'DET')
(31, 'licensing', 'NOUN')
(32, 'policy', 'NOUN')
(33, '3', 'NUM')
(34, 'patent', 'NOUN')
(35, 'portfolio', 'NOUN')
(36, 'audit', 'NOUN')
(37, 'and', 'CCONJ')
(38, 'licensing', 'NOUN')
(39, 'policy', 'NOUN')
(40, 'confidential', 'ADJ')
(41, 'aminata', 'PROPN')
(42, 'pouye', 'PROPN')
(43, '01', 'NUM')
(44, 'reasons', 'NOUN')
(45, 