# Basic Natural Language Processing: spaCy Tokenizer, Topic Modeling, Text Classification Model

In [1]:
import pandas as pd
df = pd.read_csv(r"C:\Users\user\Documents\RenewableEnergy_patents_2023_abstract_cpc.csv", index_col = 0)

In [2]:
df.head(1)

Unnamed: 0,id,title,assignee,inventor,priority_date,filing_date,publication_date,grant_date,result_link,representative_figure_link,...,kind_code,abstract,cpc_1,cpc_2,cpc_3,cpc_1_num,cpc_2_num,cpc_3_num,file_grant_days,cpc_1_GHY
0,US11967653B2,Phased solar power supply system,"Ampt, Llc",['Anatoli Ledenev'],2013-03-15,2023-09-05,2024-04-23,2024-04-23,https://patents.google.com/patent/US11967653B2/en,https://patentimages.storage.googleapis.com/c0...,...,B2,A high efficiency solar power system combining...,"['F', 'G', 'H', 'Y']","['H02', 'F03', 'G05', 'F05', 'Y02']","['H02J', 'F03B', 'F03D', 'G05B', 'H02S', 'F05B...",4,5,8,231,1


## Customized Tokenizer: spaCy

In [2]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy.language import Language
from spacy.util import filter_spans

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define rule-based matcher
matcher = Matcher(nlp.vocab)
patterns = [
    [{'POS': 'NOUN'}, {'POS': 'NOUN'}, {'POS': 'NOUN'}],
    [{'POS': 'NOUN'}, {'POS': 'NOUN'}],
    [{'POS': 'ADJ'}, {'POS': 'NOUN'}]
]
matcher.add("Patterns", patterns)

# Custom pipeline component for rule-based matching
@Language.component('merge_rule_based_matches')
def merge_rule_based_matches(doc):
    matches = matcher(doc)
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
    # Filter spans to remove overlaps
    filtered_spans = filter_spans(spans)
    with doc.retokenize() as retokenizer:
        for span in filtered_spans:
            retokenizer.merge(span)
    return doc

# Add the custom component to the pipeline before the 'ner' component
nlp.add_pipe('merge_rule_based_matches', before="ner")

# Custom tokenizer to handle hyphenated words and integrated with rule-based matching
def custom_tokenizer(nlp):
    prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) # use default
    suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) # use default
    infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes + [r'\w+-\w+'])
    return spacy.tokenizer.Tokenizer(nlp.vocab, 
                                     prefix_search = prefix_re.search,
                                     suffix_search = suffix_re.search,
                                     infix_finditer=infix_re.finditer, 
                                     token_match=None)

nlp.tokenizer = custom_tokenizer(nlp)

In [3]:
# Preprocessing

# Words to exclude
exclude_words = {"include", "have", "provide", "receive", "method", "comprise", "configure", 
                 "define", "disclose", "determine", "identify", "second", "relate", "enable", 
                 "present", "plurality", "e.g", "thereof", "involve", "describe", "substantially",
                 "particularly", "1", "2", "relative", "particular", "aspect", "correspond"}

def custom_tokenizer_preprocessed(text):
    doc = nlp(text)
    tokens = [
        token.lemma_.lower().strip()
        for token in doc
        if token.lemma_.lower().strip() not in exclude_words
        and not token.is_stop
        and not token.is_punct
        and not token.is_space
    ]
    return tokens

## Bag of Words & Tf-Idf: Scikit learn + spaCy Tokenizer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Use CountVectorizer with the custom spaCy tokenizer
vectorizer = CountVectorizer(tokenizer=custom_tokenizer_preprocessed, min_df=5, max_df=100, ngram_range=(1,1))
bof = vectorizer.fit_transform(df['abstract'])

# Use TfidfTransformer to transform bag of words
tfidf = TfidfTransformer()
tfidf_bof = tfidf.fit_transform(bof)



## Topic Modeling with scikit learn 

In [6]:
from sklearn.decomposition import LatentDirichletAllocation

# Perform topic modeling using Latent Dirichlet Allocation (LDA)
lda = LatentDirichletAllocation(n_components=20, random_state=42)
document_topics = lda.fit(tfidf_bof)

In [7]:
# Inspect the topic modeling results
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = f"Topic #{topic_idx}: "
        message += " | ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

# Print top words in each topic
n_top_words = 10
tf_feature_names = vectorizer.get_feature_names_out()
print_top_words(lda, tf_feature_names, n_top_words)

Topic #0: reduce | content | prevent | connect | hold | report | electrically | substrate | utilize | convert
Topic #1: second set | object | first set | channel | array | detect | automatically | area | number | mean
Topic #2: electronic device | component | power | battery | represent | face | use | technology | production | material
Topic #3: base | select | image | value | neural network | signal | set | device | compute | detect
Topic #4: couple | pattern | say | instruction | position | space | source | measurement | contain | apart
Topic #5: wireless | print | responsive | share | access | generate | client device | drive | array | network
Topic #6: emit | housing | mount | invention | couple | light | core | function | pixel | transistor
Topic #7: display | datum | base | improve | information | signal | interest | environment | apparatus | resource
Topic #8: extend | frame | opening | portion | base | accord | distribute | surface | air | datum
Topic #9: form | layer | display

## Text Classification Model: Predicting CPC code

In [5]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#### Logistic Regression: using BoW as features

In [10]:
# target, features, train test split
y = df['cpc_1_GHY']
X = bof
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=42)

# cross validation and grid search settings
cv = KFold(n_splits=5, shuffle=True, random_state=0)
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# grid search
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=cv, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_
print("best model hyperparameter: ", best_params)
print("best validation accuracy: ", best_score)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("test accuracy: ", accuracy)

best model hyperparameter:  {'C': 0.001}
best validation accuracy:  0.6663641863278887
test accuracy:  0.5833333333333334


#### Logistic Regression: using Tf-Idf as features

In [6]:
# target, features, train test split
y = df['cpc_1_GHY']
X = tfidf_bof
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=42)

# cross validation and grid search settings
cv = KFold(n_splits=5, shuffle=True, random_state=0)
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}

# grid search
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=cv, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_
print("best model hyperparameter: ", best_params)
print("best validation accuracy: ", best_score)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("test accuracy: ", accuracy)

best model hyperparameter:  {'C': 0.001}
best validation accuracy:  0.6663641863278887
test accuracy:  0.5833333333333334


#### Support Vector Classifier: using BoW as features

In [8]:
# target, features, train test split
y = df['cpc_1_GHY']
X = bof
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=42)

# cross validation and grid search settings
cv = KFold(n_splits=5, shuffle=True, random_state=0)
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 
              'gamma':['scale', 'auto'], 
              'kernel': ['linear', 'poly', 'rbf']}

# grid search
grid_search = GridSearchCV(SVC(), param_grid, cv=cv, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_
print("best model hyperparameter: ", best_params)
print("best validation accuracy: ", best_score)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("test accuracy: ", accuracy)

best model hyperparameter:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
best validation accuracy:  0.6732607380520267
test accuracy:  0.5833333333333334


#### Support Vector Classifier: using Tf-Idf as features

In [9]:
# target, features, train test split
y = df['cpc_1_GHY']
X = tfidf_bof
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle = True, random_state=42)

# cross validation and grid search settings
cv = KFold(n_splits=5, shuffle=True, random_state=0)
param_grid = {'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 
              'gamma':['scale', 'auto'], 
              'kernel': ['linear', 'poly', 'rbf']}

# grid search
grid_search = GridSearchCV(SVC(), param_grid, cv=cv, scoring='accuracy', n_jobs = -1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
best_score = grid_search.best_score_
best_params = grid_search.best_params_
print("best model hyperparameter: ", best_params)
print("best validation accuracy: ", best_score)

# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("test accuracy: ", accuracy)

best model hyperparameter:  {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
best validation accuracy:  0.6664246823956443
test accuracy:  0.5833333333333334
