In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.feature_extraction.text import TfidfVectorizer
from utilities import *
import pandas as pd
import numpy as np 
import json

In [2]:
# pick full or smaller version of dataset
df = pd.read_csv('data/modelready_220423.csv')
# df = pd.read_csv('data/ten_percent.csv')

In [3]:
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
print(f'tot columns = {len(df.columns)}, numeric type columns = {len(df.select_dtypes(include=numerics).columns)}' ) # not too many non-numeric columns
df.select_dtypes(include = ['object']).head(1)  # print non numeri columns

tot columns = 772, numeric type columns = 765


Unnamed: 0,publication_number,company_name,countries_in_family,publn_nr,primary_cpc,abstract,description_text
0,US-8623043-B1,"Entellus Medical, Inc.",['AU' 'EP' 'CA' 'US'],8623043,A61M29/02,A method of treating a constricted sinus passa...,RELATED APPLICATIONS \n This Application i...


In [4]:
# extract unique countries in the df
unique_values = set()
df['countries_in_family'].apply(lambda x: unique_values.update(x.strip("[]").replace("'", "").split())) 

# Create new columns for each unique value
for value in unique_values:
    # each country has a column (1 if the patent belong to the country 0 otherwise)
    df[value] = df['countries_in_family'].apply(lambda x: 1 if value in x else 0)


In [5]:
df = df[df.abstract.notna()].copy() # drop all samples without abstract
print('missing value in description text' , df.description_text.isna().sum()) # description_text doesn't have mssing vales

missing value in description text 0


In [6]:
# encode company names
df['company_name_encoded'] = df.company_name.astype('category').cat.codes  # encode companies

# remove non-numeric columns
df_columns_dropped = df.drop(['publication_number', 'company_name', 'countries_in_family', 'publn_nr','primary_cpc'], axis = 1)

# f0_ has the same value as commercialization, the other two shouldn't be used
df_columns_dropped = df_columns_dropped.drop(['f0_', 'centrality', 'similarity'], axis = 1)

In [7]:
# remove text as I can't compute min and max on it
text = df_columns_dropped[['abstract', 'description_text']] # putting them aside for later
df_columns_dropped.drop(['abstract', 'description_text'], axis=1, inplace=True)

In [8]:
print(f'missing values = {df_columns_dropped.isna().sum().sum()} ')# some missin values
df_no_missing = df_columns_dropped.fillna(df_columns_dropped.mean()).copy()
print(f'missing values after filling= {df_no_missing.isna().sum().sum()} ')

# extracting what we'll try to predict
y = df_no_missing['commercialized']
df_no_missing.drop('commercialized', axis= 1, inplace=True)

missing values = 88097 
missing values after filling= 0 


In [9]:
# dropping columns where all the value are the same (min = max) they would be zero if I apply min max rescaling
min_eq_max = df_no_missing.columns[df_no_missing.min() == df_no_missing.max()].to_list()
print(f'column with all same values: {min_eq_max}')
df_clean = df_no_missing.drop(min_eq_max, axis=1)

column with all same values: ['dummy_country_US', 'US']


In [10]:
X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42)

#rescale 
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [11]:
# without considering the text  

clf = LogisticRegression(random_state=0, max_iter=2000).fit(X_train_scaled, y_train)  
clf.score(X_test_scaled, y_test)  # accuracy 

0.8904326743752331

In [12]:
# putting text back in
df_clean[['abstract', 'description_text']] = text  

In [13]:

X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42)

# same vectorizer applyied to training and testing

# bag of words for abstract
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust 'max_features' as needed
X_train_ab = encode_text_colum(X_train, 'abstract', vectorizer)
X_test_ab = encode_text_colum(X_test, 'abstract', vectorizer)

# bag of words for description_text
vectorizer = TfidfVectorizer(max_features=1000)  # Adjust 'max_features' as needed
X_train_de = encode_text_colum(X_train_ab, 'description_text', vectorizer)
X_test_de = encode_text_colum(X_test_ab, 'description_text', vectorizer)


#rescale 
scaler = StandardScaler()
scaler.fit(X_train_de)
X_train_scaled = scaler.transform(X_train_de)
X_test_scaled = scaler.transform(X_test_de)

In [12]:
# with abstract and text description vectorized

clf = LogisticRegression(random_state=0, max_iter=2000).fit(X_train_scaled, y_train)  
clf.score(X_test_scaled, y_test)  # accuracy 

0.7566206639313688

## LLM - Bert

In [12]:

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Define your dataset class
class CustomDataset(Dataset):
    def __init__(self, features, texts, labels, tokenizer, max_length):
        self.features = features
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        text = list(self.texts.iloc[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        
        return {
            'features': torch.tensor(self.features.iloc[idx]),
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }



# TODO integrate this for use both features and text in bert

# class CustomBERTClassifier(torch.nn.Module):
#     def __init__(self, num_classes, feature_dim):
#         super(CustomBERTClassifier, self).__init__()
#         self.bert = BertModel.from_pretrained('bert-base-uncased', num_labels=num_classes)
#         self.fc = torch.nn.Linear(feature_dim, 128)  # Adjust as needed
#         self.final_layer = torch.nn.Linear(128, num_classes)

#     def forward(self, input_ids, attention_mask, features):
#         outputs = self.bert(input_ids, attention_mask=attention_mask)
#         pooled_output = outputs.pooler_output

#         # Concatenate BERT output with additional features
#         combined_features = torch.cat([pooled_output, features], dim=1)

#         # Fully connected layer
#         combined_features = torch.relu(self.fc(combined_features))

#         # Final classification layer
#         logits = self.final_layer(combined_features)

#         return logits


# Define your model
class CustomBERTClassifier(torch.nn.Module):
    def __init__(self, num_classes):
        super(CustomBERTClassifier, self).__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        return outputs.logits


In [13]:
df_clean[['abstract', 'description_text']] = text  # incorporate text
X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20, random_state=42)

In [14]:
# load model

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device  = 'cpu'

# Set up your data and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
max_length = 128  # Adjust as needed
num_classes = 2  # Binary classification

# reset indexes, otherwise it doesn't work
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

features = X_train.drop(['abstract', 'description_text'], axis = 1)
texts = X_train[['abstract', 'description_text']]

dataset = CustomDataset(features, texts, y_train, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

model = CustomBERTClassifier(num_classes) #, len(features.columns))
model.to(device)  # Move the model to GPU if available

print('loaded')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


loaded


In [15]:

# Set up your optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
num_epochs = 4  # Adjust as needed

for epoch in range(num_epochs):
    model.train()
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        features = batch['features'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
    
        # outputs = model(input_ids, attention_mask=attention_mask, features=features)
        outputs = model(input_ids, attention_mask=attention_mask) #, features=features)
        loss = torch.nn.functional.cross_entropy(outputs, labels)

        loss.backward()
        optimizer.step()

        # explicity memory release otherwise fills all RAM
        del input_ids, attention_mask, features, labels, outputs, loss



In [16]:
# prediction
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

features = X_test.drop(['abstract', 'description_text'], axis = 1)
texts = X_test[['abstract', 'description_text']]

dataset = CustomDataset(features, texts, y_test, tokenizer, max_length)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Make predictions for all elements in the dataset
all_predictions = []

with torch.no_grad():
    for batch in dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        logits = model(input_ids, attention_mask=attention_mask)
        predicted_classes = torch.argmax(logits, dim=1).tolist()

        all_predictions.extend(predicted_classes)

In [17]:
(all_predictions == y_test).sum() / len(y_test)

0.5040096978739277