In [1]:
%load_ext autoreload
%autoreload 2

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from scipy.stats import f_oneway
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score
from utilities import *
from sklearn.feature_selection import f_classif

In [3]:
# pick full or smaller version of dataset
df = pd.read_csv('data/modelready_220423.csv')

# Cleaning the dataset

In [4]:
# print non-numerical columns
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
print(f'tot columns = {len(df.columns)}, numeric type columns = {len(df.select_dtypes(include=numerics).columns)}' ) # not too many non-numeric columns
df.select_dtypes(include = ['object']).head(1)  

tot columns = 772, numeric type columns = 765


Unnamed: 0,publication_number,company_name,countries_in_family,publn_nr,primary_cpc,abstract,description_text
0,US-8623043-B1,"Entellus Medical, Inc.",['AU' 'EP' 'CA' 'US'],8623043,A61M29/02,A method of treating a constricted sinus passa...,RELATED APPLICATIONS \n This Application i...


In [5]:
# extract unique countries in the df
unique_values = set()
df['countries_in_family'].apply(lambda x: unique_values.update(x.strip("[]").replace("'", "").split())) 

# Create new columns for each unique value
for value in unique_values:
    # each country has a column (1 if the patent belong to the country 0 otherwise)
    df[value] = df['countries_in_family'].apply(lambda x: 1 if value in x else 0)


In [6]:
df = df[df.abstract.notna()].copy() # drop all samples without abstract
print('missing value in description text' , df.description_text.isna().sum()) # description_text doesn't have mssing vales

missing value in description text 0


In [7]:
# encode company names
df['company_name_encoded'] = df.company_name.astype('category').cat.codes  # encode companies

# remove non-numeric columns
df_columns_dropped = df.drop(['publication_number', 'company_name', 'countries_in_family', 'publn_nr','primary_cpc'], axis = 1)

# f0_ has the same value as commercialization, the other two shouldn't be used
df_columns_dropped = df_columns_dropped.drop(['f0_', 'centrality', 'similarity'], axis = 1)

In [8]:
# remove text as I can't compute min and max on it
text = df_columns_dropped[['abstract', 'description_text']] # putting them aside for later
df_columns_dropped.drop(['abstract', 'description_text'], axis=1, inplace=True)

In [9]:
# extracting what we'll try to predict
y = df_columns_dropped['commercialized']
df_columns_dropped.drop('commercialized', axis= 1, inplace=True)

In [10]:
# dropping columns where all the value are the same (only one unique value) they would be zero if I apply min max rescaling
nunique = df_columns_dropped.nunique()
cols_to_drop = nunique[nunique == 1].index
df_clean = df_columns_dropped.drop(cols_to_drop, axis=1)

In [11]:
# splitting train and test and trying best preprocessing on training set
# all preprocessing will be done on X_train and only in the end tested on X_test
X_train, X_test, y_train, y_test = train_test_split(df_clean, y, test_size=0.20)

***

# Tests to find any biases in the dataset

In [12]:
df

Unnamed: 0,publication_number,company_name,commercialized,vpm_patent_score,backward_citations_app,backward_citations_exa,forward_citations,total_nb_claims,nb_indep_claims,family_size,...,EG,JP,ZW,RU,TW,DO,IE,SM,CR,company_name_encoded
0,US-8623043-B1,"Entellus Medical, Inc.",0,0.000000,140,18,22,11,1,18,...,0,0,0,0,0,0,0,0,0,196
1,US-9192748-B2,"Entellus Medical, Inc.",0,0.000000,203,33,2,16,2,5,...,0,0,0,0,0,0,0,0,0,196
2,US-8888686-B2,"Entellus Medical, Inc.",0,0.000000,69,8,2,15,1,2,...,0,0,0,0,0,0,0,0,0,196
3,US-8986340-B2,"Entellus Medical, Inc.",0,0.000000,162,2,3,22,1,23,...,0,1,0,0,0,0,0,0,0,196
4,US-9320876-B2,"Entellus Medical, Inc.",0,0.000000,208,2,0,10,2,18,...,0,0,0,0,0,0,0,0,0,196
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63342,US-7357652-B1,Leviton,1,0.986301,75,12,11,29,3,5,...,0,0,0,0,0,0,0,0,0,326
63343,US-8958680-B2,Leviton,1,0.986301,15,8,2,19,1,7,...,0,0,0,0,0,0,0,0,0,326
63346,US-9515437-B2,Leviton,1,0.986301,64,17,0,19,3,1,...,0,0,0,0,0,0,0,0,0,326
63347,US-9551454-B2,Leviton,1,0.986301,1,64,0,37,5,2,...,0,0,0,0,0,0,0,0,0,326


## Exploring bias towards the **company** that filed the patent

In [14]:
## Find the 2 most frequent companies in the dataset

# Find the most frequent company in the dataset
total_rows = df.shape[0] 
most_frequent_companies = df['company_name'].value_counts()  # Count occurrences of each company
most_frequent_company = most_frequent_companies.idxmax()  # Most frequent company
most_frequent_occurrences = most_frequent_companies.max()  # Count of occurrences of the most frequent company

# Calculate percentage
most_frequent_percentage = (most_frequent_occurrences / total_rows) * 100

print(f"The percentage of '{most_frequent_company}' in the 'company_name' column is: {most_frequent_percentage:.2f}%")

# Find the second most frequent company in the dataset
second_most_frequent_company = most_frequent_companies.index[1]  # Second most frequent company
second_most_frequent_occurrences = most_frequent_companies.iloc[1]  # Count of occurrences of the second most frequent company

# Calculate percentage
second_most_frequent_percentage = (second_most_frequent_occurrences / total_rows) * 100

print(f"The percentage of '{second_most_frequent_company}' in the 'company_name' column is: {second_most_frequent_percentage:.2f}%")

# Find the third most frequent company in the dataset
third_most_frequent_company = most_frequent_companies.index[2]  # Second most frequent company
third_most_frequent_occurrences = most_frequent_companies.iloc[2]  # Count of occurrences of the second most frequent company

# Calculate percentage
third_most_frequent_percentage = (third_most_frequent_occurrences / total_rows) * 100

print(f"The percentage of '{third_most_frequent_company}' in the 'company_name' column is: {third_most_frequent_percentage:.2f}%")



The percentage of 'Abbott' in the 'company_name' column is: 7.77%
The percentage of 'BlackBerry' in the 'company_name' column is: 7.70%
The percentage of 'Nike' in the 'company_name' column is: 6.34%


In [24]:
## Find the most frequent companies in the dataset

# Adjust the range as desired
n = 6

total_rows = df.shape[0] 
most_frequent_companies = df['company_name'].value_counts()  # Count occurrences of each company

for i in range(n):
    most_frequent_company = most_frequent_companies.index[i]  # Most frequent company
    most_frequent_occurrences = most_frequent_companies.iloc[i]  # Count of occurrences

    # Calculate percentage
    most_frequent_percentage = (most_frequent_occurrences / total_rows) * 100

    print(f"The company named '{most_frequent_company}' appears {most_frequent_percentage:.2f}% of the times within the feature 'company_name")


The company named 'Abbott' appears 7.77% of the times within the feature 'company_name
The company named 'BlackBerry' appears 7.70% of the times within the feature 'company_name
The company named 'Nike' appears 6.34% of the times within the feature 'company_name
The company named 'Hyundai Motor Company' appears 5.08% of the times within the feature 'company_name
The company named 'VMware' appears 3.14% of the times within the feature 'company_name
The company named 'HOYA Surgical Optics' appears 3.01% of the times within the feature 'company_name


In [21]:
## Print the success rates of n of the most frequent companies within the DataFrame

company_names = most_frequent_companies.index

# Adjust n as desired
n = 14

# Initialize a counter for the loop
i = 0

for most_frequent_company in company_names:
    if i < n:
        # Filter rows in the DataFrame
        most_frequent_company_rows = df[df['company_name'] == most_frequent_company]

        # Get the indices of the rows corresponding to the most frequent company
        indices_of_most_frequent_company = most_frequent_company_rows.index.tolist()

        # Extract the corresponding y values for the most frequent company
        y_values_for_most_frequent_company = y[indices_of_most_frequent_company].tolist()

        # Calculate the percentage of '1's for the current company
        total_values = len(y_values_for_most_frequent_company)
        count_of_ones = y_values_for_most_frequent_company.count(1)
        percentage_of_ones = (count_of_ones / total_values) * 100

        print(f"The percentage of commercialized patents for the company named '{most_frequent_company}', is: {percentage_of_ones:.2f}%")

        i = i + 1
    else:
        break


The percentage of commercialized patents for the company named Abbott, is: 14.07%
The percentage of commercialized patents for the company named BlackBerry, is: 11.44%
The percentage of commercialized patents for the company named Nike, is: 0.06%
The percentage of commercialized patents for the company named Hyundai Motor Company, is: 0.00%
The percentage of commercialized patents for the company named VMware, is: 100.00%
The percentage of commercialized patents for the company named HOYA Surgical Optics, is: 0.68%
The percentage of commercialized patents for the company named Cree, is: 1.69%
The percentage of commercialized patents for the company named Citrix Systems, is: 100.00%
The percentage of commercialized patents for the company named HTC, is: 8.17%
The percentage of commercialized patents for the company named Johnson & Johnson Vision Care, Inc., is: 33.73%
The percentage of commercialized patents for the company named Immersion Corporation, is: 4.39%
The percentage of commer

***

## Exploring bias towards the **country** in which the patent has been commercialized

In [18]:
for i in df[df['countries_in_family'].apply(lambda x : 'EP' in x)]['countries_in_family']: 
    print(i)

['AU' 'EP' 'CA' 'US']
['JP' 'US' 'ES' 'HU' 'CN' 'EP' 'PL' 'DK']
['EP' 'CA' 'AU' 'US']
['US' 'DK' 'ES' 'PL' 'CN' 'JP' 'HU' 'EP']
['JP' 'PL' 'CN' 'DK' 'HU' 'ES' 'EP' 'US']
['US' 'CN' 'EP' 'JP']
['CN' 'EP' 'US' 'JP' 'HU' 'DK' 'ES' 'PL']
['US' 'CA' 'EP' 'AU']
['AU' 'CA' 'US' 'EP']
['CA' 'AU' 'EP' 'US']
['EP' 'CN' 'JP' 'US']
['CA' 'EP' 'US' 'AU']
['CA' 'JP' 'EP' 'CN' 'US' 'BR']
['JP' 'EP' 'CN' 'US']
['JP' 'EP' 'CA' 'CN' 'AU' 'US']
['RU' 'US' 'EP' 'CN']
['AU' 'CN' 'US' 'JP' 'CA' 'EP']
['JP' 'CN' 'BR' 'CA' 'EP' 'US']
['EP' 'CN' 'JP' 'US']
['EP' 'CN' 'US' 'RU']
['JP' 'EP' 'CN' 'US']
['JP' 'US' 'EP' 'CN']
['JP' 'CN' 'US' 'DE' 'AT' 'EP']
['CN' 'EP' 'US' 'JP']
['US' 'CA' 'EP' 'BR' 'CN' 'JP']
['US' 'JP' 'CN' 'AT' 'EP' 'DE']
['CN' 'EP' 'US']
['CN' 'US' 'CA' 'EP' 'JP']
['CN' 'EP' 'US']
['AT' 'US' 'CN' 'EP' 'DE' 'JP']
['EP' 'CN' 'US' 'RU']
['JP' 'CN' 'CA' 'EP' 'US' 'BR']
['EP' 'CN' 'JP' 'US']
['JP' 'CN' 'EP' 'US']
['EP' 'CN' 'US' 'RU']
['EP' 'US' 'CN']
['RU' 'EP' 'CN' 'US']
['US' 'EP' 'CN']
['EP' 'US

In [19]:
# Count occurrences of countries in the array named 'countries'

total_rows = df.shape[0]

countries = ['US', 'EP', 'CA', 'GB']

for country in countries:
    country_occurrences = df['countries_in_family'].str.count(country).sum()

    # Calculate percentage
    country_percentage = (country_occurrences / total_rows) * 100
    print(f"The percentage of {country} in the 'countries_in_family' column is: {country_percentage:.2f}%")


The percentage of US in the 'countries_in_family' column is: 100.00%
The percentage of EP in the 'countries_in_family' column is: 45.09%
The percentage of CA in the 'countries_in_family' column is: 23.95%
The percentage of GB in the 'countries_in_family' column is: 2.05%
