In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np 
import json

In [3]:
# Choose which CSV to load: the full model-ready dataset or the ten-percent sample.
data_path = 'data/modelready_220423.csv'
# data_path = 'data/ten_percent.csv'
df = pd.read_csv(data_path)

In [4]:
# Compare the total column count with the number of numeric-dtype columns;
# only a few columns are non-numeric.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
n_total = len(df.columns)
n_numeric = len(df.select_dtypes(include=numerics).columns)
print(f'tot columns = {n_total}, numeric type columns = {n_numeric}' )
# Display the first row of the non-numeric (object dtype) columns.
df.select_dtypes(include=['object']).head(1)

tot columns = 772, numeric type columns = 765


Unnamed: 0,publication_number,company_name,countries_in_family,publn_nr,primary_cpc,abstract,description_text
0,US-8623043-B1,"Entellus Medical, Inc.",['AU' 'EP' 'CA' 'US'],8623043,A61M29/02,A method of treating a constricted sinus passa...,RELATED APPLICATIONS \n This Application i...


In [4]:
# Extract the unique country codes that appear in the df.
# `countries_in_family` holds strings such as "['AU' 'EP' 'CA' 'US']", so each row is
# parsed once into a set of codes; that parsed form is reused for both steps below.
country_sets = df['countries_in_family'].apply(
    lambda x: set(x.strip("[]").replace("'", "").split())
)
unique_values = set().union(*country_sets)

# Create one indicator column per country (1 if the patent belongs to the country, 0 otherwise).
# - Codes are visited in sorted order so the new columns land in the same order on every run;
#   iterating a set of strings directly is not order-stable across interpreter sessions.
# - Membership is tested against the parsed codes, not as a substring of the raw text.
# - Columns are assigned by name (rather than concatenated) so re-running the cell overwrites
#   the indicators instead of duplicating them.
for value in sorted(unique_values):
    df[value] = country_sets.apply(lambda codes: 1 if value in codes else 0)


In [5]:
# Integer-encode company names using pandas categorical codes.
company_categories = df['company_name'].astype('category')
df['company_name_encoded'] = company_categories.cat.codes

# Columns removed before modelling: the non-numeric ones, plus three that are
# highly correlated with `commercialized`.
non_numeric_columns = ['publication_number', 'company_name', 'countries_in_family', 'publn_nr',
                       'primary_cpc', 'abstract', 'description_text']
target_correlated_columns = ['f0_', 'centrality', 'similarity']
df_columns_dropped = df.drop(columns=non_numeric_columns + target_correlated_columns)

In [6]:
# Report how many cells are missing, then fill each one with its column mean.
# NOTE(review): the means are computed over all rows (this runs before the train/test
# split) and over every column, `commercialized` included - confirm that is intended.
n_missing_before = df_columns_dropped.isna().sum().sum()
print(f'missing values = {n_missing_before} ')
column_means = df_columns_dropped.mean()
df_no_missing = df_columns_dropped.fillna(column_means).copy()
n_missing_after = df_no_missing.isna().sum().sum()
print(f'missing values after filling= {n_missing_after} ')

# Separate the prediction target from the feature columns.
y = df_no_missing['commercialized']
df_no_missing = df_no_missing.drop(columns='commercialized')

missing values = 126935 
missing values after filling= 0 


In [7]:
# A column whose minimum equals its maximum holds a single value; list those
# columns and drop them from the feature frame.
col_min = df_no_missing.min()
col_max = df_no_missing.max()
is_constant = col_min == col_max
min_eq_max = df_no_missing.columns[is_constant].to_list()
print(f'column with all same values: {min_eq_max}')
df_clean = df_no_missing.drop(columns=min_eq_max)

column with all same values: ['dummy_country_US', 'US']


In [8]:
# Hold out 20% of the rows for testing, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    df_clean, y, test_size=0.20, random_state=42
)

# Standardise the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [10]:
# Fit a logistic-regression classifier on the scaled training data.
clf = LogisticRegression(random_state=0, max_iter=1000)
clf.fit(X_train_scaled, y_train)
# Accuracy on the held-out test split.
clf.score(X_test_scaled, y_test)

0.8764009471191792

In [17]:
# Number of rows where `f0_` equals the target, shown next to the total row count.
n_equal = (df['f0_'] == y).sum()
n_equal, len(df)

(63349, 63349)