Importing the dataset from 2 CSV

In [2]:
import pandas as pd
from pathlib import Path

EXPECTED_COLUMNS = [
    "patent_id",
    "patent_number",
    "grant_date",
    "grant_year",
    "title",
    "abstract",
    "num_claims",
    "patent_country",
    "assignee_type",
    "assignee_name",
    "forward_citations",
    "backward_citations",
    "cpc_section",
    "cpc_subsection",
    "citation_decile",
    "high_impact"
]

# Loading Datasets
data_folder = Path('content')
csv_paths = sorted(data_folder.glob("patent_sample_*.csv"))
print(f"Found {len(csv_paths)} files")
for path in csv_paths:
    print(f"  {path.name}")

Found 2 files
  patent_sample_000000000000.csv
  patent_sample_000000000001.csv


Modifying the dataset 

In [3]:
# Checking the numbers of columns in CSV files
for path in csv_paths:
    first_chunk = pd.read_csv(path, header=None, nrows=1, engine="python")
    column_count = first_chunk.shape[1]
    print(f"{path.name}: {column_count} columns (expected {len(EXPECTED_COLUMNS)})")

print("Expected column order:")
print(EXPECTED_COLUMNS)

patent_sample_000000000000.csv: 16 columns (expected 16)
patent_sample_000000000001.csv: 16 columns (expected 16)
Expected column order:
['patent_id', 'patent_number', 'grant_date', 'grant_year', 'title', 'abstract', 'num_claims', 'patent_country', 'assignee_type', 'assignee_name', 'forward_citations', 'backward_citations', 'cpc_section', 'cpc_subsection', 'citation_decile', 'high_impact']


In [4]:
# Load CSV files with the expected schema
frames = []
for path in csv_paths:
    df = pd.read_csv(
        path,
        header=None,
        names=EXPECTED_COLUMNS,
        usecols=range(len(EXPECTED_COLUMNS)),
        engine="python"
    )
    frames.append(df)

full_df = pd.concat(frames, ignore_index=True)
print(full_df.shape)
display(full_df.head())

(200000, 16)


Unnamed: 0,patent_id,patent_number,grant_date,grant_year,title,abstract,num_claims,patent_country,assignee_type,assignee_name,forward_citations,backward_citations,cpc_section,cpc_subsection,citation_decile,high_impact
0,6017303,6017303,2000-01-25,2000,Underpants with a stamina reinforcing mechanis...,An underpants with a stamina reinforcing mech...,13,US,,,0,5,A,A61,10,0
1,6017541,6017541,2000-01-25,2000,Immunogens for the production of cocaine-hydro...,Methods are described for the rapid synthesis...,5,US,,,0,5,A,A61,10,0
2,6022706,6022706,2000-02-08,2000,Div1b,The invention provides Divlb polypeptides and...,23,US,,SMITHKLINE BEECHAM (CORK) LIMITED,0,1,A,A61,10,0
3,6022710,6022710,2000-02-08,2000,Nucleic acid encoding greA from Streptococcus ...,greA polypeptides and DNA (RNA) encoding such...,16,US,,SMITHKLINE BEECHAM (CORK) LIMITED,0,0,A,A61,10,0
4,6022978,6022978,2000-02-08,2000,Benzimidazole derivatives,Benzimidazole Derivatives having the formulae...,13,US,2.0,Pfizer Inc.,0,1,A,A61,10,0


In [5]:
print(full_df.shape)
full_df.head()

(200000, 16)


Unnamed: 0,patent_id,patent_number,grant_date,grant_year,title,abstract,num_claims,patent_country,assignee_type,assignee_name,forward_citations,backward_citations,cpc_section,cpc_subsection,citation_decile,high_impact
0,6017303,6017303,2000-01-25,2000,Underpants with a stamina reinforcing mechanis...,An underpants with a stamina reinforcing mech...,13,US,,,0,5,A,A61,10,0
1,6017541,6017541,2000-01-25,2000,Immunogens for the production of cocaine-hydro...,Methods are described for the rapid synthesis...,5,US,,,0,5,A,A61,10,0
2,6022706,6022706,2000-02-08,2000,Div1b,The invention provides Divlb polypeptides and...,23,US,,SMITHKLINE BEECHAM (CORK) LIMITED,0,1,A,A61,10,0
3,6022710,6022710,2000-02-08,2000,Nucleic acid encoding greA from Streptococcus ...,greA polypeptides and DNA (RNA) encoding such...,16,US,,SMITHKLINE BEECHAM (CORK) LIMITED,0,0,A,A61,10,0
4,6022978,6022978,2000-02-08,2000,Benzimidazole derivatives,Benzimidazole Derivatives having the formulae...,13,US,2.0,Pfizer Inc.,0,1,A,A61,10,0


Details of Dataset

In [6]:
print(full_df.shape)
full_df.info()
display(full_df.describe(include="all").transpose())
display(full_df.isna().sum().sort_values(ascending=False))

(200000, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   patent_id           200000 non-null  int64  
 1   patent_number       200000 non-null  int64  
 2   grant_date          200000 non-null  object 
 3   grant_year          200000 non-null  int64  
 4   title               200000 non-null  object 
 5   abstract            199985 non-null  object 
 6   num_claims          200000 non-null  int64  
 7   patent_country      200000 non-null  object 
 8   assignee_type       186048 non-null  float64
 9   assignee_name       187756 non-null  object 
 10  forward_citations   200000 non-null  int64  
 11  backward_citations  200000 non-null  int64  
 12  cpc_section         200000 non-null  object 
 13  cpc_subsection      200000 non-null  object 
 14  citation_decile     200000 non-null  int64  
 15  high_impact         2

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
patent_id,200000.0,,,,7855937.98644,919463.739742,6009574.0,7107124.0,8002465.0,8649876.0,9225365.0
patent_number,200000.0,,,,7855937.98644,919463.739742,6009574.0,7107124.0,8002465.0,8649876.0,9225365.0
grant_date,200000.0,835.0,2014-06-03,742.0,,,,,,,
grant_year,200000.0,,,,2009.733315,4.59878,2000.0,2006.0,2011.0,2014.0,2015.0
title,200000.0,76918.0,Modulators of ATP-binding cassette transporters,626.0,,,,,,,
abstract,199985.0,81970.0,Disclosed in certain embodiments is a controll...,607.0,,,,,,,
num_claims,200000.0,,,,21.98565,20.781183,1.0,11.0,18.0,26.0,505.0
patent_country,200000.0,1.0,US,200000.0,,,,,,,
assignee_type,186048.0,,,,2.310925,0.593913,0.0,2.0,2.0,3.0,15.0
assignee_name,187756.0,16474.0,"ETHICON ENDO-SURGERY, INC",4745.0,,,,,,,


assignee_type         13952
assignee_name         12244
abstract                 15
patent_id                 0
grant_year                0
title                     0
grant_date                0
patent_number             0
patent_country            0
num_claims                0
forward_citations         0
backward_citations        0
cpc_section               0
cpc_subsection            0
citation_decile           0
high_impact               0
dtype: int64

### Data Cleaning 

In [7]:
# Standardizing core data types
full_df["grant_date"] = pd.to_datetime(full_df["grant_date"], errors="coerce")
full_df["grant_year"] = pd.to_numeric(full_df["grant_year"], errors="coerce").astype("Int64")

numeric_columns = [
    "num_claims",
    "assignee_type",
    "forward_citations",
    "backward_citations",
    "citation_decile",
    "high_impact",
]

for col in numeric_columns:
    full_df[col] = pd.to_numeric(full_df[col], errors="coerce")

full_df[numeric_columns + ["grant_year"]].dtypes

num_claims              int64
assignee_type         float64
forward_citations       int64
backward_citations      int64
citation_decile         int64
high_impact             int64
grant_year              Int64
dtype: object

In [8]:
# Filling the missing data
full_df["num_claims"] = full_df["num_claims"].fillna(full_df["num_claims"].median()).astype("Int64")
assignee_type_median = full_df["assignee_type"].median()
full_df["assignee_type"] = (
    full_df["assignee_type"]
    .fillna(assignee_type_median)
    .round()
    .astype("Int64")
)

for col in ["forward_citations", "backward_citations", "citation_decile", "high_impact"]:
    full_df[col] = full_df[col].fillna(0).astype("Int64")

full_df["abstract"] = full_df["abstract"].fillna("No abstract available")
full_df["assignee_name"] = full_df["assignee_name"].fillna("Unknown Assignee")
full_df["title"] = full_df["title"].fillna("Untitled Patent")

full_df.isna().sum().sort_values(ascending=False).head()

patent_id        0
patent_number    0
grant_date       0
grant_year       0
title            0
dtype: int64

In [9]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   patent_id           200000 non-null  int64         
 1   patent_number       200000 non-null  int64         
 2   grant_date          200000 non-null  datetime64[ns]
 3   grant_year          200000 non-null  Int64         
 4   title               200000 non-null  object        
 5   abstract            200000 non-null  object        
 6   num_claims          200000 non-null  Int64         
 7   patent_country      200000 non-null  object        
 8   assignee_type       200000 non-null  Int64         
 9   assignee_name       200000 non-null  object        
 10  forward_citations   200000 non-null  Int64         
 11  backward_citations  200000 non-null  Int64         
 12  cpc_section         200000 non-null  object        
 13  cpc_subsection      200000 no

In [10]:
full_df.duplicated().sum()

np.int64(100505)

Dataset contains 50% duplicate data 😅

In [11]:
full_df = full_df.drop_duplicates().reset_index(drop=True)

In [12]:
full_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99495 entries, 0 to 99494
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   patent_id           99495 non-null  int64         
 1   patent_number       99495 non-null  int64         
 2   grant_date          99495 non-null  datetime64[ns]
 3   grant_year          99495 non-null  Int64         
 4   title               99495 non-null  object        
 5   abstract            99495 non-null  object        
 6   num_claims          99495 non-null  Int64         
 7   patent_country      99495 non-null  object        
 8   assignee_type       99495 non-null  Int64         
 9   assignee_name       99495 non-null  object        
 10  forward_citations   99495 non-null  Int64         
 11  backward_citations  99495 non-null  Int64         
 12  cpc_section         99495 non-null  object        
 13  cpc_subsection      99495 non-null  object    

In [13]:
# Final schema check
constant_candidates = ["patent_country", "cpc_section", "cpc_subsection"]
dropped_columns = [col for col in constant_candidates if full_df[col].nunique(dropna=False) <= 1]
if dropped_columns:
    full_df = full_df.drop(columns=dropped_columns)
    print(f"Dropped nearly-constant columns: {dropped_columns}")
else:
    print("No constant columns dropped.")

for col in ["forward_citations", "backward_citations", "citation_decile", "num_claims", "assignee_type", "high_impact"]:
    full_df[col] = full_df[col].astype("Int64")

summary_cols = [
    "patent_id",
    "patent_number",
    "grant_date",
    "grant_year",
    "title",
    "num_claims",
    "assignee_type",
    "assignee_name",
    "forward_citations",
    "backward_citations",
    "citation_decile",
    "high_impact"
]

print(full_df.shape)
display(full_df[summary_cols].head())

Dropped nearly-constant columns: ['patent_country', 'cpc_section', 'cpc_subsection']
(99495, 13)


Unnamed: 0,patent_id,patent_number,grant_date,grant_year,title,num_claims,assignee_type,assignee_name,forward_citations,backward_citations,citation_decile,high_impact
0,6017303,6017303,2000-01-25,2000,Underpants with a stamina reinforcing mechanis...,13,2,Unknown Assignee,0,5,10,0
1,6017541,6017541,2000-01-25,2000,Immunogens for the production of cocaine-hydro...,5,2,Unknown Assignee,0,5,10,0
2,6022706,6022706,2000-02-08,2000,Div1b,23,2,SMITHKLINE BEECHAM (CORK) LIMITED,0,1,10,0
3,6022710,6022710,2000-02-08,2000,Nucleic acid encoding greA from Streptococcus ...,16,2,SMITHKLINE BEECHAM (CORK) LIMITED,0,0,10,0
4,6022978,6022978,2000-02-08,2000,Benzimidazole derivatives,13,2,Pfizer Inc.,0,1,10,0


In [14]:
full_df['high_impact'].value_counts(normalize=True)

high_impact
0    0.807287
1    0.192713
Name: proportion, dtype: Float64

### Training the model 😁

In [15]:
from sklearn.model_selection import train_test_split

X = full_df.drop(columns=['high_impact'])
y = full_df['high_impact']

# target variable is imbalance so using stratify
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42, stratify=y)


In [16]:
# Scaling the features 
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numeric_features = ["num_claims","assignee_type","forward_citations","backward_citations","citation_decile","grant_year"]
X_train_scale = scaler.fit_transform(X_train[numeric_features])
X_test_scale = scaler.transform(X_test[numeric_features])

In [17]:
# training the logistic regression 
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=500, class_weight="balanced")
clf.fit(X_train_scale, y_train)

# Prediction 😉
y_pred = clf.predict(X_test_scale)

In [18]:
# Evalute the performance 
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[16064     0]
 [    0  3835]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     16064
         1.0       1.00      1.00      1.00      3835

    accuracy                           1.00     19899
   macro avg       1.00      1.00      1.00     19899
weighted avg       1.00      1.00      1.00     19899



### Model Overfits 

its time to find out the feature which is leaking the data 😎

In [19]:
pd.crosstab(full_df["high_impact"], full_df["citation_decile"])

citation_decile,1,2,3,4,5,6,7,8,9,10
high_impact,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,6515,7285,8008,8689,9258,9777,10307,10153,10329
1,19174,0,0,0,0,0,0,0,0,0


In [20]:
# Step 1: numeric correlation with target
numeric_cols = full_df.select_dtypes(include="number").columns.drop("high_impact")
target_numeric = full_df["high_impact"].astype(int)

numeric_corr = (
    full_df[numeric_cols]
    .corrwith(target_numeric)
    .abs()
    .sort_values(ascending=False)
)
print(numeric_corr.head(10))

citation_decile       0.683996
forward_citations     0.588697
backward_citations    0.248942
num_claims            0.151638
assignee_type         0.147176
patent_id             0.045121
patent_number         0.045121
grant_year            0.038903
dtype: float64


In [21]:
# Step 2: columns whose values map to only one class (small-cardinality leak)
leaking_cols = []
for col in full_df.columns:
    if col == "high_impact":
        continue
    class_per_value = full_df.groupby(col)["high_impact"].nunique(dropna=False)
    share_single_class = (class_per_value == 1).mean()
    if share_single_class == 1.0 and class_per_value.size <= 30:
        leaking_cols.append(col)

print("Perfect leaks:", leaking_cols)

Perfect leaks: ['citation_decile']


Got the criminals😎- citation_decile is a perfect leak (each value maps to a single class), and forward_citations/backward_citations generate citation_decile

In [22]:
# Training data with excluded columns which are leaking data 

leak_cols = ["citation_decile", "forward_citations", "backward_citations"]
X = full_df.drop(columns=["high_impact"] + leak_cols)
y = full_df["high_impact"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

numeric_features = ["num_claims", "assignee_type", "grant_year"]
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_scaled = scaler.transform(X_test[numeric_features])

clf = LogisticRegression(max_iter=500, class_weight="balanced")
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[8522 7542]
 [1092 2743]]
              precision    recall  f1-score   support

         0.0       0.89      0.53      0.66     16064
         1.0       0.27      0.72      0.39      3835

    accuracy                           0.57     19899
   macro avg       0.58      0.62      0.53     19899
weighted avg       0.77      0.57      0.61     19899



Basic model is ready!🥳
Lets, increase the F1-score 

In [23]:
# Lets try RandomForest 
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_leaf=5,
    class_weight="balanced_subsample",
    random_state=42,
    n_jobs=-1
)
clf.fit(X_train[numeric_features], y_train)
y_pred = clf.predict(X_test[numeric_features])

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[9179 6885]
 [1324 2511]]
              precision    recall  f1-score   support

         0.0       0.87      0.57      0.69     16064
         1.0       0.27      0.65      0.38      3835

    accuracy                           0.59     19899
   macro avg       0.57      0.61      0.54     19899
weighted avg       0.76      0.59      0.63     19899



As we have removed 3 columns out of 7 columns, we are getting low score as our model is training on just 4 features. 

In [25]:
# Increasing the columns 
drop_cols = ["high_impact", "patent_id", "patent_number", "grant_date"] + leak_cols
X = full_df.drop(columns=drop_cols)
y = full_df["high_impact"]

# Grouping the features  
text_cols = ["title", "abstract"]
num_cols = ["num_claims", "assignee_type", "grant_year"]
cat_cols = [c for c in X.select_dtypes(include="object").columns if c not in text_cols]

In [None]:
# Data Preprocessing -
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("title_tfidf", TfidfVectorizer(max_features=2000, ngram_range=(1, 2)), "title"),
        ("abstract_tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), "abstract"),
    ],
    remainder="drop",
)


In [None]:
# Re-training the model by doing model preprocessing 

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

model = Pipeline(
    steps=[
        ("prep", preprocessor),
        ("clf", LogisticRegression(max_iter=2000, class_weight="balanced", solver="saga")),
    ]
)
model.fit(X_train, y_train)

from sklearn.metrics import classification_report, confusion_matrix
y_pred = model.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[13314  2750]
 [  732  3103]]
              precision    recall  f1-score   support

         0.0       0.95      0.83      0.88     16064
         1.0       0.53      0.81      0.64      3835

    accuracy                           0.83     19899
   macro avg       0.74      0.82      0.76     19899
weighted avg       0.87      0.83      0.84     19899



Model is ready with weighted-average F1 score - 0.84

In [None]:
# Lets, Try catboost to increase the F1 score further
from catboost import CatBoostClassifier, Pool

cat_features = [c for c in X.columns if X[c].dtype == "object"]
train_pool = Pool(X_train, y_train, cat_features=cat_features, text_features=["title", "abstract"])
test_pool = Pool(X_test, y_test, cat_features=cat_features, text_features=["title", "abstract"])

model = CatBoostClassifier(
    iterations=2000,
    depth=8,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="F1",
    auto_class_weights="Balanced",
    random_state=42,
    verbose=200
)

model.fit(train_pool, eval_set=test_pool, use_best_model=True)
y_pred = model.predict(test_pool)
print(classification_report(y_test, y_pred))

0:	learn: 0.5888543	test: 0.5832446	best: 0.5832446 (0)	total: 610ms	remaining: 20m 18s
200:	learn: 0.7949097	test: 0.8035191	best: 0.8037797 (198)	total: 1m 22s	remaining: 12m 17s
400:	learn: 0.8228985	test: 0.8151852	best: 0.8151852 (400)	total: 2m 44s	remaining: 10m 55s
600:	learn: 0.8423947	test: 0.8210383	best: 0.8210383 (600)	total: 5m 3s	remaining: 11m 46s
800:	learn: 0.8595884	test: 0.8226519	best: 0.8226519 (796)	total: 6m 19s	remaining: 9m 28s
1000:	learn: 0.8746497	test: 0.8235581	best: 0.8248435 (947)	total: 8m 10s	remaining: 8m 9s
1200:	learn: 0.8868105	test: 0.8267002	best: 0.8271634 (1178)	total: 9m 33s	remaining: 6m 21s
1400:	learn: 0.8989659	test: 0.8269035	best: 0.8279888 (1361)	total: 10m 49s	remaining: 4m 37s
1600:	learn: 0.9084333	test: 0.8273130	best: 0.8279888 (1361)	total: 12m 7s	remaining: 3m 1s
1800:	learn: 0.9159242	test: 0.8262827	best: 0.8280221 (1635)	total: 13m 27s	remaining: 1m 29s
1999:	learn: 0.9238667	test: 0.8263487	best: 0.8280221 (1635)	total: 14m 

### We have achieved Weighted F1 = 0.85 🥳. 
