In [11]:
import pandas as pd
import glob

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import numpy as np

In [12]:
# Column dtypes enforced while reading so every file is parsed the same way.
dtypes = {
    "year": int,
    "country": str,
    "doc_number": str,  # read as text first to avoid mixed-type issues; filtered below
}

# Load the CSV files with the dtypes above.
# glob returns paths in arbitrary order; sorting keeps the row order of df_all
# stable across machines, which the seeded sampling further down depends on.
csv_files = sorted(glob.glob("../../raw_data/scrape_data/patents_foster_*.csv"))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(
        "No files match ../../raw_data/scrape_data/patents_foster_*.csv"
    )
dfs = [pd.read_csv(file, dtype=dtypes) for file in csv_files]
df_all = pd.concat(dfs, ignore_index=True)

# NOTE: the row count printed here is the count *before* the doc_number filter below.
print(f"Loaded {len(df_all)} rows from {len(csv_files)} files.")

# Keep only rows whose doc_number consists solely of digits. astype(str) turns
# missing values into the text "nan", which fails isdigit() and is dropped too.
df_all = df_all[df_all["doc_number"].astype(str).str.isdigit()]


Loaded 3659474 rows from 6 files.


In [13]:
# Tally the collected patents by year, in chronological order.
yearly_counts = df_all['year'].value_counts().sort_index()

# One row per calendar year from 1700 through 2024, whether or not data exists.
full_years = pd.DataFrame({'year': range(1700, 2025)})

# Left-join the tallies onto the full range; years without any rows get NaN.
df_year_summary = full_years.merge(
    yearly_counts.rename('patents_collected'),
    how='left',
    left_on='year',
    right_index=True,
)

# Years up to and including 1928 that have no rows are recorded as 0 instead
# of missing; later years keep NaN.
up_to_1928 = df_year_summary['year'] <= 1928
no_count = df_year_summary['patents_collected'].isna()
df_year_summary['patents_collected'] = df_year_summary['patents_collected'].mask(
    up_to_1928 & no_count, 0
)

# A year counts as "actual" when it is <= 1928 or has a collected count.
df_year_summary['is_actual'] = (
    up_to_1928 | df_year_summary['patents_collected'].notna()
)

# Placeholder column; filled in once the regression model has been fitted.
df_year_summary['patents_predicted'] = pd.NA

display(df_year_summary.tail(10))


Unnamed: 0,year,patents_collected,is_actual,patents_predicted
315,2015,,False,
316,2016,,False,
317,2017,,False,
318,2018,,False,
319,2019,,False,
320,2020,203389.0,True,
321,2021,,False,
322,2022,,False,
323,2023,,False,
324,2024,,False,


In [14]:
# Training set: every year flagged as having an actual (collected) count.
train_data = df_year_summary.loc[df_year_summary['is_actual']].copy()

# Feature matrix is a single column (the year); target is the collected count.
X_train = train_data['year'].to_numpy().reshape(-1, 1)
y_train = train_data['patents_collected'].to_numpy()

display(train_data)

Unnamed: 0,year,patents_collected,is_actual,patents_predicted
0,1700,0.0,True,
1,1701,0.0,True,
2,1702,0.0,True,
3,1703,0.0,True,
4,1704,0.0,True,
...,...,...,...,...
225,1925,121985.0,True,
226,1926,119905.0,True,
227,1927,119003.0,True,
228,1928,123749.0,True,


In [15]:

# Quadratic regression: expand the year into degree-2 polynomial features and
# fit an ordinary least-squares model on the years that have actual counts.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_train)

model = LinearRegression()
model.fit(X_poly, y_train)

# Predict every year in the summary (collected or not) in a single pass.
# This fills patents_predicted for all rows, so no second pass over the
# uncollected years is needed.
# NOTE(review): the fitted curve is not clipped, so predictions may be negative
# for some years; confirm downstream code only relies on them where sensible.
all_years = df_year_summary['year'].values.reshape(-1, 1)
X_all_poly = poly.transform(all_years)
df_year_summary['patents_predicted'] = model.predict(X_all_poly)


In [16]:
# Show the year summary, including the patents_predicted column filled by the fitted model.
display(df_year_summary)


Unnamed: 0,year,patents_collected,is_actual,patents_predicted
0,1700,0.0,True,15403.140578
1,1701,0.0,True,14759.560577
2,1702,0.0,True,14124.410605
3,1703,0.0,True,13497.690664
4,1704,0.0,True,12879.400752
...,...,...,...,...
320,2020,203389.0,True,239726.260534
321,2021,,False,241780.290065
322,2022,,False,243842.749626
323,2023,,False,245913.639216


In [17]:
# Per-year population to sample from: the collected count where the year is
# flagged as actual, otherwise the model's prediction (0 if that column is absent).
df_year_summary['patents_total'] = df_year_summary['patents_collected'].where(
    df_year_summary['is_actual'],
    df_year_summary.get('patents_predicted', 0),
)

# Grand totals; sum() skips the NaN entries of uncollected years.
total_patents = df_year_summary['patents_total'].sum()
actual_patents = df_year_summary['patents_collected'].sum()

display(df_year_summary.tail(10))
print(f"total number of actaul patents: {actual_patents}")
print(f"Total number of patents to sample from: {total_patents}")
print(f"Percentage of patents collected: {actual_patents / total_patents * 100:.2f}%")

Unnamed: 0,year,patents_collected,is_actual,patents_predicted,patents_total
315,2015,,False,229582.563327,229582.563327
316,2016,,False,231594.442709,231594.442709
317,2017,,False,233614.75212,233614.75212
318,2018,,False,235643.491562,235643.491562
319,2019,,False,237680.661033,237680.661033
320,2020,203389.0,True,239726.260534,203389.0
321,2021,,False,241780.290065,241780.290065
322,2022,,False,243842.749626,243842.749626
323,2023,,False,245913.639216,245913.639216
324,2024,,False,247992.958836,247992.958836


total number of actaul patents: 3618039.0
Total number of patents to sample from: 18908421.97568725
Percentage of patents collected: 19.13%


In [23]:
sample_rate = 0.01  # 1% base sample rate


def _sample_amount(total, rate=sample_rate, floor=100):
    """Return how many patents to sample for a year with ``total`` patents.

    Years with fewer than ``floor`` patents are taken in full (truncated to an
    integer). Larger years contribute ``ceil(total * rate)`` patents, but never
    fewer than ``floor``. Negative totals, which an extrapolated regression can
    produce, are treated as 0 so the amount is never negative.
    """
    total = max(total, 0)
    if total < floor:
        return int(total)
    return max(floor, int(np.ceil(total * rate)))


df_year_summary['sample_amount'] = df_year_summary['patents_total'].apply(_sample_amount)

# Rows 225-234 straddle the switch from collected to predicted years.
display(df_year_summary.iloc[225:235])
print(f"Total amount of samples: {df_year_summary['sample_amount'].sum()}")
print(f"samples not collected: {df_year_summary.loc[~df_year_summary['is_actual'], 'sample_amount'].sum()}")

Unnamed: 0,year,patents_collected,is_actual,patents_predicted,patents_total,sample_amount
225,1925,121985.0,True,83034.390937,121985.0,1220
226,1926,119905.0,True,84287.567638,119905.0,1200
227,1927,119003.0,True,85549.174369,119003.0,1191
228,1928,123749.0,True,86819.211129,123749.0,1238
229,1929,,False,88097.67792,88097.67792,881
230,1930,,False,89384.57474,89384.57474,894
231,1931,,False,90679.90159,90679.90159,907
232,1932,,False,91983.658469,91983.658469,920
233,1933,,False,93295.845379,93295.845379,933
234,1934,,False,94616.462318,94616.462318,947


Total amount of samples: 191733
samples not collected: 152946


In [29]:
# Years without collected data, i.e. those whose totals come from the model.
uncollected_rows = df_year_summary.loc[~df_year_summary['is_actual']]

# Only the year and its sample quota are exported.
uncollected_years_amount = uncollected_rows.loc[:, ['year', 'sample_amount']]

uncollected_years_amount.to_csv('uncollected_years_amount.csv', index=False)


In [21]:
sampled_dfs = []  # per-year samples, concatenated after the loop

# Sample quota per year, restricted to the years that have collected data.
sample_amount_by_year = (
    df_year_summary.loc[df_year_summary['is_actual']]
    .set_index('year')['sample_amount']
)

# Split df_all by year once instead of re-filtering the whole frame for every
# year. Groups are visited in ascending year order and keep their original row
# order, so the seeded sample selects the same rows as a per-year boolean filter.
for year, year_patents in df_all.groupby('year'):
    # Years outside the summary or without a quota get 0; never request more
    # rows than the year actually has (sampling is without replacement).
    n_samples = min(int(sample_amount_by_year.get(year, 0)), len(year_patents))

    if n_samples > 0:
        sampled = year_patents.sample(n=n_samples, replace=False, random_state=42)
        sampled_dfs.append(sampled)

# Combine all samples into a single DataFrame. pd.concat raises on an empty
# list, so fall back to an empty frame with df_all's columns.
if sampled_dfs:
    sampled_df = pd.concat(sampled_dfs, ignore_index=True)
else:
    sampled_df = df_all.iloc[0:0].copy()

# Save the sampled data to a CSV file.
sampled_df.to_csv('sampled_patents.csv', index=False)