In [48]:
import pandas as pd
import glob

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
import numpy as np

In [49]:
# Column dtypes enforced at read time so every CSV is parsed the same way.
dtypes = {
    "year": int,
    "country": str,
    "doc_number": str  # Treat doc_number as a string initially to avoid mixed type issues
}

# Collect the scraped patent CSVs. glob yields paths in a filesystem-dependent
# order, so sort them to keep the row order of df_all reproducible across runs.
csv_files = sorted(glob.glob("../../raw_data/scrape_data/patents_foster_*.csv"))
if not csv_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(
        "No files matched ../../raw_data/scrape_data/patents_foster_*.csv"
    )

# Load each file with the specified dtypes and stack them into a single frame
# with a fresh 0..n-1 index.
dfs = [pd.read_csv(file, dtype=dtypes) for file in csv_files]
df_all = pd.concat(dfs, ignore_index=True)

print(f"Loaded {len(df_all)} rows from {len(csv_files)} files.")


Loaded 3416996 rows from 5 files.


In [50]:
# Year boundaries used to build the per-year summary table.
FIRST_YEAR = 1700        # first year in the summary
LAST_YEAR = 2024         # last year in the summary (inclusive)
LAST_ACTUAL_YEAR = 1928  # years up to and including this one are flagged as actual

# Count patents per year from the collected data, ordered by year.
yearly_counts = df_all['year'].value_counts().sort_index()

# One row per year over the full range, whether or not any patents were collected.
full_years = pd.DataFrame({'year': range(FIRST_YEAR, LAST_YEAR + 1)})

# Left-merge the counts onto the full range; years with no collected rows get NaN.
df_year_summary = full_years.merge(
    yearly_counts.rename('patents_collected'),
    how='left',
    left_on='year',
    right_index=True
)

# Flag every year up to and including LAST_ACTUAL_YEAR as actual. The flag is
# based purely on the year, not on whether a count was found for it.
# (The mask name says "pre1928" but the comparison is inclusive.)
mask_pre1928 = df_year_summary['year'] <= LAST_ACTUAL_YEAR
df_year_summary['is_actual'] = mask_pre1928

# Within the actual period, a missing count is stored as zero rather than NaN.
# Later years are left as NaN.
missing_actual = mask_pre1928 & df_year_summary['patents_collected'].isna()
df_year_summary.loc[missing_actual, 'patents_collected'] = 0

# Placeholder column (object dtype while it only holds pd.NA).
df_year_summary['patents_predicted'] = pd.NA

# Spot-check a random handful of rows.
display(df_year_summary.sample(10))


Unnamed: 0,year,patents_collected,is_actual,patents_predicted
42,1742,0.0,True,
33,1733,0.0,True,
319,2019,,False,
216,1916,72447.0,True,
285,1985,,False,
141,1841,509.0,True,
298,1998,,False,
17,1717,0.0,True,
121,1821,0.0,True,
4,1704,0.0,True,


In [51]:
# Inspect the dtype of each column in the per-year summary.
print(str(df_year_summary.dtypes))

year                   int64
patents_collected    float64
is_actual               bool
patents_predicted     object
dtype: object


In [53]:
# Training set: every row flagged is_actual. The column is already boolean,
# so it is used directly as the row mask.
train_data = df_year_summary[df_year_summary['is_actual']]

# Single feature (year) as an (n, 1) column; target is the collected patent count.
X_train = train_data['year'].to_numpy().reshape(-1, 1)
y_train = train_data['patents_collected'].to_numpy()

# Expand year into degree-2 polynomial features for a quadratic fit.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_train)

# Ordinary least squares on the polynomial features.
model = LinearRegression()
model.fit(X_poly, y_train)

# Predict once for every year in the summary. This fills patents_predicted for
# both the training years and the later years, so no second pass over the
# non-actual rows is needed.
all_years = df_year_summary['year'].to_numpy().reshape(-1, 1)
X_all_poly = poly.transform(all_years)
df_year_summary['patents_predicted'] = model.predict(X_all_poly)


In [54]:
# Print the plain-text rendering of the summary, now including the predictions.
print(str(df_year_summary))


     year  patents_collected  is_actual  patents_predicted
0    1700                0.0       True       18144.638628
1    1701                0.0       True       17417.889092
2    1702                0.0       True       16700.370509
3    1703                0.0       True       15992.082877
4    1704                0.0       True       15293.026196
..    ...                ...        ...                ...
320  2020                NaN      False      256732.555793
321  2021                NaN      False      258959.710762
322  2022                NaN      False      261196.096682
323  2023                NaN      False      263441.713555
324  2024                NaN      False      265696.561378

[325 rows x 4 columns]
