In [None]:
import pandas as pd
from classical_ml_pipeline import *
import numpy as np
import sklearn
from datetime import date, timedelta

## Preprocessing

In [None]:
def encode_date_for_regression(curr_date: pd.Timestamp, min_date: pd.Timestamp):
    """Express ``curr_date`` as a whole-day offset from ``min_date``.

    Parameters
    ----------
    curr_date : pd.Timestamp
        Date to encode.
    min_date : pd.Timestamp
        Reference date that maps to offset 0.

    Returns
    -------
    int
        The ``days`` component of ``curr_date - min_date``.
    """
    elapsed = curr_date - min_date
    return elapsed.days

def decode_date_for_regression(encoded_date: int, min_date: pd.Timestamp):
    """Turn a day offset back into a calendar date.

    Parameters
    ----------
    encoded_date : int
        Number of days after ``min_date``. NumPy scalars are accepted; any
        sub-day fraction is ignored when the offset is added to the date.
    min_date : pd.Timestamp
        Reference date the offset is counted from. An ISO-format string or a
        ``datetime.date`` is accepted as well.

    Returns
    -------
    datetime.date
        ``min_date`` shifted by ``encoded_date`` days.
    """
    # date.fromisoformat() only accepts strings, so the annotated pd.Timestamp
    # input used to raise TypeError. pd.Timestamp() normalises str / date /
    # Timestamp inputs to one type before the calendar date is taken.
    reference_day = pd.Timestamp(min_date).date()
    # timedelta() does not accept NumPy integer scalars; float() converts them
    # (and plain ints) to a type it does accept.
    return reference_day + timedelta(days=float(encoded_date))

In [2]:
# Load the pickled documents and give the gold-date column a code-friendly name.
# NOTE: unpickling can execute arbitrary code — only load 'data.pkl' from a trusted source.
df_dates = pd.read_pickle('data.pkl').rename(columns={'Gold published date': 'gold_date'})

In [3]:
# Keep only rows that have a gold date, and preserve each row's previous index
# label in an 'org_index' column.
df_dates = (
    df_dates
    .dropna(subset=['gold_date'])
    .reset_index()
    .rename(columns={'index': 'org_index'})
)

In [4]:
# Inspect the column labels of the frame.
df_dates.columns

Index(['org_index', 'doc_id', 'text version', 'published_datapolitics',
       'gold_date', 'text_emb', 'id_embedding'],
      dtype='object')

In [5]:
# Parse the day/month/year strings into datetimes; values that do not match the
# format become NaT because of errors='coerce'.
df_dates = df_dates.assign(
    gold_date=pd.to_datetime(df_dates['gold_date'], format="%d/%m/%Y", errors='coerce')
)

In [6]:
# Display the frame (pandas truncates the middle rows in the rendered output).
df_dates

Unnamed: 0,org_index,doc_id,text version,published_datapolitics,gold_date,text_emb,id_embedding
0,19,1564/bbd68_DELIB_13-02-2023.pdf,https://datapolitics-public.s3.gra.io.cloud.ov...,13/02/2023,2023-02-08,"[[0.3506675362586975, 0.10585138201713562, -0....","[0.016665556, -0.017514065, 0.0016669349, 0.01..."
1,20,1408/f18a3_001_-_ag_-_02_-_proces-verbal_du_co...,https://datapolitics-public.s3.gra.io.cloud.ov...,14/02/2023,2023-02-14,"[[-0.08785710483789444, 0.1865832507610321, 0....","[0.00803537, 0.0066660685, -0.0036376012, 2.84..."
2,21,1058/fb940_2023.020.pdf,https://datapolitics-public.s3.gra.io.cloud.ov...,17/02/2023,2023-02-21,"[[-0.2083577811717987, -0.17981112003326416, 0...","[-0.0026451289, 0.011896995, -0.0072226003, 0...."
3,22,6923/286725f7c8b8f01f9bf7471c9c3ebe724a69fdc4_...,https://datapolitics-public.s3.gra.io.cloud.ov...,10/02/2023,2023-02-10,"[[-1.5100464224815369e-05, 0.05117198824882507...","[0.008383702, 0.0057615563, 0.005762536, -0.01..."
4,23,1630/948cddab8040efebdb3d903ff5423d78b91411e3_...,https://datapolitics-public.s3.gra.io.cloud.ov...,28/02/2023,2023-03-10,"[[0.03747975826263428, -0.027538297697901726, ...","[0.0036663008, 0.0010919707, 0.0034487455, -0...."
...,...,...,...,...,...,...,...
339,495,2490/1bf3b3c8d457ab37c24d26b84e91ddba8673d804_...,https://datapolitics-public.s3.gra.io.cloud.ov...,15/02/2024,2024-02-15,"[[-0.058397769927978516, 0.057436149567365646,...","[-0.0026847352, -0.0034247055, -0.00063712324,..."
340,496,6238/24d8a2f8cfd1989d316a84435f308170f1ba9fcc_...,https://datapolitics-public.s3.gra.io.cloud.ov...,24/01/2024,2024-01-24,"[[-0.08785710483789444, 0.1865832507610321, 0....","[0.011372219, 0.0011042445, -0.00055537187, -0..."
341,497,6812/a18ebaf196fccbea780f909e3508d7d4cb14bf6d_...,https://datapolitics-public.s3.gra.io.cloud.ov...,09/01/2024,2024-01-09,"[[0.035555072128772736, -0.03525448963046074, ...","[-0.001806432, 0.009203039, 0.006748648, 0.007..."
342,498,6834/594b09f7dd530aa0245edaf6193cb6238cd31659_...,https://datapolitics-public.s3.gra.io.cloud.ov...,22/11/2022,2022-11-22,"[[-0.016222871840000153, 0.015525649301707745,...","[0.004428829, -0.008365971, -0.0024129075, 0.0..."


In [7]:
# Discard rows whose gold date is missing (NaT), renumber the rows from 0,
# then show the earliest remaining date.
df_dates = df_dates.loc[df_dates['gold_date'].notna()].reset_index(drop=True)
df_dates['gold_date'].min()

Timestamp('1905-07-15 00:00:00')

In [8]:
# Remove implausibly old gold dates (the recorded run contained 1905-07-15).
# Filtering against a fixed cutoff, rather than dropping whatever the current
# minimum happens to be, keeps this cell idempotent: re-running it no longer
# strips the earliest genuine date as well.
# NOTE(review): the cutoff is a judgement call — lower it if older documents are expected.
MIN_PLAUSIBLE_GOLD_DATE = pd.Timestamp('2000-01-01')
df_dates = df_dates[df_dates['gold_date'] >= MIN_PLAUSIBLE_GOLD_DATE].reset_index(drop=True)

In [9]:
# Span between the latest and the earliest gold date, in days.
(df_dates['gold_date'].max() - df_dates['gold_date'].min()).days

4035

In [10]:
# Re-check the earliest gold date; it is the reference point for the day-offset encoding.
df_dates['gold_date'].min()

Timestamp('2013-03-25 00:00:00')

In [11]:
# Encode every gold date as a day offset from the earliest gold date.
# The minimum is computed once here; inside the lambda it was re-evaluated
# (a full column scan) for every row.
min_gold_date = df_dates['gold_date'].min()
df_dates['encoded_gold_date'] = df_dates['gold_date'].apply(
    lambda gold_date: encode_date_for_regression(gold_date, min_gold_date)
)

In [12]:
# Collapse each row's 'text_emb' value (a nested list in the displayed frame)
# into a single vector by summing along axis 0.
df_dates['summed_text_emb'] = df_dates['text_emb'].apply(
    lambda emb_rows: np.sum(np.array(emb_rows), axis=0)
)

## Linear Regression

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

In [14]:
# Feature matrix: one summed embedding per row. Target: encoded day offset of the gold date.
X = np.array(df_dates['summed_text_emb'].to_list())
y = np.array(df_dates['encoded_gold_date'].to_list())
# Degree-4 polynomial expansion without the constant (bias) column.
poly = PolynomialFeatures(include_bias=False, degree=4)

In [15]:
# (n_samples, n_features) of the feature matrix.
X.shape

(341, 300)

In [16]:
# NOTE(review): X.reshape(-1, 1) flattens the feature matrix into a single column,
# so the expansion is applied to each scalar on its own (its powers 1..4). The
# returned array is only displayed — it is not assigned, and the regression
# further down is fit on the untransformed X.
poly.fit_transform(X.reshape(-1, 1))

array([[ 6.61654950e-01,  4.37787273e-01,  2.89664116e-01,
         1.91657696e-01],
       [ 1.08341125e+01,  1.17377994e+02,  1.27168640e+03,
         1.37775936e+04],
       [-3.63183425e-01,  1.31902200e-01, -4.79046929e-02,
         1.73981904e-02],
       ...,
       [ 1.09701775e+01,  1.20344793e+02,  1.32020374e+03,
         1.44828693e+04],
       [-6.72679046e+00,  4.52497099e+01, -3.04385317e+02,
         2.04753625e+03],
       [ 9.24373006e+00,  8.54465454e+01,  7.89844800e+02,
         7.30111212e+03]])

In [17]:
# Hold out 20% of the samples for evaluation. A fixed random_state makes the
# split — and therefore the score computed from it — reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [18]:
# Ordinary least-squares fit on the training split.
regression_model = LinearRegression()
regression_model.fit(X=X_train, y=y_train)

In [20]:
# Coefficient of determination (R^2) on the held-out split; a negative value
# means the fit is worse than always predicting the mean of y_test.
regression_model.score(X_test,y_test)

-8230.067593653894

## Classification

In [21]:
from sklearn.neural_network import MLPClassifier

In [22]:
# Split the gold date into separate classification targets. The vectorised
# .dt accessors replace a row-wise apply that built a Python list per row.
df_dates['year'] = df_dates['gold_date'].dt.year
df_dates['month'] = df_dates['gold_date'].dt.month
df_dates['day'] = df_dates['gold_date'].dt.day

In [23]:
# One MLP per date component. MLP training is stochastic (weight initialisation,
# batch shuffling), so a fixed random_state is set to make the fits repeatable.
day_clf = MLPClassifier(random_state=42)
month_clf = MLPClassifier(random_state=42)
year_clf = MLPClassifier(random_state=42)

### Year

In [24]:
# Features: summed embeddings. Target: year of the gold date.
X = np.array(df_dates['summed_text_emb'].to_list())
y_year = np.array(df_dates['year'].to_list())
# 80/20 split with a fixed random_state so the reported accuracy is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_year, test_size=0.20, random_state=42)

In [25]:
# Train the year classifier on the training split.
year_clf.fit(X_train, y_train)



In [26]:
# Mean accuracy of the year classifier on the held-out split.
year_clf.score(X_test, y_test)

0.43478260869565216

### Month

In [32]:
# Features: summed embeddings. Target: month of the gold date.
X = np.array(df_dates['summed_text_emb'].to_list())
y_month = np.array(df_dates['month'].to_list())
# 80/20 split with a fixed random_state so the reported accuracy is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_month, test_size=0.20, random_state=42)

In [33]:
# Train the month classifier on the training split.
month_clf.fit(X_train, y_train)



In [34]:
# Mean accuracy of the month classifier on the held-out split.
month_clf.score(X_test, y_test)

0.17391304347826086

### Day

In [35]:
# Features: summed embeddings. Target: day-of-month of the gold date.
X = np.array(df_dates['summed_text_emb'].to_list())
y_day = np.array(df_dates['day'].to_list())
# 80/20 split with a fixed random_state so the reported accuracy is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_day, test_size=0.20, random_state=42)

In [36]:
# Train the day classifier on the training split.
day_clf.fit(X_train, y_train)



In [37]:
# Mean accuracy of the day classifier on the held-out split.
day_clf.score(X_test, y_test)

0.11594202898550725