# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np


# ---------------------------------------------------------------------------
# Data preparation: load the CSV, impute missing values, one-hot encode the
# categorical columns, scale, and split into train/test sets.
#
# Every statistic that is *learned* from data (numeric medians, one-hot
# categories, scaling factors) is fitted on the training rows only, so the
# held-out rows cannot leak into the features the model is evaluated on.
# ---------------------------------------------------------------------------
TARGET_COL = 'response_type_const'

df = pd.read_csv("rezume_full_merged.csv", sep=';')

# Drop the `id_cv` column (presumably a row identifier -- confirm) and keep
# only the first 10,000 rows.
df = df.drop(columns=['id_cv'])
df = df.head(10_000)

cat_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['number']).columns.drop(TARGET_COL)

# Decide the train/test row partition up front. Splitting positional indices
# lets the same partition be reused for fitting the transformers and for
# slicing the final matrices.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# A constant placeholder does not depend on any other row, so it can safely
# be applied to the whole frame.
df = df.fillna({col: 'unknown' for col in cat_cols})

# Medians are computed from training rows only, then used to fill both the
# training and the held-out rows. The target column is not part of
# `num_cols`, so it is left untouched.
train_medians = df.iloc[train_idx][num_cols].median()
df = df.fillna(train_medians)

X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# One-hot encode categorical columns; all remaining columns pass through.
# handle_unknown='ignore' encodes categories that appear only in held-out
# rows as all-zero vectors instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)
preprocessor.fit(X.iloc[train_idx])
X_processed = preprocessor.transform(X)

# with_mean=False: scale without centering, which also works when the
# encoded matrix is sparse.
scaler = StandardScaler(with_mean=False)
scaler.fit(X_processed[train_idx])
X_scaled = scaler.transform(X_processed)

X_train, X_test = X_scaled[train_idx], X_scaled[test_idx]
y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

# ---------------------------------------------------------------------------
# Fit a small random-forest regressor on the training split and report
# R2 and MSE on the held-out split.
# ---------------------------------------------------------------------------
forest_params = {
    "n_estimators": 10,
    "max_depth": 5,
    "min_samples_split": 5,
    "n_jobs": -1,          # use all available cores
    "random_state": 42,    # fixed seed for reproducible trees
}
model = RandomForestRegressor(**forest_params).fit(X_train, y_train)

y_pred = model.predict(X_test)

# Each metric is computed and printed in turn, rounded to three decimals.
for label, metric in (("R2", r2_score), ("MSE", mean_squared_error)):
    print(f"{label}: {metric(y_test, y_pred):.3f}")