In [45]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import RandomForestRegressor
import pickle

In [21]:
# Read the training and test CSV files that sit next to this notebook
# (training file first, then the test file).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./Train.csv', './Test.csv')
)

In [22]:
# Discard the ready-made 'Place_ID' / 'Date' columns so they can be rebuilt
# from the combined 'Place_ID X Date' key by the following cell.
#
# The drop only happens while the combined key is still present: once it has
# been split and removed, 'Place_ID' / 'Date' are the only copy of the keys,
# and dropping them on a re-run of this cell would break the later
# groupby('Place_ID').
has_combined_key = 'Place_ID X Date' in train_df.columns
has_split_keys = {'Place_ID', 'Date'}.issubset(train_df.columns)
if has_combined_key and has_split_keys:
    train_df = train_df.drop(columns=['Place_ID', 'Date'])

In [23]:
# Split the combined "<place> X <date>" key into separate 'Place_ID' and
# 'Date' columns, placed where the combined column used to be.
combined_key = 'Place_ID X Date'
if combined_key in train_df.columns:
    key_parts = train_df[combined_key].str.split(' X ', expand=True)
    train_df[['Place_ID', 'Date']] = key_parts

    # Remember the combined column's position before removing it.
    idx = train_df.columns.get_loc(combined_key)
    train_df = train_df.drop(combined_key, axis=1)

    # Inserting 'Date' first and 'Place_ID' second at the same position
    # leaves the final order as: Place_ID, Date.
    for key_name in ('Date', 'Place_ID'):
        train_df.insert(idx, key_name, train_df.pop(key_name))

    # Parse the dates, then store them back as 'mm/dd/yy' strings.
    parsed_dates = pd.to_datetime(train_df['Date'])
    train_df['Date'] = parsed_dates.dt.strftime('%m/%d/%y')

In [24]:
# Impute missing numeric values in the training data.
# Step 1: within each Place_ID, replace NaNs in every numeric column with that
# place's median for the column.
place_groups = train_df.groupby('Place_ID')
numeric_cols = train_df.select_dtypes(include=np.number).columns
train_df[numeric_cols] = place_groups[numeric_cols].transform(lambda x: x.fillna(x.median()))

# Step 2: whatever is still missing (e.g. a column that is entirely NaN for a
# place, so its median is NaN too) is carried forward from the previous row.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
# NOTE: forward fill cannot fill NaNs that precede the first valid value.
train_df = train_df.ffill()

In [25]:
# Columns that must not be used as model inputs: the row identifiers plus the
# target and the other target_* columns.
non_feature_cols = [
    'Place_ID', 'Date',
    'target', 'target_min', 'target_max', 'target_variance', 'target_count',
]
X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df['target']

In [26]:
# Fit a univariate feature selector that scores each feature against the
# target with f_regression. With k='all' no column is discarded, so the
# selected matrix holds every feature of X_train.
selector = SelectKBest(score_func=f_regression, k='all')
selector.fit(X_train, y_train)
X_train_selected = selector.transform(X_train)

# Boolean mask of retained columns and the matching column names.
mask = selector.get_support()
X_train_selected_names = X_train.columns[mask]

In [27]:
# Hold out 20% of the labelled rows for evaluation. The trailing "s" names
# (X_trains / X_tests) are this internal split and are distinct from
# X_train (all labelled features) and X_test (the unlabelled test file).
holdout_split = train_test_split(
    X_train_selected,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_trains, X_tests, y_trains, y_tests = holdout_split

In [28]:
# Learn the standardisation parameters from the training part of the split
# only, then apply them to that same training part.
scaler = StandardScaler().fit(X_trains)
X_train_scaled = scaler.transform(X_trains)

In [30]:
# Impute missing numeric values in the test data, mirroring the training data.
# NOTE: this rebinds place_groups and numeric_cols to their test_df versions.
# Step 1: within each Place_ID, replace NaNs in every numeric column with that
# place's median for the column.
place_groups = test_df.groupby('Place_ID')
numeric_cols = test_df.select_dtypes(include=np.number).columns
test_df[numeric_cols] = place_groups[numeric_cols].transform(lambda x: x.fillna(x.median()))

# Step 2: carry the previous row's value forward into anything still missing.
# DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated in recent pandas releases.
# NOTE: forward fill cannot fill NaNs that precede the first valid value.
test_df = test_df.ffill()

In [31]:
# Feature matrix for the unlabelled test file: everything except the row
# identifier columns.
test_id_cols = ['Place_ID X Date', 'Place_ID', 'Date']
X_test = test_df.drop(columns=test_id_cols)


In [32]:
# Prepare the hold-out part of the split for evaluation.
#
# X_tests was cut out of X_train_selected, i.e. it has ALREADY been through
# the selector. Passing it through selector.transform() a second time only
# works by accident while k='all'; with any smaller k the selector would
# reject it because the feature count no longer matches what it was fitted
# on. The hold-out features are therefore used as they are.
X_test_selected = X_tests

# Scale with the parameters learned from the training part of the split.
X_test_scaled = scaler.transform(X_test_selected)



In [33]:
# Train a 500-tree random forest (fixed seed for repeatable results) on the
# scaled training split; fit() returns the fitted estimator itself.
rf = RandomForestRegressor(
    n_estimators=500,
    random_state=123,
).fit(X_train_scaled, y_trains)

# Predictions for the hold-out rows, scored in the metrics cell.
y_pred = rf.predict(X_test_scaled)

In [46]:
# Optional step (currently disabled): persist the fitted random forest `rf`
# to 'rf_model.pkl' with pickle. Uncomment to write the file.
# with open('rf_model.pkl', 'wb') as file:
#     pickle.dump(rf, file)

In [42]:
# Score the hold-out predictions: MSE, its square root (RMSE) and R^2.
mse = mean_squared_error(y_tests, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_tests, y_pred)

# Print one "<label> <value>" line per metric, in a fixed order.
metric_report = (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R^2 Score:", r2),
)
for metric_label, metric_value in metric_report:
    print(metric_label, metric_value)

Mean Squared Error: 770.2366956261453
Root Mean Squared Error: 27.75313848245177
R^2 Score: 0.6499088413635425


In [39]:
# Push the unlabelled test features (X_test) through the fitted selector
# and then through the fitted scaler.
# Despite the "valid" in the names, these are the rows of the test file, not
# the hold-out split used for the metrics.
X_valid = selector.transform(X_test)
X_valid_scaled = scaler.transform(X_valid)

In [40]:
# Predict the target for every row of the test file with the fitted forest.
test_pred = rf.predict(X_valid_scaled)

In [41]:
# Build the prediction table: the row identifier column from the test file
# followed by the predicted target, then write it to 'predictions.csv'
# without the index column.
output_df = test_df[['Place_ID X Date']].assign(target=test_pred)
output_df.to_csv('predictions.csv', index=False)