In [1]:
# NSS AARTHI

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder

# Load the raw training and test tables from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Parse the 'date' column into datetimes so the .dt accessors work later.
train_data['date'] = pd.to_datetime(train_data['date'])
test_data['date'] = pd.to_datetime(test_data['date'])

# Impute missing unit_price values.
# The fill value is learned from the training set only and reused for the test
# set: imputing the test set with its own median would make the two frames use
# different fill values and would let test-set statistics shape the features.
# It also keeps the test set fillable when every test unit_price is missing.
train_price_median = train_data['unit_price'].median()
train_data['unit_price'] = train_data['unit_price'].fillna(train_price_median)
test_data['unit_price'] = test_data['unit_price'].fillna(train_price_median)

# Derive calendar features and a one-row price lag for the training frame.
train_dates = train_data['date'].dt
train_data['day_of_week'] = train_dates.dayofweek
train_data['month'] = train_dates.month

# lag_1 is the unit_price of the preceding row in the frame's current row
# order; the first row has no predecessor and is filled with 0.
# NOTE(review): rows are not sorted by date or grouped by product here, so the
# "previous" row is whatever precedes it in train.csv - confirm this is intended.
previous_price = train_data['unit_price'].shift(1)
train_data['lag_1'] = previous_price.fillna(0)

# Integer-encode 'anarix_id'. The encoder is fitted on the training IDs only.
label_encoder = LabelEncoder()
train_data['anarix_id_encoded'] = label_encoder.fit_transform(train_data['anarix_id'])

# Encode the test IDs through the fitted classes rather than calling
# label_encoder.transform(): transform() raises ValueError as soon as the test
# set contains an ID that was absent from training. IDs seen in training get
# exactly the code transform() would give them; unseen (or missing) IDs are
# mapped to the sentinel -1, which the tree-based model can split on.
anarix_id_codes = {label: code for code, label in enumerate(label_encoder.classes_)}
test_data['anarix_id_encoded'] = (
    test_data['anarix_id'].map(anarix_id_codes).fillna(-1).astype('int64')
)

# Assemble the model inputs (X) and the regression target (y).
feature_columns = ['unit_price', 'day_of_week', 'month', 'lag_1', 'anarix_id_encoded']
X = train_data[feature_columns]

# Missing 'units' values are replaced with the median of the observed ones.
# NOTE(review): this trains on imputed labels; confirm that is preferred over
# dropping the rows whose target is missing.
units = train_data['units']
y = units.fillna(units.median())

# Hold out 20% of the rows for validation; the seed keeps the split repeatable.
# NOTE(review): the split is random, not chronological - confirm that is
# acceptable for date-indexed data.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = split_parts

# Fit a 100-tree random forest (fit() returns the fitted estimator itself).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Report the mean squared error on the held-out rows.
rf_predictions = rf_model.predict(X_val)
rf_mse = mean_squared_error(y_true=y_val, y_pred=rf_predictions)
print(f'Random Forest MSE: {rf_mse}')

# Build the same engineered columns on the test frame that the model was
# trained on: weekday, month, and the previous row's unit_price (0 for the
# first row, which has no predecessor).
test_dates = test_data['date'].dt
test_data['day_of_week'] = test_dates.dayofweek
test_data['month'] = test_dates.month
previous_test_price = test_data['unit_price'].shift(1)
test_data['lag_1'] = previous_test_price.fillna(0)

# Select the columns in exactly the order used when fitting the model.
test_feature_columns = ['unit_price', 'day_of_week', 'month', 'lag_1', 'anarix_id_encoded']
test_features = test_data[test_feature_columns]

# Score every test row with the fitted forest.
predictions = rf_model.predict(test_features)

# Keep the predictions on the test frame under the submission column name.
test_data['TARGET'] = predictions

# Write the two-column submission file; 'ID' is read from test_data as-is,
# so it must already be a column of test.csv.
submission_columns = ['ID', 'TARGET']
submission = test_data[submission_columns]
submission.to_csv('submissions[without].csv', index=False)


Random Forest MSE: 2358.3074562152146
