In [24]:
import pandas as pd
import numpy as np

In [25]:
data = pd.read_csv("train.csv")

In [26]:
data.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [27]:
data.shape

(230130, 6)

In [28]:
data.isna().sum()

id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64

In [29]:
data.columns

Index(['id', 'date', 'country', 'store', 'product', 'num_sold'], dtype='object')

In [30]:
#encoding of data (one hot)
data_encoded = pd.get_dummies(data, columns=['country', 'store', 'product'], drop_first=True)

In [31]:
# Convert the 'date' column to datetime format
data_encoded['date'] = pd.to_datetime(data_encoded['date'])

# Extract features from the 'date' column
data_encoded['year'] = data_encoded['date'].dt.year
data_encoded['month'] = data_encoded['date'].dt.month
data_encoded['day'] = data_encoded['date'].dt.day
data_encoded['day_of_week'] = data_encoded['date'].dt.dayofweek  # Monday=0, Sunday=6
data_encoded['is_weekend'] = (data_encoded['day_of_week'] >= 5).astype(int)

# Drop the original 'date' column (optional, if not needed)
data_encoded = data_encoded.drop('date', axis=1)

In [32]:
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [33]:
knn_imputer = KNNImputer(n_neighbors=5)
data_encoded['num_sold']=knn_imputer.fit_transform(data_encoded[['num_sold']])

In [34]:
data_encoded.isna().sum()

id                            0
num_sold                      0
country_Finland               0
country_Italy                 0
country_Kenya                 0
country_Norway                0
country_Singapore             0
store_Premium Sticker Mart    0
store_Stickers for Less       0
product_Kaggle                0
product_Kaggle Tiers          0
product_Kerneler              0
product_Kerneler Dark Mode    0
year                          0
month                         0
day                           0
day_of_week                   0
is_weekend                    0
dtype: int64

In [35]:
data_encoded['num_sold']

0          752.527382
1          973.000000
2          906.000000
3          423.000000
4          491.000000
             ...     
230125     466.000000
230126    2907.000000
230127    2299.000000
230128    1242.000000
230129    1622.000000
Name: num_sold, Length: 230130, dtype: float64

In [36]:
x=data_encoded.drop('num_sold',axis=1)
y=data_encoded['num_sold']

In [37]:
xtrain,xval,ytrain,yval = train_test_split(x, y, test_size=0.2, random_state=42)

In [38]:
model = RandomForestRegressor()
model.fit(xtrain,ytrain)
ypred = model.predict(xval)

In [39]:
from sklearn.metrics import root_mean_squared_error,r2_score
print(r2_score(yval,ypred))
print(root_mean_squared_error(yval,ypred))

0.9872841830756564
76.38617863138145


In [40]:
### TEST DATA TEST.CSV

In [46]:
test_data = pd.read_csv('test.csv')

#'date' column to datetime format and extract date features
test_data['date'] = pd.to_datetime(test_data['date'])
test_data['year'] = test_data['date'].dt.year
test_data['month'] = test_data['date'].dt.month
test_data['day'] = test_data['date'].dt.day
test_data['day_of_week'] = test_data['date'].dt.dayofweek
test_data['is_weekend'] = (test_data['day_of_week'] >= 5).astype(int)

test_data = test_data.drop('date', axis=1)

# one-hot encoding 
test_data_encoded = pd.get_dummies(test_data, columns=['country', 'store', 'product'], drop_first=True)

# Align the test data with training data (handle missing columns)
missing_cols = set(data_encoded.columns) - set(test_data_encoded.columns)
for col in missing_cols:
    test_data_encoded[col] = 0  # Add missing columns with default value

# Ensure column order matches training data
test_data_encoded = test_data_encoded[data_encoded.columns.drop('num_sold')]

# Predict using the trained model
predictions = model.predict(test_data_encoded)

final_predictions = pd.DataFrame({
    "id": test_data["id"],  
    "num_sold": predictions 
})
final_predictions.to_csv("final_predictions.csv", index=False)
print("Predictions saved to 'final_predictions.csv'.")

Predictions saved to 'final_predictions.csv'.
