# Import Necassary Libraries

In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
!kaggle competitions download -c playground-series-s5e1
!unzip playground-series-s5e1.zip

Downloading playground-series-s5e1.zip to /content
  0% 0.00/2.26M [00:00<?, ?B/s]
100% 2.26M/2.26M [00:00<00:00, 222MB/s]
Archive:  playground-series-s5e1.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [28]:
train_dataset = pd.read_csv('train.csv')
test_dataset = pd.read_csv('test.csv')

train_dataset.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


In [29]:
print(train_dataset.isnull().sum())
print(test_dataset.isnull().sum())
print(train_dataset.shape)
train_dataset.dropna(inplace=True)
print(train_dataset.shape)

id             0
date           0
country        0
store          0
product        0
num_sold    8871
dtype: int64
id         0
date       0
country    0
store      0
product    0
dtype: int64
(230130, 6)
(221259, 6)


In [35]:
train_dataset.drop(columns='id', inplace=True)
print(train_dataset.head())
print(train_dataset.shape)

AttributeError: 'numpy.ndarray' object has no attribute 'drop'

In [31]:
test_id = test_dataset['id']
test_dataset.drop(columns='id', inplace=True)
print(test_dataset.head())
print(test_dataset.shape)

         date country              store             product
0  2017-01-01  Canada  Discount Stickers   Holographic Goose
1  2017-01-01  Canada  Discount Stickers              Kaggle
2  2017-01-01  Canada  Discount Stickers        Kaggle Tiers
3  2017-01-01  Canada  Discount Stickers            Kerneler
4  2017-01-01  Canada  Discount Stickers  Kerneler Dark Mode
(98550, 4)


In [32]:
train_target = train_dataset["num_sold"]
train_dataset.drop(columns='num_sold', inplace=True)

In [33]:
le = LabelEncoder()
for col in train_dataset.columns:
    train_dataset[col] = le.fit_transform(train_dataset[col])
    test_dataset[col] = le.fit_transform(test_dataset[col])
scaler =  StandardScaler()
train_dataset = scaler.fit_transform(train_dataset)
test_dataset = scaler.fit_transform(test_dataset)
print(train_dataset)
print(train_dataset.shape)

[[-1.7394777  -1.48828177 -1.24696554 -0.78005813]
 [-1.7394777  -1.48828177 -1.24696554 -0.05731413]
 [-1.7394777  -1.48828177 -1.24696554  0.66542988]
 ...
 [ 1.72941626  1.44426553 -0.01400365 -0.05731413]
 [ 1.72941626  1.44426553 -0.01400365  0.66542988]
 [ 1.72941626  1.44426553 -0.01400365  1.38817388]]
(221259, 4)


In [24]:
X_train, val_X, y_train, val_y = train_test_split(train_dataset, train_target, test_size=0.3, random_state=42)

y_train = y_train.apply(lambda x: np.log1p(x))
val_y = val_y.apply(lambda x: np.log1p(x))

In [27]:
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)
y_pred = rfr.predict(val_X)
r2 = r2_score(val_y, y_pred)
mse = mean_squared_error(val_y, y_pred)
print(f"R2 score: {r2}")
print(f"MSE: {mse}")

R2 score: 0.9955361248020049
MSE: 0.010303823853672517


In [37]:
prediction = rfr.predict(test_dataset)
submission = pd.DataFrame({'id': test_id, 'num_sold': prediction})
submission.to_csv('submission.csv', index=False)