In [None]:
import pandas as pd
import matplotlib as mpl
import os

# All competition CSVs live in one Kaggle input directory. Keeping that
# directory in a single constant means a relocated dataset is a one-line change
# instead of six.
DATA_DIR = "/kaggle/input/store-sales-time-series-forecasting"

store = pd.read_csv(os.path.join(DATA_DIR, "stores.csv"))
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
oil = pd.read_csv(os.path.join(DATA_DIR, "oil.csv"))
holidays = pd.read_csv(os.path.join(DATA_DIR, "holidays_events.csv"))
transactions = pd.read_csv(os.path.join(DATA_DIR, "transactions.csv"))

In [None]:
# Preview the last rows of the store metadata table.
store.tail()

In [None]:
# Preview the last rows of the raw training table.
train.tail()

In [None]:
# Preview the last rows of the raw test table.
test.tail()

In [None]:
# Replace missing oil prices with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selection: the in-place form is chained
# assignment, which warns on recent pandas and leaves `oil` unchanged when
# Copy-on-Write is enabled.
oil['dcoilwtico'] = oil['dcoilwtico'].fillna(oil['dcoilwtico'].median())

In [None]:
# Preview the oil price table after the missing values were filled.
oil.head()

In [None]:
# Preview the holiday/event table.
holidays.head()

In [None]:
# Preview the transactions table.
transactions.head()

In [None]:
# Keep the test ids aside: `test` is rebound by the merges below and the 'id'
# column is dropped from the feature frame, but the submission frame needs it.
id_test = test['id']

In [None]:
# Enrich the training rows with store metadata, the daily oil price, the
# per-store daily transaction counts and the holiday calendar. Every join is a
# left join, so training rows without a match keep NaN in the joined columns.
# NOTE(review): a left join repeats a left-hand row once per matching
# right-hand row; if `holidays` has several rows for one date the training
# frame will grow — confirm this is intended.
train = (
    train
    .merge(store, how='left', on='store_nbr')
    .merge(oil, how='left', on='date')
    .merge(transactions, how='left', on=['date', 'store_nbr'])
    .merge(holidays, how='left', on='date')
)

# Preview the merged training frame.
train.head()

In [None]:
# Apply the same four left joins to the test rows so that train and test end up
# with the same set of joined columns.
# NOTE(review): if `holidays` has several rows for a test date, this frame will
# have more rows than `id_test`, and building the submission frame will fail —
# confirm the row count is unchanged.
test = (
    test
    .merge(store, how='left', on='store_nbr')
    .merge(oil, how='left', on='date')
    .merge(transactions, how='left', on=['date', 'store_nbr'])
    .merge(holidays, how='left', on='date')
)

# Preview the merged test frame.
test.head()

In [None]:
# Keep handles on the merged frames: `train` is rebound to a monthly aggregate
# further down, while later cells continue from train_data / test_data.
full_data = [train, test]
train_data, test_data = full_data

# Collapse every date to the first day of its month. This writes into the same
# object train_data refers to, so train_data['date'] changes as well.
train_dates = pd.to_datetime(train['date'])
train['date'] = pd.to_datetime(
    train_dates.dt.year.astype('str') + '-' + train_dates.dt.month.astype('str') + '-01'
)

# Total sales per month; from here on `train` is this aggregate, not the
# merged row-level frame.
train = train.groupby('date')['sales'].sum().reset_index()

train_data.head()

In [None]:
# Preview the merged test frame (same object as test_data).
test.head()

In [None]:
# Columns removed from both feature frames: the row id, the date, and the
# descriptive store/holiday fields.
unused_columns = ['id','date','city','description','type_y','transferred','locale','locale_name']

train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)
train_data.head()

In [None]:
# Count missing values per column in the training features before filling.
train_data.isnull().sum()

In [None]:
# Count missing values per column in the test features before filling.
test_data.isnull().sum()

In [None]:
# Forward-fill: each missing value takes the last valid value above it in the
# same column. ffill() is the direct equivalent of interpolate(method='pad'),
# which pandas has deprecated.
train_data = train_data.ffill()
test_data = test_data.ffill()

In [None]:
# Re-count missing values in the training features after the forward fill.
train_data.isnull().sum()

In [None]:
# Re-count missing values in the test features after the forward fill.
test_data.isnull().sum()

In [None]:
# Back-fill what the forward fill could not reach: a NaN with no valid value
# above it takes the next valid value below it in the same column.
tr = train_data.bfill()
# Missing values left per column after both fills.
tr.isnull().sum()

In [None]:
# Same back-fill for the test features.
te = test_data.bfill()
# Missing values left per column after both fills.
te.isnull().sum()

In [None]:
# A column with no valid value at all survives both fills; set whatever is
# still missing to 0.
# NOTE(review): presumably this is `transactions`, which may have no rows for
# the test dates — confirm that 0 is a sensible stand-in for the models.
te = te.fillna(0)
te.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Box plot of the store number column in the filled training frame.
fig, ax = plt.subplots()
sns.boxplot(tr['store_nbr'], ax=ax)
ax.set_title('Store_nbr')
plt.show()

In [None]:
# Box plot of the store cluster column.
fig, ax = plt.subplots()
sns.boxplot(tr['cluster'], ax=ax)
ax.set_title('cluster')
plt.show()

In [None]:
# Box plot of the oil price column.
fig, ax = plt.subplots()
sns.boxplot(tr['dcoilwtico'], ax=ax)
ax.set_title('oil')
plt.show()

In [None]:
# Box plot of the transactions column before outlier removal.
fig, ax = plt.subplots()
sns.boxplot(tr['transactions'], ax=ax)
ax.set_title('Transaction')
plt.show()

In [None]:
# Remove transaction-count outliers with the 1.5 * IQR rule: keep only rows
# whose `transactions` value lies strictly between the lower and upper fence.
q1 = tr['transactions'].quantile(0.25)
q3 = tr['transactions'].quantile(0.75)
iqr = q3 - q1
lower_limit = q1 - 1.5*iqr
upper_limit = q3 + 1.5*iqr

# .copy() makes tr1 an independent frame rather than a slice of `tr`, so the
# column assignments done on tr1 in later cells do not raise
# SettingWithCopyWarning or depend on view/copy semantics.
tr1 = tr[(tr['transactions']>lower_limit) & (tr['transactions']<upper_limit)].copy()
tr1.shape

In [None]:
# Box plot of the transactions column after the IQR filter.
fig, ax = plt.subplots()
sns.boxplot(tr1['transactions'], ax=ax)
ax.set_title('Transaction')
plt.show()

In [None]:
# Column dtypes of the filtered training frame before the category conversion.
print(tr1.dtypes)

In [None]:
# Convert every object-dtype column of the training frame to pandas 'category'.
# A single astype() call returns a new frame that is bound back to tr1; this
# avoids assigning column by column into a frame that was produced by boolean
# filtering, which triggers SettingWithCopyWarning.
cat_columns = [col for col in tr1.columns if tr1[col].dtype == 'object']
tr1 = tr1.astype({col: 'category' for col in cat_columns})

In [None]:
# Column dtypes of the training frame after the category conversion.
print(tr1.dtypes)

In [None]:
# Convert every object-dtype column of the test frame to 'category'.
# When the training frame already holds the same column as a categorical, its
# dtype (category list and order) is reused. Categories inferred separately
# from each frame can assign different integer codes to the same label
# whenever the two label sets differ. A test label that is absent from the
# training categories becomes NaN.
for col in te.columns:
    if te[col].dtype == 'object':
        train_dtype = tr1[col].dtype if col in tr1.columns else None
        if isinstance(train_dtype, pd.CategoricalDtype):
            te[col] = te[col].astype(train_dtype)
        else:
            # No categorical counterpart in the training frame: infer locally.
            te[col] = te[col].astype('category')

In [None]:
# Column dtypes of the test frame after the category conversion.
print(te.dtypes)

In [None]:
# Feature matrix: every column of the filtered training frame except the target.
X = tr1.drop(columns=['sales'])
X.head()

In [None]:
# Target vector: the `sales` column of the filtered training frame.
Y = tr1['sales']
Y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted regressor fitted on the training split; enable_categorical
# lets XGBoost consume the pandas 'category' columns directly.
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    enable_categorical=True,
)
xgb_model.fit(X_train, Y_train)

# Predictions for the competition test frame `te` (not the hold-out split).
xgb_pred = xgb_model.predict(te)

In [None]:
# Score of the XGBoost model on the rows it was fitted on
# (R^2 for scikit-learn style regressors).
xgb_model.score(X_train,Y_train)

In [None]:
# Score of the XGBoost model on the held-out 20% split.
xgb_model.score(X_test,Y_test)

In [None]:
import lightgbm as lgb

# LightGBM regressor fitted on the training split. `enable_categorical` is an
# XGBoost option, not a LightGBM one: passing it only forwards an unknown
# parameter to the booster. LightGBM's scikit-learn API already treats pandas
# 'category' columns as categorical features by default
# (categorical_feature='auto'), so the argument is dropped.
lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.1)
lgb_model.fit(X_train, Y_train)

# Predictions for the competition test frame `te` (not the hold-out split).
lgb_pred = lgb_model.predict(te)

In [None]:
# Score of the LightGBM model on the rows it was fitted on
# (R^2 for scikit-learn style regressors).
lgb_model.score(X_train,Y_train)

In [None]:
# Score of the LightGBM model on the held-out 20% split.
lgb_model.score(X_test,Y_test)

In [None]:
# from catboost import CatBoostRegressor

# cat_features = [i for i, col in enumerate(X_train.columns) if X_train[col].dtype.name == 'category']

# catboost_model = CatBoostRegressor(n_estimators=1000, learning_rate=0.1, verbose=0)
# catboost_model.fit(X_train, Y_train, cat_features=cat_features)
# catboost_pred = catboost_model.predict(te)

In [None]:
# catboost_model.score(X_train,Y_train)

In [None]:
# catboost_model.score(X_test,Y_test)

In [None]:
# Blend the two models: 60% XGBoost, 40% LightGBM.
pred = xgb_pred*0.6 + lgb_pred*0.4

# Submission frame: one row per test id with the blended prediction.
results = pd.DataFrame({
    'id': id_test,
    'sales': pred
})

# Per-model frames. These must use the saved test ids (`id_test`); the bare
# name `id` is Python's builtin function and would fill the column with that
# function object instead of the ids.
xgb_results = pd.DataFrame({'id': id_test, 'sales': xgb_pred})
lgb_results = pd.DataFrame({'id': id_test, 'sales': lgb_pred})

results.head()

In [None]:
# Write the blended predictions to a CSV file in the working directory,
# without the DataFrame index column.
results.to_csv("mantappuuu.csv",index=False)
# Completion marker printed once the file has been written.
print('Mantapu')