In [1]:
import os
import joblib
# Core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from imblearn.over_sampling import SMOTE
import itertools
import warnings
warnings.filterwarnings('ignore')
import plotly.express as px
import time

# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
# from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# import eli5
# from eli5.sklearn import PermutationImportance
from sklearn.utils import resample

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Get path from training and test dataset

curr_path = os.getcwd()
dataset_src = os.path.join(curr_path, 'dataset')

sales_train = pd.read_csv(os.path.join(dataset_src, 'sales_train.csv'), parse_dates=["date"])
test = pd.read_csv(os.path.join(dataset_src, 'test.csv'), index_col=['ID'])
item_categories = pd.read_csv(os.path.join(dataset_src, 'item_categories.csv'))
items = pd.read_csv(os.path.join(dataset_src, 'items.csv'))
shops = pd.read_csv(os.path.join(dataset_src, 'shops.csv'))
sample_submission = pd.read_csv(os.path.join(dataset_src, 'sample_submission.csv'))


In [None]:
sales_train.groupby('item_id')['item_cnt_day'].sum()

In [None]:
test

In [None]:
print(sales_train.columns)
print(test.columns)
print('number of features', len(test.columns))

In [None]:
print(len(sales_train), 'records for the training dataset.')
print(len(test), 'records for the testing dataset. ')

In [None]:
total_sales = sales_train.item_cnt_day.sum()
total_shops = len(sales_train.shop_id)

print( total_sales, 'sales.')
print( total_shops, 'shops.')

In [None]:
plt.hist(sales_train.item_cnt_day)

In [None]:
# Feature exploration

list_features = sales_train.columns
print('They are',len(list_features),'features in the dataset.')
print('----------------')
for f in list_features:
    print('feature:', f, '|| Type:', type(sales_train[f][0]), '|| Example:', sales_train[f][0], '|| number of unique values', len(sales_train[f].unique()) )

In [None]:
len(items['item_id'].unique())

In [None]:
item_categories.head(50)

In [None]:
item_categories

In [None]:



item_categories['category']=np.nan
item_categories['dept']=np.nan
item_categories['category']=item_categories['item_category_name'].apply(lambda x: x.split('-')[0])
item_categories['dept']=item_categories['item_category_name'].apply(lambda x: ''.join(x.split('-')[1:]))

    

    
item_categories

In [None]:
data = items.merge(item_categories)
data.tail(50).sort_values(by='item_id')

In [None]:
items.tail(50).sort_values(by='item_id')

In [None]:
data = sales_train.merge(data)

In [None]:
data = data.merge(shops)

In [None]:
data.head()

In [None]:
data

In [None]:

quantitative = [f for f in data.columns if data.dtypes[f] != 'object']

qualitative = [f for f in data.columns if data.dtypes[f] == 'object']

In [None]:
quantitative

In [None]:
qualitative

In [None]:

print(data.columns)
print('number of features', len(data.columns))

In [None]:
print(data.isnull().sum())


In [None]:
data[['dept']].value_counts()

In [None]:
for x in data.groupby(['category']):
    for i in x:
        print(i)

In [None]:
data[['date_block_num', 'item_cnt_day']].sort_values(by='date_block_num')

In [None]:
data.groupby(['date_block_num']).item_cnt_day.sum()
   

In [None]:


item_cnt_month =  data[['date_block_num', 'item_cnt_day']].sort_values(by='date_block_num')

item_cnt_month.groupby(['date_block_num']).sum()
# for i in item_cnt_month:
#     print(i)



In [None]:
item_cnt_month =  data[['date_block_num', 'item_cnt_day']].groupby(['date_block_num']).sum().sort_values(by='date_block_num')

for i in range(1, len(item_cnt_month)):
    print(item_cnt_month[i-1:i], i-1)
    
# for i in item_cnt_month:
#     print(i)


In [None]:
data[data['date_block_num'] == 0]



In [None]:
data[data['date_block_num'].isin([0, 1])]


In [51]:
date.loc[date['date_block_num'] < 1, 'item_cnt_day']


0         1.0
1         1.0
2        -1.0
3         1.0
4         1.0
         ... 
115685    1.0
115686    1.0
115687    2.0
115688    1.0
115689    1.0
Name: item_cnt_day, Length: 115690, dtype: float64

In [None]:
data.plot.scatter(x='date_block_num',
                  y='item_cnt_day',
                  alpha=0.5)

In [None]:
[x for x in dir(data.plot) if not x.startswith("_")]


In [None]:
data.plot.box()

In [None]:
G = pd.read_csv(os.path.join(dataset_src, 'sales_train.csv'))
P = pd.read_csv(os.path.join(dataset_src, 'sales_train.csv'), index_col=0, parse_dates=True)



In [None]:
P['item_cnt_month'] = abs(P['item_cnt_day'] * 30)
P

In [None]:
G


In [None]:
P.rename(columns=str.lower)

In [None]:
# print(P['item_cnt_day'].mean() * P['date_block_num'].value_counts())
P['date_block_num'].describe()

In [21]:
data[data['date_block_num'] == 0].describe()


NameError: name 'data' is not defined

In [None]:
data[data['date_block_num'] == 0].agg({'item_price': ['min', 'max', 'median', 'skew'],
                                       'item_cnt_day': ['min', 'max', 'median', 'mean']})

In [None]:
item_cnt_month = P.groupby('date_block_num')['item_cnt_day'].sum()

In [None]:
P.iloc[9:25, 2:5]


In [None]:
P.iloc[0:3, 3] = "anonymous"
P

In [None]:
P.pivot_table(values="item_cnt_day", index="date", columns="date_block_num", aggfunc="mean")

In [None]:
sales_train_ = sales_train.sort_values(by='item_id')

In [None]:
sales_train_.concat(test.sort_values(by='item_id'))

In [None]:
In [53]: result = pd.merge(left, right, on='B', how='outer', validate="one_to_one")one_to_many


In [None]:
In [63]: pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')


In [None]:
sales_train.join(test, on='item_id')

In [14]:

# New feature - Group

date =  sales_train[['date', 'date_block_num', 'item_id', 'item_cnt_day', 'shop_id']]
date['day'] = date['date'].apply(lambda x: x.split('.')[0]).astype(int)
date['month'] = date['date'].apply(lambda x: x.split('.')[1]).astype(int)
date['year'] = date['date'].apply(lambda x: x.split('.')[2]).astype(int)

date.tail(50)




Unnamed: 0,date,date_block_num,item_id,item_cnt_day,shop_id,day,month,year
2935799,13.10.2015,33,7524,1.0,25,13,10,2015
2935800,08.10.2015,33,7529,1.0,25,8,10,2015
2935801,29.10.2015,33,7531,1.0,25,29,10,2015
2935802,19.10.2015,33,7583,1.0,25,19,10,2015
2935803,22.10.2015,33,7590,1.0,25,22,10,2015
2935804,29.10.2015,33,7610,1.0,25,29,10,2015
2935805,19.10.2015,33,7612,1.0,25,19,10,2015
2935806,11.10.2015,33,7615,1.0,25,11,10,2015
2935807,11.10.2015,33,7624,1.0,25,11,10,2015
2935808,19.10.2015,33,7627,1.0,25,19,10,2015


In [29]:
item_cnt_month = date.groupby(['year', 'month'])['item_cnt_day'].sum()
item_cnt_month.to_frame()
# data_ = pd.merge(date, item_cnt_month, on='month')
# data_

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_day
year,month,Unnamed: 2_level_1
2013,1,131479.0
2013,2,128090.0
2013,3,147142.0
2013,4,107190.0
2013,5,106970.0
2013,6,125381.0
2013,7,116966.0
2013,8,125291.0
2013,9,133332.0
2013,10,127541.0


In [42]:
item_cnt_month = date.groupby(['year', 'month'])['item_cnt_day'].sum()

In [30]:
date[date['date_block_num'] == item_cnt_month['month']]

KeyError: 'month'

In [40]:
date[date['date_block_num'] == 0]

Unnamed: 0,date,date_block_num,item_id,item_cnt_day,shop_id,day,month,year
0,02.01.2013,0,22154,1.0,59,2,1,2013
1,03.01.2013,0,2552,1.0,25,3,1,2013
2,05.01.2013,0,2552,-1.0,25,5,1,2013
3,06.01.2013,0,2554,1.0,25,6,1,2013
4,15.01.2013,0,2555,1.0,25,15,1,2013
...,...,...,...,...,...,...,...,...
115685,19.01.2013,0,32,1.0,46,19,1,2013
115686,18.01.2013,0,32,1.0,46,18,1,2013
115687,26.01.2013,0,35,2.0,46,26,1,2013
115688,31.01.2013,0,621,1.0,46,31,1,2013


In [None]:
date.groupby('date_block_num')['']

In [59]:
date['date_cnt_month']=np.nan
date.loc[date['date_block_num']==,'Age_group']='Age_0-12'

for i in range(len(item_cnt_month)):
    date.loc[date['date_block_num']== i,'date_cnt_month']=item_cnt_month[i]

date

SyntaxError: invalid syntax (4146971746.py, line 2)

In [55]:
date

Unnamed: 0,date,date_block_num,item_id,item_cnt_day,shop_id,day,month,year
0,02.01.2013,0,22154,1.0,59,2,1,2013
1,03.01.2013,0,2552,1.0,25,3,1,2013
2,05.01.2013,0,2552,-1.0,25,5,1,2013
3,06.01.2013,0,2554,1.0,25,6,1,2013
4,15.01.2013,0,2555,1.0,25,15,1,2013
...,...,...,...,...,...,...,...,...
2935844,10.10.2015,33,7409,1.0,25,10,10,2015
2935845,09.10.2015,33,7460,1.0,25,9,10,2015
2935846,14.10.2015,33,7459,1.0,25,14,10,2015
2935847,22.10.2015,33,7440,1.0,25,22,10,2015
