In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, log_loss, r2_score
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.svm import SVC, SVR
from sklearn.linear_model import ElasticNet, LinearRegression
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import BaseEnsemble, VotingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings('ignore')

In [135]:
train = pd.read_csv('train_v9rqX0R.csv')
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [136]:
train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [137]:
train['Item_Fat_Content'].unique()

array(['Low Fat', 'Regular', 'low fat', 'LF', 'reg'], dtype=object)

In [138]:
train['Item_Fat_Content'].replace({'low fat': 'Low Fat', 
                                  'LF': 'Low Fat',
                                  'reg': 'Regular'}, inplace=True)

In [139]:
train_items = train[['Item_Identifier', 'Item_Weight']]

In [140]:
train_items.isna().sum()

Item_Identifier       0
Item_Weight        1463
dtype: int64

In [141]:
train_items.drop_duplicates(inplace=True)

In [142]:
train_items.dropna(inplace=True)

In [143]:
train_items.isna().sum()

Item_Identifier    0
Item_Weight        0
dtype: int64

In [155]:
train1 = pd.merge(train, train_items)

In [156]:
train.isna().sum()
train1.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
7,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
8,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535
9,FDY07,11.8,Low Fat,0.0,Fruits and Vegetables,45.5402,OUT049,1999,Medium,Tier 1,Supermarket Type1,1516.0266


In [77]:
# train[train['Item_Fat_Content'] == 'low fat'] = 'Low Fat'
# train[train['Item_Fat_Content'] == 'LF'] = 'Low Fat'
# train[train['Item_Fat_Content'] == 'reg'] = 'Regular'

# train['Item_Fat_Content'].unique()


In [78]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [90]:
temp = train.groupby('Item_Identifier')
temp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001C852960F70>

In [93]:
temp['Item_Weight'].unique()

Item_Identifier
DRA12          [11.6]
DRA24    [19.35, nan]
DRA59     [8.27, nan]
DRB01     [7.39, nan]
DRB13         [6.115]
             ...     
NCZ30     [6.59, nan]
NCZ41         [19.85]
NCZ42          [10.5]
NCZ53      [9.6, nan]
NCZ54    [14.65, nan]
Name: Item_Weight, Length: 1559, dtype: object

In [96]:
temp['Item_Weight'].mean()

Item_Identifier
DRA12    11.600
DRA24    19.350
DRA59     8.270
DRB01     7.390
DRB13     6.115
          ...  
NCZ30     6.590
NCZ41    19.850
NCZ42    10.500
NCZ53     9.600
NCZ54    14.650
Name: Item_Weight, Length: 1559, dtype: float64

In [98]:
train['Item_Weight'].fillna(temp['Item_Weight'].mean())

0        9.300
1        5.920
2       17.500
3       19.200
4        8.930
         ...  
8518     6.865
8519     8.380
8520    10.600
8521     7.210
8522    14.800
Name: Item_Weight, Length: 8523, dtype: float64

In [122]:
train =train.groupby('Item_Identifier')['Item_Weight'].apply(lambda x:x.fillna(x.mean())).reset_index()
train.isna().sum()

Item_Identifier    0
level_1            0
Item_Weight        4
dtype: int64

In [166]:
train.head(10)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
5,FDP36,10.395,Regular,0.0,Baking Goods,51.4008,OUT018,2009,Medium,Tier 3,Supermarket Type2,556.6088
6,FDO10,13.65,Regular,0.012741,Snack Foods,57.6588,OUT013,1987,High,Tier 3,Supermarket Type1,343.5528
7,FDP10,,Low Fat,0.12747,Snack Foods,107.7622,OUT027,1985,Medium,Tier 3,Supermarket Type3,4022.7636
8,FDH17,16.2,Regular,0.016687,Frozen Foods,96.9726,OUT045,2002,,Tier 2,Supermarket Type1,1076.5986
9,FDU28,19.2,Regular,0.09445,Frozen Foods,187.8214,OUT017,2007,,Tier 2,Supermarket Type1,4710.535


In [17]:
def cleaner(df):
    df['Item_Fat_Content'].replace({'low fat': 'Low Fat', 
                                  'LF': 'Low Fat',
                           'reg': 'Regular'}, inplace=True)
    return df
def helper(df_train, df_test): 
    df = pd.concat([df_train, df_test])
    df = df[['Item_Identifier', 'Item_Weight']]
    df.drop_duplicates(inplace=True)
    df.dropna(inplace=True)
    return df

In [66]:
train = pd.read_csv('train_v9rqX0R.csv')
test = pd.read_csv('test_AbJTz2l.csv')



In [49]:
train.isna().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [27]:
train = cleaner(train)
test = cleaner(test)

In [99]:
train['Outlet_Size'].fillna(value='small', inplace=True)
train.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                     0
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

In [100]:
test['Outlet_Size'].fillna(value='small', inplace=True)
test.isnull().sum()

Item_Identifier                0
Item_Weight                  976
Item_Fat_Content               0
Item_Visibility                0
Item_Type                      0
Item_MRP                       0
Outlet_Identifier              0
Outlet_Establishment_Year      0
Outlet_Size                    0
Outlet_Location_Type           0
Outlet_Type                    0
dtype: int64

In [101]:
df = helper(train, test)

train = pd.merge(train, df)
test = pd.merge(test, df)
test.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
dtype: int64

In [102]:
train.isnull().sum()

Item_Identifier              0
Item_Weight                  0
Item_Fat_Content             0
Item_Visibility              0
Item_Type                    0
Item_MRP                     0
Outlet_Identifier            0
Outlet_Establishment_Year    0
Outlet_Size                  0
Outlet_Location_Type         0
Outlet_Type                  0
Item_Outlet_Sales            0
dtype: int64

In [109]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,small,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [105]:
X_train = pd.get_dummies(train.drop('Item_Outlet_Sales', axis=1), 
                         drop_first=True)

y_train =train['Item_Outlet_Sales']


In [108]:
lr = LinearRegression()

lr.fit(X_train, y_train)
y_test = pd.get_dummies(test, 
                         drop_first=True)

y_pred = lr.predict(y_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Item_Identifier_FDE52
- Item_Identifier_FDK57
- Item_Identifier_FDN52
- Item_Identifier_FDQ60
Feature names seen at fit time, yet now missing:
- Item_Identifier_DRE49
- Item_Identifier_DRN47
- Item_Identifier_FDA04
- Item_Identifier_FDF04
- Item_Identifier_FDF05
- ...


In [54]:
def outlet_func(train, test):
    df = pd.concat([train, test])
    # print(df.shape)
    df = df[['Outlet_Identifier','Outlet_Establishment_Year','Outlet_Size','Outlet_Location_Type','Outlet_Type']]
    df.drop_duplicates(inplace=True)
    # df.dropna(inplace=True)
    return df

In [53]:
outlet = outlet_func(train, test)
outlet


(14204, 12)


Unnamed: 0,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,OUT018,2009,Medium,Tier 3,Supermarket Type2
3,OUT010,1998,,Tier 3,Grocery Store
4,OUT013,1987,High,Tier 3,Supermarket Type1
7,OUT027,1985,Medium,Tier 3,Supermarket Type3
8,OUT045,2002,,Tier 2,Supermarket Type1
9,OUT017,2007,,Tier 2,Supermarket Type1
11,OUT046,1997,Small,Tier 1,Supermarket Type1
19,OUT035,2004,Small,Tier 2,Supermarket Type1
23,OUT019,1985,Small,Tier 1,Grocery Store


In [97]:
con_df =  pd.concat([train, test])
X = con_df[['Outlet_Establishment_Year','Outlet_Location_Type','Outlet_Type', 'Outlet_Size']]
y = con_df[['Outlet_Size']]

In [95]:
con_df =  pd.concat([train, test])
X = con_df[['Outlet_Establishment_Year','Outlet_Location_Type','Outlet_Type', 'Outlet_Size']]
y = con_df[['Outlet_Size']]

X_train = X.dropna()
y_train = y.dropna()


X_test = X[X['Outlet_Size'].isna()]
y_test = y[y['Outlet_Size'].isna()]

dum = X.dropna()


In [96]:
le = LabelEncoder()

dum = le.fit_transform(dum[])

y = dum['Outlet_Size']


X = dum.drop('Outlet_Size',axis=1)

ValueError: y should be a 1d array, got an array of shape (10188, 4) instead.

In [90]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)


(10188, 4)
(4016, 4)
(10188, 1)
(4016, 1)


In [91]:
from sklearn.metrics import roc_auc_score
from sklearn.naive_bayes import BernoulliNB


br = BernoulliNB()
br.fit(X_train, y_train)
y_pred = br.predict(X_test)
y_pred_prob = br.predict_proba(X_test)

print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob[:,1]))
print(log_loss(y_test, y_pred_prob))

ValueError: could not convert string to float: 'Tier 1'