In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [51]:
df=pd.read_csv("Australia_Grocery_2022Sep.csv")

In [None]:
df.head(10)

In [None]:
df.isnull().sum()

In [52]:
df = df[['Product_Name', 'Category']]
df.head()

Unnamed: 0,Product_Name,Category
0,RSPCA Approved Chicken Necks,Meat & seafood
1,RSPCA Approved Chicken Livers,Meat & seafood
2,RSPCA Approved Chicken Giblets,Meat & seafood
3,RSPCA Approved Chicken Frames,Meat & seafood
4,RSPCA Chicken Schnitzel Plain Crumb,Meat & seafood


In [53]:
# Assuming your DataFrame is named df

# Find all duplicate rows
duplicate_rows = df[df.duplicated()]

# Count the number of duplicate rows
num_duplicates = duplicate_rows.shape[0]

print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 480889


In [54]:
# Remove duplicate rows
df = df.drop_duplicates()

# Reset the index (optional, but useful to avoid gaps in index numbers after removing duplicates)
df = df.reset_index(drop=True)

# Display the cleaned DataFrame
print(df)

                                         Product_Name        Category
0                        RSPCA Approved Chicken Necks  Meat & seafood
1                       RSPCA Approved Chicken Livers  Meat & seafood
2                      RSPCA Approved Chicken Giblets  Meat & seafood
3                       RSPCA Approved Chicken Frames  Meat & seafood
4                 RSPCA Chicken Schnitzel Plain Crumb  Meat & seafood
...                                               ...             ...
7746                       Soft Drink Max Mango 375mL          Drinks
7747                               Beef Topside Roast  Meat & seafood
7748  Signature Series Dry Ginger Mixers Bottle 300mL          Drinks
7749                               Pork Sausage Mince  Meat & seafood
7750               Lindor Assorted Dark Chocolate Bag          Pantry

[7751 rows x 2 columns]


In [55]:
df['Category'].unique()

array(['Meat & seafood', 'Fruit & vegetables', 'Dairy, eggs & fridge',
       'Bakery', 'Pantry', 'Drinks'], dtype=object)

In [56]:
df['Category'] = df['Category'].replace({'Fruit & vegetables': 'Fresh Produce', 
                                         'Dairy, eggs & fridge': 'Cold Storage',
                                        'Bakery': 'Pantry', 'Meat & seafood': 'Meat'})

In [57]:
import nltk
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
nltk.download('punkt') # tokenizer
nltk.download('stopwords') # 'useless words'
nltk.download('wordnet') # a library for finding synonyms antonyms and so on
nltk.download('averaged_perceptron_tagger') # tagging data
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/abhishek/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/abhishek/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package words to /Users/abhishek/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [58]:

import string

def filterPunctuations(tokens):
    '''Removes punctuations from the array of tokens and returns the resultant array'''
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    return words
     

In [59]:

def processProductName(product):

    # break the sentence into words
    tokens = word_tokenize(product)

    
    #remove punctuation
    tokens = filterPunctuations(tokens)
 
    
    #convert to lower case
    tokens = [w.lower() for w in tokens]
  

    #remove stopwords
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words=list(filter(lambda word:word not in stop_words,tokens))   

    
    #lemmatize the words
    wordnet_lemmatizer=WordNetLemmatizer()
    words=[wordnet_lemmatizer.lemmatize(word) for word in words]
 
    
    #unique words
    words=pd.Series(words).drop_duplicates().tolist()
    return words

In [60]:
data1=[]

for index, row in df.iterrows(): 
    product=row["Product_Name"]
    words=processProductName(product)
    data1.append({"text":product,"token_list":words,"token":' '.join(words),"target":row["Category"]})
df1=pd.DataFrame(data1)

#df1['token'] = df1.apply(
#    lambda row: remove_matching_words(row['token'], stemmed_list) if row['target'] != 'Fresh Produce' else row['token'],
#    axis=1
#)

df1.head(5)

Unnamed: 0,text,token_list,token,target
0,RSPCA Approved Chicken Necks,"[rspca, approved, chicken, neck]",rspca approved chicken neck,Meat
1,RSPCA Approved Chicken Livers,"[rspca, approved, chicken, liver]",rspca approved chicken liver,Meat
2,RSPCA Approved Chicken Giblets,"[rspca, approved, chicken, giblet]",rspca approved chicken giblet,Meat
3,RSPCA Approved Chicken Frames,"[rspca, approved, chicken, frame]",rspca approved chicken frame,Meat
4,RSPCA Chicken Schnitzel Plain Crumb,"[rspca, chicken, schnitzel, plain, crumb]",rspca chicken schnitzel plain crumb,Meat


In [None]:
df1

In [61]:
fg_list = ['Cabbage', 'Cucumber', 'Chillies', 'Potatoes', 'Carrot', 
           'Beans', 'Kiwifruit', 'Sprouts', 'Avocado', 'Onion', 'Beetroot', 'Spinach',
          'Onions','Watermelon', 'Grape', 'Murcott', 'Leaf', 'Chilli', 'Garlic', 'Veggie', 'Blackberries',
          'Broccoli', 'Turmeric', 'Asparagus', 'Drumhead', 'Pumpkin', 'Cloves', 'Passionfruit',
          'Oranges', 'Cherries', 'Mandarins', 'Apple', 'Grapes', 'Ginger','Capsicums', 'Melons',
          'Peaches', 'Lettuce', 'Peas', 'Capsicum', 'Cauliflower', 'Tomatoes', 'Limes', 'Corn', 'Blueberries',
          'Jalapeno', 'Mangoes', 'Radish', 'Lemons', 'Cherry', 'Strawberries', 'Pineapple', 'Bananas', 'Turnips',
          'Orange', 'Grapefruit', 'Carrots', 'Mushrooms']

In [62]:
def remove_matching_words(df, stemmed_fg_list):
    # Create two new dataframes to store the balanced and discluded data
    balanced_df = pd.DataFrame(columns=df.columns)
    disclude_df = pd.DataFrame(columns=df.columns)
    
    # Iterate over each category in 'target' excluding 'Fresh Produce'
    for cat in df['target'].unique():
        if cat == 'Fresh Produce':
            continue
        
        # For each word in the stemmed_fg_list
        for val in stemmed_fg_list:
            
            # Find items in 'Fresh Produce' that contain the current word
            fresh_produce_items = df[(df['token'].str.contains(val, case=False, na=False)) & 
                                     (df['target'] == 'Fresh Produce')]
            
            # Find items in the current category that contain the current word
            fruits_df = df[(df['token'].str.contains(val, case=False, na=False)) & 
                           (df['target'] == cat)]
            
            # Determine how many items to keep to balance the categories
            num_to_keep = min(len(fruits_df), len(fresh_produce_items))
            
            # Select the balanced number of items from the current category
            balanced_fruits = fruits_df.head(num_to_keep)
            
            # The remaining fruits that were not selected
            remaining_fruits = fruits_df.loc[~fruits_df.index.isin(balanced_fruits.index)]
            
            # Append the balanced items to the balanced_df
            balanced_df = pd.concat([balanced_df, balanced_fruits], ignore_index=True)
            
            # Append the unselected (excluded) items to disclude_df
            disclude_df = pd.concat([disclude_df, remaining_fruits], ignore_index=True)
    
    # Exclude all the rows in disclude_df from the original dataframe
    disclude_indexes = disclude_df.index
    remaining_df = df.loc[~df.index.isin(disclude_indexes)]
    
    # Combine the balanced dataframe with the remaining rows that were not discluded
    final_df = pd.concat([balanced_df, remaining_df], ignore_index=True)
    
    return final_df

In [63]:
final_df = remove_matching_words(df1, fg_list)

In [64]:
final_df['target'].unique()

array(['Meat', 'Cold Storage', 'Pantry', 'Drinks', 'Fresh Produce'],
      dtype=object)

In [65]:
len(final_df)

7514

In [217]:
# Assuming 'column_name' is the categorical column
category_counts = final_df['target'].value_counts()

print(category_counts)

target
Pantry           4521
Cold Storage     1309
Drinks           1040
Meat              389
Fresh Produce     255
Name: count, dtype: int64


In [199]:
balance_df = final_df[final_df['target'].isin(['Fresh Produce', 'Meat', 'Cold Storage', 'Drinks'])]
len(balance_df)

2993

In [200]:
balance_df = pd.concat([balance_df, final_df[final_df['target'].isin(['Pantry'])].head(1500)])
len(balance_df)

4493

In [146]:
fruits_df = final_df[final_df['target'] == 'Cold Storage']
fruits_df = fruits_df[fruits_df['token'].str.contains('banana', case=False, na=False)]
len(fruits_df)

21

In [132]:
from sklearn.model_selection import train_test_split

In [202]:
X=balance_df.iloc[:,2]
y=balance_df.iloc[:,3]
     

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [203]:
y_train

25               Meat
802      Cold Storage
7083    Fresh Produce
2172           Pantry
317     Fresh Produce
            ...      
2447           Pantry
610      Cold Storage
191            Pantry
1793           Pantry
1004     Cold Storage
Name: target, Length: 3145, dtype: object

In [204]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
vectorizer = CountVectorizer()
X_train_cv=vectorizer.fit_transform(X_train)
transformer = TfidfTransformer()
X_train_tfidf = transformer.fit_transform(X_train_cv)
X_test_cv=vectorizer.transform(X_test)
X_test_tfidf = transformer.transform(X_test_cv)

In [205]:
naive_bayes={"Naive Bayes":MultinomialNB()}
     

from sklearn.model_selection import cross_val_score,KFold
from sklearn.metrics import f1_score

def eval_model(models):
    cv = KFold(n_splits=10)
    scores=[]
    for model in models:
        clf=models[model]
        clf.fit(X_train_tfidf, y_train)
        y_pred=clf.predict(X_test_tfidf)
        f1=cross_val_score(clf, X_train_tfidf, y_train, cv=cv,scoring="f1_micro")
        scores.append({"classifier":model,"cross-validated F1 score":f1.mean()})
    scores_df=pd.DataFrame(scores)
    print(scores_df.head())
    
eval_model(naive_bayes)

    classifier  cross-validated F1 score
0  Naive Bayes                  0.904296


In [206]:
knn={"KNN":KNeighborsClassifier()}
eval_model(knn)

  classifier  cross-validated F1 score
0        KNN                  0.899525


In [207]:
dt={"Decision Tree":DecisionTreeClassifier()}
eval_model(dt)

      classifier  cross-validated F1 score
0  Decision Tree                   0.89413


In [208]:
svm = {"SVM":SVC(C=1.0, kernel='linear', gamma='auto')}
eval_model(svm)

  classifier  cross-validated F1 score
0        SVM                  0.937036


In [209]:
def vectorize(value):
    cv=vectorizer.transform(value)
    tfidf = transformer.transform(cv)
    return tfidf
     

def predict_category(text):
    X_test=pd.DataFrame([text],columns=["token"])
    results=[]
    for index, row in X_test.iterrows(): 
        product=row["token"]
        words=' '.join(processProductName(product))
        results.append(words)
    X_test=vectorize(results)
    return svm["SVM"].predict(X_test)[0]
     

category=predict_category(input("Enter the product name : "))
print("The predicted category is: ",category)

Enter the product name : banana
The predicted category is:  Cold Storage


In [160]:
prod_data = pd.read_csv('filtered_australia_products.csv', sep = '\t', low_memory=False)

In [161]:
prod_data = prod_data[['code', 'main_category', 'product_name']]

In [162]:
prod_data.head()

Unnamed: 0,code,main_category,product_name
0,93718738,,Horseradish cream
1,1007,,PURINA FIEST 0CN/WESH TUNA85GM
2,123457033,,Almond Meal
3,680000017,,100% Australian Tea Tree Oil
4,680000024,,100% Australian Tea Tree Oil


In [210]:
for idx, row in prod_data.iterrows():
    if (pd.isna(row['product_name'])):
        prod_data.loc[idx, 'main_category'] = 'Pantry'
    else:
        prod_data.loc[idx, 'main_category'] = predict_category(row['product_name'])

In [215]:
prod_data.iloc[200:250]

Unnamed: 0,code,main_category,product_name
200,34000470754,Pantry,Reeses
201,34000702596,Pantry,Reese's 2 Peanut Butter Cups
202,34000702602,Pantry,Miniature Peanut Butter Cups
203,34027985194,Drinks,Bare ginger uncrystallised
204,341477,Fresh Produce,FAREX BABY PORRIDGE
205,34856002062,Pantry,Welches
206,34856014164,Pantry,Welch’s
207,34856014171,Pantry,Fruit Snacks
208,34856014683,Pantry,Welch
209,34856201465,Pantry,Fruit snacks


In [219]:
fruits_df = prod_data[prod_data['code'] == '9415767068049']
fruits_df

Unnamed: 0,code,main_category,product_name
47335,9415767068049,Cold Storage,Boss Ice Double Espresso Coffee


In [221]:
code_data = prod_data[['code']]

In [222]:
# Assuming your DataFrame is named df

# Find all duplicate rows
duplicate_rows = code_data[code_data.duplicated()]

# Count the number of duplicate rows
num_duplicates = duplicate_rows.shape[0]

print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 0


In [220]:
# Assuming your DataFrame is named df

# Find all duplicate rows
duplicate_rows = prod_data[prod_data.duplicated()]

# Count the number of duplicate rows
num_duplicates = duplicate_rows.shape[0]

print("Number of duplicate rows:", num_duplicates)

Number of duplicate rows: 0


In [224]:
prod_data.to_csv('clean_data.csv', index=False, float_format='%.20g') 