In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from scipy.sparse import csr_matrix, hstack

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import string

from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder

from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
aisles = pd.read_csv('../instacart/aisles.csv')
departments = pd.read_csv('../instacart/departments.csv')
# orderproducts_prior = pd.read_csv('../instacart/order_products__prior.csv')
orderproducts = pd.read_csv('../instacart/order_products__train.csv')
orders = pd.read_csv('../instacart/orders.csv')
products = pd.read_csv('../instacart/products.csv')

In [3]:
# EDA & Data Preprocessing
# orderproducts = pd.concat([orderproducts_prior, orderproducts_tests], axis=0)
orders = orders[orders['eval_set'] != 'prior']
userproducts = orderproducts.merge(orders, on='order_id', how='inner')

In [4]:
detailed_userproducts = userproducts[['user_id','product_id','reordered']]
detailed_userproducts = detailed_userproducts.merge(products, on='product_id', how='inner')
detailed_userproducts = detailed_userproducts.merge(aisles, on='aisle_id', how='inner')
detailed_userproducts = detailed_userproducts.merge(departments, on='department_id', how='inner')
detailed_userproducts = detailed_userproducts.drop(columns=['aisle_id','department_id'])
detailed_userproducts

Unnamed: 0,user_id,product_id,reordered,product_name,aisle,department
0,112108,49302,1,Bulgarian Yogurt,yogurt,dairy eggs
1,112108,11109,1,Organic 4% Milk Fat Whole Milk Cottage Cheese,other creams cheeses,dairy eggs
2,112108,10246,0,Organic Celery Hearts,fresh vegetables,produce
3,112108,49683,0,Cucumber Kirby,fresh vegetables,produce
4,112108,43633,1,Lightly Smoked Sardines in Olive Oil,canned meat seafood,canned goods
...,...,...,...,...,...,...
1384612,169679,14233,1,Natural Artesian Water,water seltzer sparkling water,beverages
1384613,169679,35548,1,Twice Baked Potatoes,prepared meals,deli
1384614,139822,35951,1,Organic Unsweetened Almond Milk,soy lactosefree,dairy eggs
1384615,139822,16953,1,Creamy Peanut Butter,spreads,pantry


In [5]:
def text_cleaning(text):
    punctuation = string.punctuation
    stopwordlist = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    words = [word for word in words if word not in punctuation]
    words = [word for word in words if word.isalpha()]
    words = [word.lower() for word in words]
    words = [word for word in words if word not in stopwordlist]
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# detailed_userproducts['product_name'] = detailed_userproducts['product_name'].apply(lambda x: text_cleaning(x))
# detailed_userproducts

In [6]:
def tfidf(text_columns, model_path='./tfidf_vectorizer.pkl'):
    if os.path.exists(model_path):
        print(f"Loading existing TFIDF model from {model_path}")
        with open(model_path, 'rb') as f:
            tfidf_vectorizer = pickle.load(f)
    else:
        print(f"Training a new TFIDF Vectorizer and saving it to {model_path}")
        tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
        tfidf_vectorizer.fit(text_columns)
        with open(model_path, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)
    tfidf = tfidf_vectorizer.transform(text_columns)
    tfidf_df = pd.DataFrame.sparse.from_spmatrix(tfidf, index=text_columns.index)
    return tfidf, tfidf_df

# tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
# train_tfidf = tfidf_vectorizer.fit_transform(xtrain['product_name'])
# test_tfidf = tfidf_vectorizer.transform(xtest['product_name'])
# train_tfidf = tfidf(xtrain['product_name'])
# test_tfidf = tfidf(xtest['product_name'])

def bow(text_columns, model_path='./bow_vectorizer.pkl'):
    if os.path.exists(model_path):
        print(f"Loading existing bow model from {model_path}")
        with open(model_path, 'rb') as f:
            bow_vectorizer = pickle.load(f)
    else:
        print(f"Training a new bow Vectorizer and saving it to {model_path}")
        bow_vectorizer = CountVectorizer(lowercase=True, stop_words='english')
        bow_vectorizer.fit(text_columns)
        with open(model_path, 'wb') as f:
            pickle.dump(bow_vectorizer, f)
    bow = bow_vectorizer.transform(text_columns)
    bow_df = pd.DataFrame.sparse.from_spmatrix(bow, index=text_columns.index)
    return bow, bow_df

# bow_vectorizer = CountVectorizer(lowercase=True, stop_words='english')
# train_bow = bow_vectorizer.fit_transform(xtrain['product_name'])
# test_bow = bow_vectorizer.transform(xtest['product_name'])
# train_bow = bow(xtrain['product_name'])
# test_bow = bow(xtest['product_name'])

def get_product_vector(product_name, model):
    tokens = word_tokenize(product_name)
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(word_vectors) == 0: return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

def cbow(text_columns, model_path='./word2vec_cbow.model'):
    if os.path.exists(model_path):
        print(f"Loading existing Word2Vec CBOW model from {model_path}")
        semantic_cbow = Word2Vec.load(model_path)
    else:
        print(f"Training a new Word2Vec CBOW model and saving it to {model_path}")
        tokenized = text_columns.apply(word_tokenize)
        semantic_cbow = Word2Vec(sentences=tokenized, min_count=1, vector_size=100, window=5, sg=0)
        semantic_cbow.save(model_path)
    cbow_vectors = text_columns.apply(lambda x: get_product_vector(x, semantic_cbow))
    cbow_vectors_df = pd.DataFrame(cbow_vectors.tolist(), index=text_columns.index)
    return cbow_vectors, cbow_vectors_df

# train_cbow = cbow(xtrain['product_name'])
# test_cbow = cbow(xtest['product_name'])

def skipgram(text_columns, model_path='./word2vec_skipgram.model'):
    if os.path.exists(model_path):
        print(f"Loading existing Word2Vec Skipgram model from {model_path}")
        semantic_skipgram = Word2Vec.load(model_path)
    else:
        print(f"Training a new Word2Vec Skipgram model and saving it to {model_path}")
        tokenized = text_columns.apply(word_tokenize)
        semantic_skipgram = Word2Vec(sentences=tokenized, min_count=1, vector_size=100, window=5, sg=1)
        semantic_skipgram.save(model_path)
    skipgram_vectors = text_columns.apply(lambda x: get_product_vector(x, semantic_skipgram))
    skipgram_vectors_df = pd.DataFrame(skipgram_vectors.tolist(), index=text_columns.index)
    return skipgram_vectors, skipgram_vectors_df

# train_skipgram = skipgram(xtrain['product_name'])
# test_skipgram = skipgram(xtest['product_name'])

In [7]:
# train_bow_df = pd.DataFrame.sparse.from_spmatrix(train_bow, index=xtrain.index)
# test_bow_df = pd.DataFrame.sparse.from_spmatrix(test_bow, index=xtest.index)
# _, train_bow_df = bow(xtrain['product_name'])
# _, test_bow_df = bow(xtest['product_name'])

# train_tfidf_df = pd.DataFrame.sparse.from_spmatrix(train_tfidf, index=xtrain.index)
# test_tfidf_df = pd.DataFrame.sparse.from_spmatrix(test_tfidf, index=xtest.index)
# _, train_tfidf_df = tfidf(xtrain['product_name'])
# _, test_tfidf_df = tfidf(xtest['product_name'])

# train_cbow_df = pd.DataFrame(train_cbow.tolist(), index=xtrain.index)
# test_cbow_df = pd.DataFrame(test_cbow.tolist(), index=xtest.index)
# _, train_cbow_df = cbow(xtrain['product_name'])
# _, test_cbow_df = cbow(xtest['product_name'])

# train_skipgram_df = pd.DataFrame(train_skipgram.tolist(), index=xtrain.index)
# test_skipgram_df = pd.DataFrame(test_skipgram.tolist(), index=xtest.index)
# _, train_skipgram_df = skipgram(xtrain['product_name'])
# _, test_skipgram_df = skipgram(xtest['product_name'])

In [8]:
# def encode_aisle(dataset, model_path='./aisle_onehotencoder.pkl'):
#     if os.path.exists(model_path):
#         print(f"Loading existing encoder from {model_path}")
#         encoder = pd.read_pickle(model_path)
#     else:
#         print(f"Training a new aisle encoder and saving it to {model_path}")
#         encoder = OneHotEncoder(handle_unknown='ignore')
#         encoder.fit(dataset[['aisle']])
#         pd.to_pickle(encoder, model_path)
#     dataset_encoded = encoder.transform(dataset[['aisle']])
#     dataset_encoded_df = pd.DataFrame(dataset_encoded.toarray(), columns=encoder.get_feature_names_out(['aisle']), index=dataset.index)
#     return dataset_encoded, dataset_encoded_df

# def encode_department(dataset, model_path='./department_onehotencoder.pkl'):
#     if os.path.exists(model_path):
#         print(f"Loading existing encoder from {model_path}")
#         encoder = pd.read_pickle(model_path)
#     else:
#         print(f"Training a new department encoder and saving it to {model_path}")
#         encoder = OneHotEncoder(handle_unknown='ignore')
#         encoder.fit(dataset[['department']])
#         pd.to_pickle(encoder, model_path)
#     dataset_encoded = encoder.transform(dataset[['department']])
#     dataset_encoded_df = pd.DataFrame(dataset_encoded.toarray(), columns=encoder.get_feature_names_out(['department']), index=dataset.index)
#     return dataset_encoded, dataset_encoded_df

def encode_aisle(dataset, model_path='./aisle_labelencoder.pkl'):
    if os.path.exists(model_path):
        print(f"Loading existing encoder from {model_path}")
        encoder = pd.read_pickle(model_path)
    else:
        print(f"Training a new aisle encoder and saving it to {model_path}")
        encoder = LabelEncoder()
        encoder.fit(dataset['aisle'])
        pd.to_pickle(encoder, model_path)
    dataset_encoded = encoder.transform(dataset['aisle'])
    dataset_encoded_df = pd.DataFrame(dataset_encoded, columns=['aisle'], index=dataset.index)
    return dataset_encoded, dataset_encoded_df

def encode_department(dataset, model_path='./department_labelencoder.pkl'):
    if os.path.exists(model_path):
        print(f"Loading existing encoder from {model_path}")
        encoder = pd.read_pickle(model_path)
    else:
        print(f"Training a new department encoder and saving it to {model_path}")
        encoder = LabelEncoder()
        encoder.fit(dataset['department'])
        pd.to_pickle(encoder, model_path)
    dataset_encoded = encoder.transform(dataset['department'])
    dataset_encoded_df = pd.DataFrame(dataset_encoded, columns=['department'], index=dataset.index)
    return dataset_encoded, dataset_encoded_df

# encoder_aisle = OneHotEncoder(handle_unknown='ignore')
# encoder_department = OneHotEncoder(handle_unknown='ignore')

# xtrain_aisle_encoded = encoder_aisle.fit_transform(xtrain[['aisle']])
# xtest_aisle_encoded = encoder_aisle.transform(xtest[['aisle']])
# xtrain_department_encoded = encoder_department.fit_transform(xtrain[['department']])
# xtest_department_encoded = encoder_department.transform(xtest[['department']])

# xtrain_aisle_encoded = pd.DataFrame(xtrain_aisle_encoded.toarray(), columns=encoder_aisle.get_feature_names_out(['aisle']), index=xtrain.index)
# xtest_aisle_encoded = pd.DataFrame(xtest_aisle_encoded.toarray(), columns=encoder_aisle.get_feature_names_out(['aisle']), index=xtest.index)
# xtrain_department_encoded = pd.DataFrame(xtrain_department_encoded.toarray(), columns=encoder_department.get_feature_names_out(['department']), index=xtrain.index)
# xtest_department_encoded = pd.DataFrame(xtest_department_encoded.toarray(), columns=encoder_department.get_feature_names_out(['department']), index=xtest.index)

# xtrain = xtrain.drop(columns=['aisle', 'department'])
# xtest = xtest.drop(columns=['aisle', 'department'])

# xtrain = pd.concat([xtrain, xtrain_aisle_encoded, xtrain_department_encoded], axis=1)
# xtest = pd.concat([xtest, xtest_aisle_encoded, xtest_department_encoded], axis=1)


In [9]:
def extract_feature(dataset):
    print('Cleaning & preprocess text for feature extraction... (1/6)')
    dataset['product_name'] = dataset['product_name'].apply(lambda x: text_cleaning(x))
    dataset = dataset.reset_index(drop=True)

    print('Product name feature extraction... (2/6)')
    _, tfidf_vector = tfidf(dataset['product_name'])
    tfidf_vector = tfidf_vector.reset_index(drop=True)

    print('Aisle name one-hot encoding... (3/6)')
    _, aisle_encoded = encode_aisle(dataset)
    aisle_encoded = aisle_encoded.reset_index(drop=True)

    print('Department name one-hot encoding... (4/6)')
    _, department_encoded = encode_department(dataset)
    department_encoded = department_encoded.reset_index(drop=True)

    print('Combining all features csr sparse matrix... (5/6)')
    aisle_encoded_sparse = csr_matrix(aisle_encoded.values)
    department_encoded_sparse = csr_matrix(department_encoded.values)
    features_only = hstack([tfidf_vector, aisle_encoded_sparse, department_encoded_sparse])
    features_only = features_only.tocsr()
    userproduct_indexes = dataset[['user_id','product_id','reordered']]

    return features_only, userproduct_indexes


In [10]:
trainset, testset = train_test_split(detailed_userproducts, test_size=0.2, random_state=42)

In [11]:
train_features, train_userproduct_indexes = extract_feature(trainset)

# train_combined.columns = train_combined.columns.astype(str)
# train_features.columns = train_features.columns.astype(str)
# train_userproduct_indexes.columns = train_userproduct_indexes.columns.astype(str)

train_features

Cleaning & preprocess text for feature extraction... (1/6)
Product name feature extraction... (2/6)
Loading existing TFIDF model from ./tfidf_vectorizer.pkl
Aisle name one-hot encoding... (3/6)
Training a new aisle encoder and saving it to ./aisle_labelencoder.pkl
Department name one-hot encoding... (4/6)
Training a new department encoder and saving it to ./department_labelencoder.pkl
Combining all features csr sparse matrix... (5/6)


MemoryError: Unable to allocate 62.7 GiB for an array with shape (7599, 1107693) and data type float64

In [None]:
test_features, test_userproduct_indexes = extract_feature(testset)

# test_combined.columns = test_combined.columns.astype(str)
# test_features.columns = test_features.columns.astype(str)
# test_userproduct_indexes.columns = test_userproduct_indexes.columns.astype(str)

test_features

Cleaning & preprocess text for feature extraction... (1/6)
Product name feature extraction... (2/6)
Loading existing TFIDF model from ./tfidf_vectorizer.pkl
Aisle name one-hot encoding... (3/6)
Loading existing encoder from ./aisle_encoder.pkl
Department name one-hot encoding... (4/6)
Loading existing encoder from ./department_encoder.pkl
Removing raw feature columns... (5/6)
Reset indexes and setup for model training... (6/6)


Unnamed: 0,user_id,product_id,reordered,0,1,2,3,4,5,6,...,department_household,department_international,department_meat seafood,department_missing,department_other,department_pantry,department_personal care,department_pets,department_produce,department_snacks
0,149174,37203,0,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11333,35108,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,31224,37141,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,137296,12834,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,133811,48775,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276919,205958,19219,1,0,0,0,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276920,31185,4920,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
276921,189092,22035,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
276922,42945,12614,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# # Step 3: Train the model on the training set using product features
# train_combined.columns = train_combined.columns.astype(str)
# train_features.columns = train_features.columns.astype(str)
# train_userproduct_indexes.columns = train_userproduct_indexes.columns.astype(str)
# test_combined.columns = test_combined.columns.astype(str)
# test_features.columns = test_features.columns.astype(str)
# test_userproduct_indexes.columns = test_userproduct_indexes.columns.astype(str)

# train_features = csr_matrix(train_features)
# test_features = csr_matrix(test_features)

model = NearestNeighbors(n_neighbors=5, metric='cosine', n_jobs=-1)
model.fit(train_features)

# Step 4: Make recommendations on the test set (or for a specific user)
user = testset['user_id'].unique()[0]  # Example: First user in the test set
userpurchases = testset[testset['user_id'] == user]['product_id']

recommendations = {}
for product in userpurchases:
    # Extract feature vector for the product from the test set
    product_vector = test_features.loc[test_features['product_id'] == product].values.flatten()
    
    # Get similar products
    similarity, indices = model.kneighbors([product_vector])
    
    # Map the indices back to product IDs
    recommended_product_ids = test_combined.iloc[indices[0]]['product_id'].values
    recommendations[product] = recommended_product_ids

# Display recommendations for the test user
print(recommendations)


In [None]:
train_features.dtypes

reordered                                int64
0                           Sparse[float64, 0]
1                           Sparse[float64, 0]
2                           Sparse[float64, 0]
3                           Sparse[float64, 0]
                                   ...        
department_pantry                      float64
department_personal care               float64
department_pets                        float64
department_produce                     float64
department_snacks                      float64
Length: 7755, dtype: object