In [1]:
import findspark
findspark.init()

In [22]:
import pandas as pd
import pyspark
from pyspark.sql import SparkSession
import matplotlib.pyplot as plt
import os
from pyspark.sql.functions import monotonically_increasing_id,lit
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, IDF
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
import pyspark.sql.functions as f
import functools as tools
from pymongo import MongoClient
from pyspark.ml import Pipeline

def database_extraction():
    mongo_uri = 'mongodb://localhost:27017/Warehouse' # connection string
    collections = ['Tags', 'UserDetail', 'Reviews', 'MetaData', 'PriceAndRating'] # importing all these collections
    client = MongoClient(mongo_uri)

    data_dict = {}
    for collection_name in collections:
        collection = client['Warehouse'][collection_name]
        data_list = list(collection.find())
        data_dict[collection_name] = data_list
    client.close()
    dfs = {key: pd.DataFrame(value) for key, value in data_dict.items()}
    combined_df = pd.concat(dfs.values(), axis=1)
    combined_df.head()
    x = combined_df['product_id']
    combined_df = combined_df.drop(['product_id'], axis=1)
    combined_df['product_id'] = x.iloc[:, 0]
    combined_df.columns
    columns_to_keep = ['product_id', 'user_id', 'user_name', 'review_id', 'review_title', 
                       'review_content','BrandName', 'product_name', 'img_link', 'product_link',
                       'discounted_price', 'actual_price', 'discount_percentage', 'rating',
                       'rating_count', 'CategoryTag_1', 'CategoryTag_2', 'CategoryTag_3', 
                       'CategoryTag_4', 'CategoryTag_5', 'CategoryTag_6', 'CategoryTag_7', 'about_product']
    data = combined_df[columns_to_keep]
    category_columns = ['CategoryTag_1', 'CategoryTag_2', 'CategoryTag_3', 'CategoryTag_4', 'CategoryTag_5', 'CategoryTag_6', 'CategoryTag_7']
    data['category'] = data[category_columns].apply(lambda row: '|'.join(row.dropna().astype(str)), axis=1)
    data = data.drop(category_columns, axis=1)
    data.to_csv("data.csv")
    return data
    
def model_pipeline_transformation(df):
    columns = ['product_name', 'about_product', 'category']
    minDFs = {'product_name':2.0, 'about_product':4.0, 'category':4.0}
    preProcStages = []
    for col in columns:
        regexTokenizer = RegexTokenizer(gaps=False, pattern='\w+', inputCol=col, outputCol=col+'Token')
        stopWordsRemover = StopWordsRemover(inputCol=col+'Token', outputCol=col+'SWRemoved')
        countVectorizer = CountVectorizer(minDF=minDFs[col], inputCol=col+'SWRemoved', outputCol=col+'TF')
        idf = IDF(inputCol=col+'TF', outputCol=col+'IDF') 
        preProcStages += [regexTokenizer, stopWordsRemover, countVectorizer, idf]
    pipeline = Pipeline(stages=preProcStages)
    model = pipeline.fit(df)
    data = model.transform(df)
    data = data.select('product_id', 'product_nameIDF', 'about_productIDF','categoryIDF')
    return data

def cosine_similarityy(X, Y):
    denom = X.norm(2) * Y.norm(2)
    if denom == 0.0:
        return -1.0
    else:
        return X.dot(Y) / float(denom)

def search(string,DF_pandas):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(DF_pandas['product_name'])
    string_vector = vectorizer.transform([string])
    cosine_sim = cs(string_vector, vectors)
    cos=[]
    for i in range(len(DF_pandas['product_name'])):
        cos.append(cosine_sim[0][i])
    DF_pandas['cosine_sim']=cos
    DF_pandas=DF_pandas.sort_values(by=["cosine_sim"], ascending=False)
    return(DF_pandas.head(20))
    
def Sort(sub_li):
    l = len(sub_li)
    for i in range(0, l):
        for j in range(0, l-i-1):
            if (sub_li[j][1] < sub_li[j + 1][1]):
                tempo = sub_li[j]
                sub_li[j]= sub_li[j + 1]
                sub_li[j + 1]= tempo
    return sub_li

def product_name_recommendation(x,TDF):
    gProd1 = modeldata.filter(modeldata['product_id'] == x).collect()[0]
    l=[]
    for row in data_collect:
        c=cosine_similarityy(row['product_nameIDF'], gProd1['product_nameIDF'])
        i=row['product_id']
        l+=[c]
    tit=[]
    for i in range(len(TDF['product_name'])):
        tit.append(l[i])
    TDF['titlesim']=tit
    TDF1=TDF.sort_values(by=["titlesim"], ascending=False)
    TDF1=TDF1.iloc[1:,:]
    return(TDF1)
    
def toprated(ci,RDF):
    return(description_recommendation(ci,RDF).sort_values(by=["rating"], ascending=False))
    
def description_recommendation(x,RDF):
    gProd1 = modeldata.filter(modeldata['product_id'] == x).collect()[0]
    l=[]
    for row in data_collect:
        c=cosine_similarityy(row['about_productIDF'], gProd1['about_productIDF'])
        i=row['product_id']
        l+=[(c)]
    rec=[]
    for i in range(len(RDF['product_name'])):
        rec.append(l[i])
    RDF['sim']=rec
    RDF1=RDF.sort_values(by=["sim"], ascending=False)
    RDF1=RDF1.iloc[1:,:]
    return(RDF1)
    
def category_recommendation(x,RDF):
    gProd1 = modeldata.filter(modeldata['index'] == x).collect()[0]
    l=[]
    for row in data_collect:
        c=cosine_similarityy(row['categoryIDF'], gProd1['categoryIDF'])
        i=row['index']
        l+=[(c)]
    rec=[]  
    for i in range(len(RDF['product_name'])):
        rec.append(l[i])
    RDF['sim']=rec
    RDF1=RDF.sort_values(by=["sim"], ascending=False)
    RDF1=RDF1.iloc[1:,:]
    return(RDF1.head(50))

# databasedata=database_extraction()
spark = SparkSession.builder.master("local[1]").appName("SparkByExamples.com").getOrCreate()
data = database_extraction()
# data = spark.read.csv('Amazon_Kaggle_Dataset.csv',header=True, inferSchema=True )

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category'] = data[category_columns].apply(lambda row: '|'.join(row.dropna().astype(str)), axis=1)


In [23]:
data.head()

Unnamed: 0,product_id,user_id,user_name,review_id,review_title,review_content,BrandName,product_name,img_link,product_link,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,category
0,B08L12N5H1,"AGTDSNT2FKVYEPDPXAA673AIS44A,AER2XFSWNN4LAUCJ5...","Divya,Dr Nefario,Deekshith,Preeti,Prasanth R,P...","R2KKTKM4M9RDVJ,R1O692MZOBTE79,R2WRSEWL56SOS4,R...","Decent product,doesn't pick up sand,Ok ok,Must...","Does the job well,doesn't work on sand. though...",Eureka,Eureka Forbes car Vac 100 Watts Powerful Sucti...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Eureka-Forbes-Vacuum-Cle...,2099.0,2499.0,16,|,992,No Installation is provided for this product|1...,"Home&Kitchen|Kitchen&HomeAppliances|Vacuum,Cle..."
1,B0BQRJ3C47,Abdul Gafur,RQXD5SAMMPC6L,Awesome Product,Quick delivery.Awesome ProductPacking was good...,https://m.media-amazon.com/images/I/31-q0xhaTA...,REDTECH,"REDTECH USB-C to Lightning Cable 3.3FT, [Apple...",https://www.amazon.in/REDTECH-Lightning-Certif...,,249.0,999.0,75,5.0,💎[The Fastest Charge] - This iPhone USB C cabl...,AGJC5O5H5BBXWUV7WRIEIOOR3TVQ,Computers&Accessories|Accessories&Peripherals|...
2,B0BP7XLX48,"AF7EOXYL5K36BDP6PXF6K2TL5TPA,AEN7NV2P5WNHM7EXC...","Amazon Customer,kamal sahota,Pankaj,luvsmplcty...","R1L2JNO4Y3BHYF,R2346F22YLZ9IG,R3A4GAQTCPE5U7,R...","Good,Superb quality,Good products nice one,Wo...",Product is good in quality. Working good with ...,Syncwire,Syncwire LTG to USB Cable for Fast Charging Co...,https://m.media-amazon.com/images/I/317OoQfs1g...,https://www.amazon.in/Syncwire-Cable-Charging-...,399.0,1999.0,80,5.0,5,This sturdy and durable cable made of tpe and ...,Computers&Accessories|Accessories&Peripherals|...
3,B09ZHCJDP1,"AFLLEPVLIAH2DFSHAZ77KWFM72ZA,AHY2YZWK63CNZ626M...","Rambeer kumar,Ramesh,Digambar Shelke,awadhesh ...","R76XPXMKXLWKH,R23S77AWPH5FP5,RK7Q6W5FOPESC,R2X...","Very responsive and stylish mouse,Simply Aweso...",I really like this wireless mouse it has becam...,Amazon,Amazon Basics Wireless Mouse | 2.4 GHz Connect...,https://m.media-amazon.com/images/I/31+Rg6Z46d...,https://www.amazon.in/Wireless-Connection-Batt...,499.0,1000.0,50,5.0,23,Reliable wireless connection up to 10m|Advance...,Computers&Accessories|Accessories&Peripherals|...
4,B0BQ3K23Y1,"AGRJZJKWKIE573KM5FWPOH4F7YCA,AEHV4VOLDQX5XYA42...","Amazon Customer,Neha Mehta,Nischal Agarwal,tha...","R3907SDNN9VR5Y,R1NNMXA39722T8,RXQNT49DKJ26S,R2...","Oratech Best Coffee Frother,Great,My review ab...","Overall, I love this Oratech Coffee frother an...",Oratech,"Oratech Coffee Frother electric, milk frother ...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Oratech-electric-cappucc...,279.0,499.0,44,4.8,28,-Make delicious milk foam creamer for your dri...,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...


In [24]:
data = spark.createDataFrame(data).withColumn("index", lit(1) + monotonically_increasing_id())
modeldata = model_pipeline_transformation(data)
DF_pandas = data.toPandas()
TDF=data.toPandas()
RDF=data.toPandas()
data_collect = modeldata.collect()

In [27]:
print("Select one of the option")
print("1. Search ")
print("2. Product Based Recommendation")
print("3. Product Summary Based Recommendation")
print("4. Rating Based")
print("5. Category Based Recommendation")
choice=int(input("Selected option is:"))
if choice==1:
    string=input("Please enter product name:")
    searchdata=search(string,DF_pandas)
    display(searchdata[['product_name','category','actual_price','discounted_price','rating','rating_count','about_product','cosine_sim']])
elif choice==2:
    x=input("Please enter the product ID:")
    name=product_name_recommendation(x,TDF)
    display(name['product_id'])
elif choice==3:
    ci=input("Please enter the product ID:")
    description=description_recommendation(ci,RDF)
    display(description[['product_name','category','actual_price','discounted_price','rating','rating_count','about_product']])
elif choice==4:
    ci=input("Please enter the product ID:")
    description_recommendation_rating=toprated(ci,RDF)
    display(description_recommendation_rating[['product_name','category','actual_price','discounted_price','rating','rating_count','about_product']])
elif choice==5:
    ci=int(input("Please enter the product ID:"))
    category=category_recommendation(ci,RDF)
    display(category[['product_name','category','actual_price','discounted_price','rating','rating_count','about_product',]])
else:
    print("Please select correct choice")

Select one of the option
1. Search 
2. Product Based Recommendation
3. Product Summary Based Recommendation
4. Rating Based
5. Category Based Recommendation


Unnamed: 0,product_name,category,actual_price,discounted_price,rating,rating_count,about_product
1,"REDTECH USB-C to Lightning Cable 3.3FT, [Apple...",Computers&Accessories|Accessories&Peripherals|...,999.0,249.0,5.0,💎[The Fastest Charge] - This iPhone USB C cabl...,AGJC5O5H5BBXWUV7WRIEIOOR3TVQ
2,Syncwire LTG to USB Cable for Fast Charging Co...,Computers&Accessories|Accessories&Peripherals|...,1999.0,399.0,5.0,5,This sturdy and durable cable made of tpe and ...
3,Amazon Basics Wireless Mouse | 2.4 GHz Connect...,Computers&Accessories|Accessories&Peripherals|...,1000.0,499.0,5.0,23,Reliable wireless connection up to 10m|Advance...
6,"Instant Pot Air Fryer, Vortex 2QT, Touch Contr...",Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,20049.0,4995.0,4.8,3964,VOLTAGE (230 Volts): Instant Vortex Air Fryer ...
5,Swiffer Instant Electric Water Heater Faucet T...,"Home&Kitchen|Heating,Cooling&AirQuality|WaterH...",1999.0,1439.0,4.8,53803,✔Quick Electric Hot Water Tap Heating tube: hi...
...,...,...,...,...,...,...,...
1346,SHREENOVA ID116 Plus Bluetooth Fitness Smart W...,Electronics|WearableTechnology|SmartWatches,1999.0,281.0,2.8,87,"✅ All-day activity tracking: Track steps, dist..."
1347,MR. BRAND Portable USB Juicer Electric USB Jui...,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,2199.0,499.0,2.8,109,Portable & Convenient to Charge: This little b...
1348,Green Tales Heat Seal Mini Food Sealer-Impulse...,Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,300.0,161.0,2.6,24,Sealing Machine use for snack food saver stora...
1349,"Personal Size Blender, Portable Blender, Batte...",Home&Kitchen|Kitchen&HomeAppliances|SmallKitch...,1499.0,669.0,2.3,13,✔【Easy to carry around】- This handheld blender...
