https://www.datacamp.com/community/tutorials/recommender-systems-python

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import random

data = pd.read_csv("SQL_data.csv", encoding= 'unicode_escape')

In [2]:
total_reviews = []
for x in range(len(data['product_id'])):
    a = (random.randint(1,200))
    total_reviews.append(a)
    
data['Total_reviews'] = total_reviews

C = data['popular'].mean()

m = data['Total_reviews'].quantile(0.90)

reviews = data.copy().loc[data['Total_reviews'] >= m]

In [3]:
def weighted_rating(x, m=m, C=C):
    v = x['popular']
    R = x['Total_reviews']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [4]:
data['score'] = data.apply(weighted_rating, axis=1)

In [5]:
#Sort products based on score calculated above
data = data.sort_values('score', ascending=False)

#Print the top 15 products
data[['product_id', 'popular', 'Total_reviews', 'score']].head(10)

Unnamed: 0,product_id,popular,Total_reviews,score
1900,1901,5,200,8.298569
1258,1259,5,200,8.298569
2730,2731,5,200,8.298569
2700,2701,5,200,8.298569
1289,1290,5,200,8.298569
1693,1694,5,199,8.271831
2699,2700,5,199,8.271831
1355,1356,5,199,8.271831
3377,3378,5,199,8.271831
2769,2770,5,198,8.245093


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

#Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
data['name'] = data['name'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(data['name'])

#Output the shape of tfidf_matrix
tfidf_matrix.shape

(3651, 687)

In [7]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [8]:
indices = pd.Series(data.index, index=data['product_id']).drop_duplicates()

In [9]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the product that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all products with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the products based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 3 most similar products
    sim_scores = sim_scores[0:3]

    # Get the product indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 3 most similar products
    return data['product_id'].iloc[movie_indices]

In [10]:
aantal_recommendations = 3

In [11]:
recommendations = []
productid = []
for x,i in enumerate(data['product_id']):
    a = get_recommendations(i)
    a = a.to_numpy()
    for z, w in enumerate(a):
        recommendations.append(a[z])
    b = x + 1
    for y in range(aantal_recommendations):
        productid.append(b)
    #recommendations.append(a)

In [15]:
dataframe = pd.DataFrame({

    'Product_id': productid,
    'Recommendation_id': recommendations

})
print(dataframe)

       Product_id  Recommendation_id
0               1               2309
1               1               3526
2               1               3427
3               2               1262
4               2               2479
...           ...                ...
10948        3650               1696
10949        3650                479
10950        3651               1441
10951        3651                224
10952        3651               2659

[10953 rows x 2 columns]


In [16]:
dataframe.to_csv('recommendation_enginge.csv', index = False, header=True)