In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

In [2]:
path = os.path.join(os.getcwd(), 'Data', 'input_search_DB.csv')
df = pd.read_csv(path)
df = df[['order_id', 'order_customer_name', 'product_name', 'part_type_name']]
df.head()

Unnamed: 0,order_id,order_customer_name,product_name,part_type_name
0,2200006,得意先_1,A　2023年3月号 定期演奏会,['本文1']
1,2107551,得意先_7,アーティストリスト2022年,"['本文1', '本文1', '本文2', '本文2', '表紙1', '表紙1']"
2,2200898,得意先_8,ミュージアムリーフレット,['本文']
3,2202767,得意先_148,A小学校　2024学校案内パンフレット,"['本文1', '本文2', '表紙1', '表紙2']"
4,2203087,得意先_14,A社統合報告書2022（英文）,"['本文1', '表紙1']"


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   order_id             98 non-null     int64 
 1   order_customer_name  98 non-null     object
 2   product_name         98 non-null     object
 3   part_type_name       98 non-null     object
dtypes: int64(1), object(3)
memory usage: 3.2+ KB


In [4]:
sample_row = pd.Series({'order_id': 2204908, 'order_customer_name': '得意先_8', 'product_name': '「ABS」展B2ポスター', 'part_type_name': '[本文1, 本文2, 表紙1, 表紙2]'})

df_with_sample = pd.concat([df, sample_row.to_frame().T], ignore_index=True)

In [5]:
df_with_sample.loc[df_with_sample['order_id'] == 2204908]

Unnamed: 0,order_id,order_customer_name,product_name,part_type_name
98,2204908,得意先_8,「ABS」展B2ポスター,"[本文1, 本文2, 表紙1, 表紙2]"


In [6]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df_with_sample['product_name'])
cosine_similarities = cosine_similarity(tfidf_matrix[-1:], tfidf_matrix[:-1]).flatten()
df['similarity'] = cosine_similarities

In [7]:
result_df = df.sort_values(by='similarity', ascending=False).head(5)
result_df.head()

Unnamed: 0,order_id,order_customer_name,product_name,part_type_name,similarity
40,2204439,得意先_152,「ABS」展B2ポスター,"['再校', '再校', '初校', '本文1']",1.0
41,2204440,得意先_152,「ABS」展チラシ,"['再校', '再校', '初校', '本文1']",0.467065
2,2200898,得意先_8,ミュージアムリーフレット,['本文'],0.0
3,2202767,得意先_148,A小学校　2024学校案内パンフレット,"['本文1', '本文2', '表紙1', '表紙2']",0.0
4,2203087,得意先_14,A社統合報告書2022（英文）,"['本文1', '表紙1']",0.0
