In [49]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
# Step 1: Load and preprocess the dataset
data = pd.read_csv('cleaned_data.csv')

In [51]:
data.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Polarity,Subjectivity,Sentiment Label
0,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,0.073675,0.356294,Positive
1,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,0.55,0.625,Positive
2,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,0.512891,0.56875,Positive
3,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses,0.17875,0.533125,Positive
4,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits,0.13375,0.607778,Positive


In [52]:
data = data[['Clothing ID', 'Age', 'Recommended IND', 'Review Text', 'Rating']]
data.dropna(subset=['Review Text'], inplace=True)
data.reset_index(drop=True, inplace=True)
data['Review Text'] = data['Review Text'].str.lower()

In [53]:
# Step 2: Create the item-user matrix
item_user_matrix = data.pivot_table(index='Clothing ID', columns='Age', values='Rating', fill_value=0)
item_user_matrix = item_user_matrix.to_numpy()

In [54]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Assuming 'data' DataFrame contains the necessary data with columns 'Clothing ID', 'Age', and 'Rating'

# Step 1: Get the item IDs from the 'Clothing ID' column in the 'data' DataFrame
item_ids = data['Clothing ID'].unique()

# Step 2: Create the item-user matrix using pivot_table
item_user_matrix = data.pivot_table(index='Clothing ID', columns='Age', values='Rating', fill_value=0)

# Convert the pivot table to a NumPy array for cosine similarity
item_user_matrix = item_user_matrix.to_numpy()

# Step 3: Calculate cosine similarity
item_similarity = cosine_similarity(item_user_matrix)

# Convert the similarity matrix into a DataFrame with row and column labels
item_similarity_df = pd.DataFrame(
    item_similarity,
    index=item_ids,
    columns=item_ids
)

# Print the item similarity DataFrame
print(item_similarity_df)



          1077      1049      847   1080  858       1095      767   1065  \
1077  1.000000  0.000000  0.000000   0.0   0.0  0.148478  0.000000   0.0   
1049  0.000000  1.000000  1.000000   0.0   0.0  0.000000  0.000000   0.0   
847   0.000000  1.000000  1.000000   0.0   0.0  0.000000  0.000000   0.0   
1080  0.000000  0.000000  0.000000   1.0   1.0  0.000000  0.000000   0.0   
858   0.000000  0.000000  0.000000   1.0   1.0  0.000000  0.000000   0.0   
...        ...       ...       ...   ...   ...       ...       ...   ...   
630   0.272166  0.000000  0.000000   0.0   0.0  0.000000  0.000000   0.0   
181   0.000000  0.302372  0.302372   0.0   0.0  0.000000  0.000000   0.0   
721   0.000000  0.000000  0.000000   0.0   0.0  0.000000  0.421076   0.0   
262   0.000000  0.000000  0.000000   1.0   1.0  0.000000  0.000000   0.0   
522   0.000000  0.000000  0.000000   0.0   0.0  0.000000  1.000000   0.0   

      853       1120  ...      227       487       387   640       345   \
1077   0.0  

In [55]:
# Step 4: Feature engineering - Process text data for 'Review Text'
vectorizer = CountVectorizer(stop_words='english')
review_matrix = vectorizer.fit_transform(data['Review Text'])
review_similarity = cosine_similarity(review_matrix)
review_similarity_df = pd.DataFrame(review_similarity, index=data.index, columns=data.index)

In [56]:
# Step 5: Combine the similarity matrices using weighted average (you can adjust the weights as needed)
alpha = 0.7
combined_similarity = alpha * item_similarity_df + (1 - alpha) * review_similarity_df

In [58]:
# Step 6: Define the recommendation function
def get_recommendations(clothing_id, top_n=5):
    # Get the combined similarity scores for the given clothing_id
    sim_scores = combined_similarity[clothing_id]

    # Get the top_n most similar clothing items
    top_items = sim_scores.nlargest(top_n + 1).drop(clothing_id)
    
    # Get the clothing details for the top items
    recommended_items = data[data['Clothing ID'].isin(top_items.index)]

    return recommended_items

# Step 7: Get recommendations for a specific item
clothing_id = 22  # Replace with the desired clothing ID

# Get recommendations using the defined function
recommendations = get_recommendations(clothing_id)

# Step 8: Remove duplicate rows based on 'Clothing ID' from the recommendations DataFrame
recommendations = recommendations.drop_duplicates(subset='Clothing ID', keep='first')

# Step 9: Print the recommendations for the specific item
print("Recommendations for Clothing ID", clothing_id)
print(recommendations[['Clothing ID', 'Review Text', 'Rating']])


Recommendations for Clothing ID 22
       Clothing ID                                        Review Text  Rating
768            154  i love the design and pattern of this top. it ...       4
1555           371  cute and very soft, but the elastic wasn't sew...       1
2349          1093  i was very excited about receiving this dress....       3
8260           794  i love this sweatshirt. it's super chic and se...       4
10871          808  i love this! very comfortable & cute dress. tr...       5
