In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl  
import matplotlib.pyplot as plt  
import seaborn as sns  
import sklearn
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [63]:
data = pd.read_csv('Womens Clothing E-Commerce Reviews.csv')

In [64]:
data.shape

(23486, 11)

In [65]:
data.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [66]:
data.describe()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,23486.0,23486.0,23486.0,23486.0,23486.0,23486.0
mean,11742.5,918.118709,43.198544,4.196032,0.822362,2.535936
std,6779.968547,203.29898,12.279544,1.110031,0.382216,5.702202
min,0.0,0.0,18.0,1.0,0.0,0.0
25%,5871.25,861.0,34.0,4.0,1.0,0.0
50%,11742.5,936.0,41.0,5.0,1.0,1.0
75%,17613.75,1078.0,52.0,5.0,1.0,3.0
max,23485.0,1205.0,99.0,5.0,1.0,122.0


In [67]:
data.isnull().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [68]:
data.drop(data.columns[0],inplace=True, axis=1)
data.isnull().sum()

Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

In [69]:
for x in ["Title", "Division Name", "Department Name", "Class Name", "Review Text"]:
    data = data[data[x].notnull()] 

In [70]:
data.isnull().sum()

Clothing ID                0
Age                        0
Title                      0
Review Text                0
Rating                     0
Recommended IND            0
Positive Feedback Count    0
Division Name              0
Department Name            0
Class Name                 0
dtype: int64

In [71]:
data.shape

(19662, 10)

In [72]:
data.head()

Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses
5,1080,49,Not for the very petite,"I love tracy reese dresses, but this one is no...",2,0,4,General,Dresses,Dresses
6,858,39,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,5,1,1,General Petite,Tops,Knits


In [73]:
data.describe()

Unnamed: 0,Clothing ID,Age,Rating,Recommended IND,Positive Feedback Count
count,19662.0,19662.0,19662.0,19662.0,19662.0
mean,921.297274,43.260808,4.183145,0.818177,2.652477
std,200.227528,12.258122,1.112224,0.385708,5.834285
min,1.0,18.0,1.0,0.0,0.0
25%,861.0,34.0,4.0,1.0,0.0
50%,936.0,41.0,5.0,1.0,1.0
75%,1078.0,52.0,5.0,1.0,3.0
max,1205.0,99.0,5.0,1.0,122.0


In [74]:
data.describe().T.drop('count',axis=1)

Unnamed: 0,mean,std,min,25%,50%,75%,max
Clothing ID,921.297274,200.227528,1.0,861.0,936.0,1078.0,1205.0
Age,43.260808,12.258122,18.0,34.0,41.0,52.0,99.0
Rating,4.183145,1.112224,1.0,4.0,5.0,5.0,5.0
Recommended IND,0.818177,0.385708,0.0,1.0,1.0,1.0,1.0
Positive Feedback Count,2.652477,5.834285,0.0,0.0,1.0,3.0,122.0


In [75]:
data['Clothing ID'].unique()

array([1077, 1049,  847, ...,  721,  262,  522])

In [76]:
data['Clothing ID'].value_counts()

1078    871
862     658
1094    651
1081    487
829     452
       ... 
201       1
1200      1
525       1
1127      1
522       1
Name: Clothing ID, Length: 1095, dtype: int64

In [77]:
data = data[['Clothing ID', 'Title', 'Review Text','Rating']]

In [18]:
data = data.dropna(subset=['Review Text'])

In [19]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Review Text'])

In [20]:
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

In [78]:
def recommend_items(item_id, cosine_similarities=cosine_similarities, data=data, top_n=5):
    indices = pd.Series(data.index, index=data['Clothing ID']).drop_duplicates()
    idx = indices[item_id]
    similarity_scores = list(enumerate(cosine_similarities[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_items_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    return data.iloc[top_items_indices]

In [89]:
data = data[['Clothing ID', 'Title', 'Review Text']]
data = data.dropna(subset=['Review Text'])
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Review Text'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

def calculate_accuracy(item_id, cosine_similarities=cosine_similarities, data=data, top_n=5, ground_truth=None):
    indices = pd.Series(data.index, index=data['Clothing ID']).drop_duplicates()
    idx = indices[item_id]
    similarity_scores = list(enumerate(cosine_similarities[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_items_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    recommended_items = data.iloc[top_items_indices]['Clothing ID'].tolist()


In [95]:
item_id = 1200  # Example item ID
recommended_items = recommend_items(item_id)
print(recommended_items)

       Clothing ID                 Title  \
3747          1080          Great dress!   
5553          1081      Beautiful color!   
14390         1158           Doesn't fit   
13134         1094  Cute and comfortable   
13823         1078                  Love   

                                             Review Text  Rating  
3747   Just received my dress today and i am pleased ...       5  
5553   This dress is happy! it is easy to wear and it...       4  
14390  Fun, sexy underwear but rides up in the front....       2  
13134  This dress is easy to dress up or down, and is...       5  
13823                      Love this dress and the color       5  


In [87]:
def calculate_accuracy(item_id, cosine_similarities=cosine_similarities, data=data, top_n=5, ground_truth=None):
    recommended_items = recommend_items(item_id, cosine_similarities, data, top_n)
    relevant_items = ground_truth[item_id]  # List of relevant items for the given item ID

    precision = 0.0
    num_relevant_items = len(relevant_items)
    num_correct_predictions = 0

    for i in range(len(recommended_items)):
        if recommended_items['Clothing ID'].iloc[i] in relevant_items:
            num_correct_predictions += 1
            precision += num_correct_predictions / (i + 1)

    if num_relevant_items > 0:
        precision /= num_relevant_items

    return precision

# Example ground truth data with ratings
ground_truth_data = {
    22: [5, 4, 5]  # Example ratings for item ID 22
}

item_id = 22  # Example item ID
map_score = calculate_accuracy(item_id, cosine_similarities, data, top_n=5, ground_truth=ground_truth_data)
print("MAP Score:", map_score)
item_id = 22  # Example item ID
recommended_items = recommend_items(item_id)
print(recommended_items)

MAP Score: 0.0
       Clothing ID                                              Title  \
1285           979                                 Cozy amazingness!!   
20625         1112                                Amazing soft jacket   
2573           864                                         So pretty!   
20747         1037                                        Really nice   
20940          961  This vest will work in the warmer winter in texas   

                                             Review Text  Rating  
1285   This beauty arrived in my store today...and de...       5  
20625  This jacket is so adorable on! i have no idea ...       5  
2573   I bought this in blue and can't wait to wear i...       5  
20747  I have been looking for leather or faux leathe...       5  
20940  The sherpa vest i received from retailer was t...       5  


In [83]:
ground_truth_data = {
    22: [5, 5, 5]  # Example ratings for item ID 22
}

In [84]:
item_id = 22  # Example item ID
precision, recall = calculate_accuracy(item_id, cosine_similarities, data, top_n=5)
print("Precision:", precision)
print("Recall:", recall)

Precision: 0.0
Recall: 0.0


MAP Score: 0.0


In [25]:
data = data[['Clothing ID', 'Title', 'Review Text']]
data = data.dropna(subset=['Review Text'])
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['Review Text'])
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)
def recommend_items(item_id, cosine_similarities=cosine_similarities, data=data, top_n=5):
    indices = pd.Series(data.index, index=data['Clothing ID']).drop_duplicates()
    idx = indices[item_id]
    similarity_scores = list(enumerate(cosine_similarities[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_items_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    return data.iloc[top_items_indices]
item_id = 22  # Example item ID
recommended_items = recommend_items(item_id)
print(recommended_items)

       Clothing ID                                              Title  \
1285           979                                 Cozy amazingness!!   
20625         1112                                Amazing soft jacket   
2573           864                                         So pretty!   
20747         1037                                        Really nice   
20940          961  This vest will work in the warmer winter in texas   

                                             Review Text  
1285   This beauty arrived in my store today...and de...  
20625  This jacket is so adorable on! i have no idea ...  
2573   I bought this in blue and can't wait to wear i...  
20747  I have been looking for leather or faux leathe...  
20940  The sherpa vest i received from retailer was t...  


In [26]:
item_id = 22  # Example item ID
recommended_items = recommend_items(item_id)
print(recommended_items)

       Clothing ID                                              Title  \
1285           979                                 Cozy amazingness!!   
20625         1112                                Amazing soft jacket   
2573           864                                         So pretty!   
20747         1037                                        Really nice   
20940          961  This vest will work in the warmer winter in texas   

                                             Review Text  
1285   This beauty arrived in my store today...and de...  
20625  This jacket is so adorable on! i have no idea ...  
2573   I bought this in blue and can't wait to wear i...  
20747  I have been looking for leather or faux leathe...  
20940  The sherpa vest i received from retailer was t...  


In [96]:
def precision_at_k(actual, predicted, k):
    # Take only the top k predicted items
    predicted_k = predicted[:k]
    
    # Count the number of correctly recommended items
    correct_items = set(actual) & set(predicted_k)
    
    # Calculate precision at k
    precision = len(correct_items) / k
    return precision

def recall_at_k(actual, predicted, k):
    # Take only the top k predicted items
    predicted_k = predicted[:k]
    
    # Count the number of correctly recommended items
    correct_items = set(actual) & set(predicted_k)
    
    # Calculate recall at k
    recall = len(correct_items) / len(actual)
    return recall

def average_precision(actual, predicted):
    # Calculate precision at each position
    precision = []
    num_correct = 0
    for i, item in enumerate(predicted):
        if item in actual:
            num_correct += 1
            precision.append(num_correct / (i + 1))
    
    # Calculate mean average precision
    if len(precision) == 0:
        return 0
    else:
        return sum(precision) / len(actual)

# Example ground truth and predicted recommendations
ground_truth = [1, 2, 3, 4, 5]
predicted_recommendations = [3, 4, 5, 6, 7]

# Calculate precision at k
k = 3
precision = precision_at_k(ground_truth, predicted_recommendations, k)
print("Precision at", k, ":", precision)

# Calculate recall at k
recall = recall_at_k(ground_truth, predicted_recommendations, k)
print("Recall at", k, ":", recall)

# Calculate mean average precision
map_score = average_precision(ground_truth, predicted_recommendations)
print("Mean Average Precision (MAP):", map_score)


Precision at 3 : 1.0
Recall at 3 : 0.6
Mean Average Precision (MAP): 0.6


In [34]:
data['Sentiment'] = data['Review Text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [35]:
def get_sentiment_label(score):
    if score > 0:
        return 'Positive'
    elif score < 0:
        return 'Negative'
    else:
        return 'Neutral'

In [36]:
data['Sentiment Label'] = data['Sentiment'].apply(get_sentiment_label)

In [37]:
sentiment_counts = data['Sentiment Label'].value_counts()
print(sentiment_counts)

Positive    18443
Negative     1141
Neutral        78
Name: Sentiment Label, dtype: int64


In [38]:
def get_sentiment_polarity(text):
    return TextBlob(text).sentiment.polarity

In [39]:
data['Sentiment Polarity'] = data['Review Text'].apply(get_sentiment_polarity)

In [40]:
sentiment_by_clothing = data.groupby('Clothing ID')['Sentiment Polarity'].mean().reset_index()

In [41]:
print(sentiment_by_clothing)

      Clothing ID  Sentiment Polarity
0               1            0.358650
1               2           -0.145525
2               4            0.282917
3               5            0.445833
4               7            0.214286
...           ...                 ...
1090         1200           -0.078704
1091         1202            0.270126
1092         1203            0.320286
1093         1204            0.226397
1094         1205            0.368519

[1095 rows x 2 columns]


In [44]:
import nltk

In [45]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/faizanarshad/nltk_data...


True

In [46]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Initialize the SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Perform sentiment analysis and add subjectivity and polarity indices
data['Sentiment'] = data['Review Text'].apply(lambda x: sia.polarity_scores(x))
data['Polarity'] = data['Sentiment'].apply(lambda x: x['compound'])
data['Subjectivity'] = data['Sentiment'].apply(lambda x: 1 if x['compound'] >= 0 else 0)

# Save the updated DataFrame to the file
data.to_csv('updated_data.csv', index=False)

In [47]:
data_1 = pd.read_csv('updated_data.csv')

In [99]:
data_1.head(10)

Unnamed: 0,Clothing ID,Title,Review Text,Sentiment,Sentiment Label,Sentiment Polarity,Polarity,Subjectivity
0,1077,Some major design flaws,I had such high hopes for this dress and reall...,"{'neg': 0.027, 'neu': 0.792, 'pos': 0.181, 'co...",Positive,0.073675,0.9427,1
1,1049,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...","{'neg': 0.226, 'neu': 0.34, 'pos': 0.434, 'com...",Positive,0.55,0.5727,1
2,847,Flattering shirt,This shirt is very flattering to all due to th...,"{'neg': 0.0, 'neu': 0.7, 'pos': 0.3, 'compound...",Positive,0.512891,0.9291,1
3,1080,Not for the very petite,"I love tracy reese dresses, but this one is no...","{'neg': 0.0, 'neu': 0.853, 'pos': 0.147, 'comp...",Positive,0.17875,0.9419,1
4,858,Cagrcoal shimmer fun,I aded this in my basket at hte last mintue to...,"{'neg': 0.023, 'neu': 0.881, 'pos': 0.096, 'co...",Positive,0.13375,0.8004,1
5,858,"Shimmer, surprisingly goes with lots","I ordered this in carbon for store pick up, an...","{'neg': 0.046, 'neu': 0.901, 'pos': 0.053, 'co...",Positive,0.171635,-0.0909,0
6,1077,Flattering,I love this dress. i usually get an xs but it ...,"{'neg': 0.0, 'neu': 0.818, 'pos': 0.182, 'comp...",Positive,0.0025,0.7175,1
7,1077,Such a fun dress!,"I'm 5""5' and 125 lbs. i ordered the s petite t...","{'neg': 0.12, 'neu': 0.753, 'pos': 0.126, 'com...",Positive,0.2042,-0.3724,0
8,1077,Dress looks like it's made of cheap material,Dress runs small esp where the zipper area run...,"{'neg': 0.044, 'neu': 0.877, 'pos': 0.079, 'co...",Negative,-0.097149,0.3578,1
9,1095,Perfect!!!,More and more i find myself reliant on the rev...,"{'neg': 0.026, 'neu': 0.899, 'pos': 0.075, 'co...",Positive,0.244156,0.552,1


In [97]:
ground_truth_data = {22: [25, 42, 56, 78, 91]}  # Example ground truth data

def calculate_accuracy(item_id, cosine_similarities, data, top_n, ground_truth=None):
    indices = pd.Series(data.index, index=data['Clothing ID']).drop_duplicates()
    idx = indices[item_id]
    similarity_scores = list(enumerate(cosine_similarities[idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_items_indices = [i[0] for i in similarity_scores[1:top_n+1]]
    recommended_items = data.iloc[top_items_indices]['Clothing ID'].tolist()
    
    precision = 0.0
    recall = 0.0
    f1_score = 0.0
    
    if ground_truth and item_id in ground_truth:
        relevant_items = ground_truth[item_id]  # List of relevant items for the given item ID
        
        # Calculate accuracy metrics (e.g., precision, recall, F1 score) using recommended_items and relevant_items
        # ...
        # Compute precision, recall, and F1-score based on the recommended_items and relevant_items

    return precision, recall, f1_score

item_id = 19662  # Example item ID
precision, recall, f1_score = calculate_accuracy(item_id, cosine_similarities, data, top_n=5, ground_truth=ground_truth_data)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)


KeyError: 19662