In [1]:
import sys
from pathlib import Path

sys.path.append('../src')
from data_processing import loading_data, basic_cleaning
from feature_engineering import creating_features, extracting_brands, creating_rating

In [2]:
# Load the review and product tables, run the project's basic cleaning on the
# reviews, then attach product metadata to every review.
reviews, products = loading_data()
cleaning_reviews = basic_cleaning(reviews)

# Left join: every cleaned review is kept even if no product row shares its
# 'key'. Columns present in both tables receive pandas' default '_x' (reviews)
# and '_y' (products) suffixes.
merging_data = cleaning_reviews.merge(products, on='key', how='left')

# A left join silently duplicates reviews when 'key' repeats in `products`.
# Fail loudly here rather than let inflated rows skew every later statistic.
assert len(merging_data) == len(cleaning_reviews), (
    f"Merge changed the row count: {len(cleaning_reviews)} -> {len(merging_data)}; "
    "check `products` for duplicate 'key' values"
)

Started with 21674 reviews
After cleaning: 21674 reviews


In [3]:
# Force the review text to plain strings before feature extraction. Missing
# values are replaced with '' first: a bare astype(str) would turn NaN into the
# literal text "nan", which would then be scored as if it were a real review.
merging_data['clean_text'] = merging_data['clean_text'].fillna('').astype(str)

# Project helpers from feature_engineering; each returns the frame that the
# next step consumes.
final_data = creating_features(merging_data)
final_data = extracting_brands(final_data)

# Map each row's 'rating' value to a sentiment label via creating_rating.
# NOTE(review): 'rating' looks like the product-level column (it sits beside
# 'rating_count' from the products table), whereas the per-review score is
# 'stars' -- confirm that labelling reviews by product rating is intended.
final_data['sentiment_category'] = final_data['rating'].apply(creating_rating)

Extracting text features...
Extracting Spacy features...
Extracting VADER features...
clean_text sample: super good, don't get me wrong. but i came for the caramel and brownies, not the sweet cream. the packaging made it seem like brownies were packed and bountiful *crying frowny emoji* i'd say the taste of this was amazing, but the ratio of brownie to sweet cream was disappointing. liked it regardless but probably won't buy again simply because it didn't live up to its promising package. i'll find another one that has a better ratio and wayyy more yummy chewy brownies. overall, good flavor, texture, idea, and brownies. not so great caramel/sweet cream/ brownie ratio. just add more brownies. please.
Type: <class 'str'>


In [4]:
# Show the first rows of the text-length and sentiment features as a quick
# sanity check of the feature-engineering step.
preview_columns = [
    'clean_text',
    'text_length',
    'word_count',
    'spacy_polarity',
    'spacy_subjectivity',
    'vader_compound',
    'vader_positive',
    'vader_negative',
    'vader_neutral',
]
print(final_data[preview_columns].head())

                                          clean_text  text_length  word_count  \
0  super good, don't get me wrong. but i came for...          603         102   
1  i decided to try it out although i’m not a hug...          651         129   
2  my caramel core begins to disappear about half...          715         140   
3  why are people complaining about the blonde br...          565         106   
4  this ice cream is worst ice cream i’ve ever ta...          623         120   

   spacy_polarity  spacy_subjectivity  vader_compound  vader_positive  \
0        0.276094            0.592989          0.9739           0.289   
1        0.064601            0.289146          0.9503           0.167   
2        0.037004            0.349603          0.7889           0.090   
3        0.224038            0.459615          0.9268           0.137   
4       -0.088095            0.549490         -0.4918           0.114   

   vader_negative  vader_neutral  
0           0.120          0.592  
1   

In [7]:
# Show the last rows of the same text-length and sentiment features, to check
# that the end of the table was processed like the beginning.
preview_columns = [
    'clean_text',
    'text_length',
    'word_count',
    'spacy_polarity',
    'spacy_subjectivity',
    'vader_compound',
    'vader_positive',
    'vader_negative',
    'vader_neutral',
]
print(final_data[preview_columns].tail())

                                              clean_text  text_length  \
21669  there was no chocolate ice cream in this at al...          133   
21670  this ice cream has no flavor at all. no one in...          110   
21671  absolutely love this flavor! the only thing th...          104   
21672  brilliant combo - love the cheesecake and brow...           84   
21673  has a delicious taste with all natural ingredi...           50   

       word_count  spacy_polarity  spacy_subjectivity  vader_compound  \
21669          24       -0.780000               0.910         -0.5379   
21670          23        0.266667               0.575         -0.5106   
21671          18        0.416667               0.700          0.8392   
21672          14        0.840625               0.800          0.8975   
21673           8        0.550000               0.700          0.7351   

       vader_positive  vader_negative  vader_neutral  
21669           0.085           0.203          0.713  
21670       

In [5]:
# Persist the engineered dataset and report what was written.
output_path = Path('../data/processed/feature_dataset.csv')

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise a fresh checkout fails here after the expensive steps ran.
output_path.parent.mkdir(parents=True, exist_ok=True)
final_data.to_csv(output_path, index=False)

print("Feature engineering done!!!")
print(f"Final dataset shape: {final_data.shape}")
print(f"Feature columns: {final_data.columns.tolist()}")

Feature engineering done!!!
Final dataset shape: (21674, 37)
Feature columns: ['brand_x', 'key', 'author', 'date', 'stars', 'title', 'helpful_yes', 'helpful_no', 'text', 'taste', 'ingredients_x', 'texture', 'likes', 'clean_text', 'brand_y', 'name', 'subhead', 'description', 'rating', 'rating_count', 'ingredients_y', 'text_length', 'word_count', 'sentence_count', 'exclamation_count', 'question_count', 'capital_ratio', 'spacy_polarity', 'spacy_subjectivity', 'entities_count', 'vader_compound', 'vader_positive', 'vader_negative', 'vader_neutral', 'brand_popularity', 'brand_average', 'sentiment_category']


In [6]:
# --- Descriptive statistics for the length and spaCy-based features ---
summary_columns = ['text_length', 'word_count', 'spacy_polarity', 'spacy_subjectivity']
feature_summary = final_data[summary_columns].describe()
print("Feature Summary:")
print(feature_summary)

# --- Number of missing values in every column of the feature table ---
missing_per_column = final_data.isnull().sum()
print("\nMissing values:")
print(missing_per_column)

# --- Correlation of each numeric feature with 'rating', strongest first ---
# NOTE(review): 'rating' appears to be the product-level column from the
# products table, not the per-review 'stars' score -- confirm this is the
# intended target for the comparison.
numeric_columns = ['rating', 'text_length', 'word_count', 'spacy_polarity', 'vader_compound', 'vader_positive']
# Variable name (including its spelling) is kept unchanged because other cells
# may refer to it.
correlction_matrix = final_data[numeric_columns].corr()
rating_correlations = correlction_matrix['rating'].sort_values(ascending=False)
print("\nCorrelation with Ratings")
print(rating_correlations)

# --- Side-by-side view of the two sentiment scores on the first five reviews ---
print("\nSentiment Methods Comparison:")
sample_reviews = final_data.head(5)
for _, row in sample_reviews.iterrows():
    # Only the first 100 characters of each review are shown.
    excerpt = row['clean_text'][:100]
    report_lines = [
        f"\nReview: {excerpt}...",
        f"Ratings: {row['rating']}",
        f"spaCy Polarity: {row['spacy_polarity']:.3f}",
        f"VADER Compound: {row['vader_compound']:.3f}",
    ]
    print("\n".join(report_lines))


Feature Summary:
        text_length    word_count  spacy_polarity  spacy_subjectivity
count  21674.000000  21674.000000    21674.000000        21674.000000
mean     228.280059     42.772585        0.310807            0.591225
std      166.591031     31.041878        0.280537            0.175976
min       11.000000      1.000000       -1.000000            0.000000
25%      117.000000     22.000000        0.138136            0.495833
50%      191.000000     36.000000        0.316276            0.600000
75%      281.000000     53.000000        0.494444            0.700000
max     2975.000000    514.000000        1.000000            1.000000

Missing values:
brand_x                   0
key                       0
author                  801
date                      0
stars                     0
title                  5399
helpful_yes               0
helpful_no                0
text                      0
taste                 17409
ingredients_x         17409
texture               17409
