In [None]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

# Load the raw export, keep only the columns used downstream and give them
# shorter names. The whole transformation is a single non-mutating chain:
# renaming / assigning in place on a column slice of `review_data_raw` is a
# chained assignment on a possible view (SettingWithCopyWarning), which the
# global `warnings.filterwarnings("ignore")` would otherwise hide.
review_data_raw = pd.read_excel("review_data.xlsx")
review_data = (
    review_data_raw[['brand', 'source_platform', 'source_type',
                     'date_time', 'comment_text_translated']]
    .rename(columns={'comment_text_translated': 'review', 'date_time': 'date',
                     'source_platform': 'review_platform', 'source_type': 'review_type'})
    # Keep the calendar date only (time of day is dropped).
    .assign(date=lambda df: pd.to_datetime(df['date']).dt.date)
)

review_data.sample(5)

Unnamed: 0,brand,review_platform,review_type,date,review
4643,Stella Artois,Facebook,Social Media,2025-01-15,"Stella D‚ÄôItalia is here to satisfy all your pizza cravings. We offer fast and attentive service, with pizzas made to order from fresh and tasty ingredients.\n\nWhether it's a classic Margherita or a more original creation, we have something to delight your taste buds.\n\n[emoji_telephone_receiver] Order now or discover our options online at https://stelladitalia-express.be/."
3286,Jupiler,Facebook,Social Media,2025-04-19,This is what players and fans of Zulte Waregem can expect after promotion to the Jupiler Pro League
8217,Jupiler,Facebook,Social Media,2025-08-23,A new centurion in the #jupilerproleague! [emoji_United_States][emoji_hundred_points]
197,Stella Artois,Twitter,Social Media,2025-04-07,"@KwakDeZak @StoffelBundy Stella is everywhere! And if they don't have it, you better make yourself scarce."
13880,Jupiler,Facebook,Social Media,2025-09-27,It‚Äôs been a while since Club Brugge won at Sclessin‚Ä¶ [emoji_exploding_head]\n\nTake a look at the goal scorers in that game. [emoji_eyes]\n\n#jupilerproleague


## Review preprocessing

In [2]:
import re
from langdetect import detect_langs, DetectorFactory

# Corona-only subset: exact duplicate rows and rows with any missing value are
# dropped, then the index is renumbered from 0.
corona_review = (
    review_data.loc[review_data['brand'] == 'Corona']
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

"""
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Clean the reviews
"""
def clean_text(text):
    """Normalise a raw review into lowercase text restricted to a small character set.

    Steps: replace ``[emoji_*]`` placeholders and HTML tags with a space,
    lowercase, delete every character that is not an ASCII letter/digit,
    whitespace, basic punctuation (``. , ! ? % / -``) or a currency symbol
    (``$``, euro, pound, rupee), then collapse runs of whitespace.

    Parameters
    ----------
    text : Any
        Raw review. Missing values (``None`` / ``NaN``) yield ``""``; other
        non-string scalars are converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned review.
    """
    if pd.isna(text):
        return ""

    # A non-string cell (e.g. a review that was read as a number) would make
    # re.sub raise TypeError.
    text = str(text)

    # 1. Emoji placeholders like [emoji_tropical_drink] become a space, not "",
    #    so "cheers[emoji_x]corona" does not fuse into "cheerscorona".
    text = re.sub(r"\[emoji_[^\]]+\]", " ", text)
    # 2. Lowercase
    text = text.lower()
    # 3. HTML tags become a space for the same reason ("a<br>b" -> "a b").
    text = re.sub(r"<.*?>", " ", text)
    # 4. Keep only letters, numbers, basic punctuation and currency symbols.
    #    The euro / pound / rupee signs are written as \u escapes so the pattern
    #    does not depend on how the notebook file itself is encoded.
    #    NOTE(review): other characters are still deleted outright, so "1+1"
    #    becomes "11" and "don't" becomes "dont" - confirm that is intended.
    text = re.sub(r"[^a-zA-Z0-9\s.,!?\$\u20ac\u00a3\u20b9%/-]", "", text)
    # 5. Normalize whitespace (also absorbs the spaces introduced in steps 1 and 3)
    return re.sub(r"\s+", " ", text).strip()


# Store the normalised text in a new column; the raw `review` column is kept as is.
corona_review['review_cleaned'] = corona_review['review'].apply(clean_text)


"""
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Check if the review is in english or not
"""

# Fix langdetect's seed before any detection call so repeated runs of the notebook agree.
DetectorFactory.seed = 0  # for consistent results

def is_english(text):
    """Return True when langdetect's most probable language for ``text`` is English.

    Parameters
    ----------
    text : str
        Text to classify. Non-string or blank input is reported as not English.

    Returns
    -------
    bool
        True only if the top-ranked candidate is ``'en'`` with probability
        above 0.90; False otherwise, including when detection fails.
    """
    # Nothing to detect; avoids a guaranteed detection failure on blank text.
    if not isinstance(text, str) or not text.strip():
        return False

    # Imported here so the narrowed except clause below is self-contained;
    # langdetect is already a dependency of this notebook.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        langs = detect_langs(text)
    except LangDetectException:
        # Only detection failures are treated as "not English"; unrelated errors
        # (and Ctrl-C) now propagate instead of being silently swallowed.
        return False

    # get highest probability (default=None covers an empty candidate list)
    top = max(langs, key=lambda x: x.prob, default=None)
    return top is not None and top.lang == 'en' and top.prob > 0.90

# Language detection runs on the cleaned text, not on the raw review.
corona_review['is_english'] = corona_review['review_cleaned'].apply(is_english)


"""
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Translate non English reviews to English
"""
from deep_translator import GoogleTranslator

def translate_to_english(text):
    """Translate ``text`` to English with GoogleTranslator, best effort.

    Parameters
    ----------
    text : str
        Text in any language (source language is auto-detected).

    Returns
    -------
    The translated string, or ``text`` unchanged when it is blank / not a
    string, when the translation call fails, or when it yields nothing.
    """
    # Skip the network round-trip when there is nothing to translate.
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        translated = GoogleTranslator(source='auto', target='en').translate(text)
    except Exception:
        # fallback: return original (Exception, not a bare except, so Ctrl-C
        # still interrupts a long translation run)
        return text

    # Defensive: never overwrite the review with an empty / None result.
    return translated if translated else text
    
# Translate only the rows flagged as non-English. The mask is computed once and
# reused on both sides of the assignment. The translated text is passed through
# clean_text again because the translator's output is not guaranteed to be
# lowercase or limited to the cleaned character set, and the later keyword
# matching runs on lowercase `review_cleaned`.
needs_translation = corona_review['is_english'].eq(False)
corona_review.loc[needs_translation, 'review_cleaned'] = (
    corona_review.loc[needs_translation, 'review_cleaned']
    .apply(translate_to_english)
    .apply(clean_text)
)


"""
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Extract emojis from the reviews
"""

def extract_emojis(text):
    """Collect every ``[emoji_*]`` placeholder that appears in ``text``.

    Returns the placeholders as a list, in order of appearance and with
    repeats kept, or the string "No Emojis" when ``text`` is missing or
    contains no placeholder.
    """
    found = [] if pd.isna(text) else re.findall(r"\[emoji_[^\]]+\]", text)
    return found if found else "No Emojis"

# Extracted from the raw review (placeholders are stripped from `review_cleaned`).
# Each cell is either a list of placeholders or the string "No Emojis".
corona_review['emojis'] = corona_review['review'].apply(extract_emojis)


"""
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Extract hashtags from the reviews
"""
def extract_hashtags(text: str):
    """Find all ``#word`` hashtags in ``text``.

    Returns a list of hashtags (leading ``#`` included, original casing and
    order preserved), or the string "No Hashtags" when ``text`` is missing or
    has none.
    """
    if not pd.isna(text):
        tags = re.findall(r"#\w+", text)
        if tags:
            return tags
    return "No Hashtags"

# Extracted from the raw review, since '#' is removed from `review_cleaned`.
# Each cell is either a list of hashtags or the string "No Hashtags".
corona_review['hashtags'] = corona_review['review'].apply(extract_hashtags)

# Spot-check a few random rows after preprocessing.
corona_review.sample(5)



Unnamed: 0,brand,review_platform,review_type,date,review,review_cleaned,is_english,emojis,hashtags
89,Corona,Twitter,Social Media,2025-02-03,"@MattWalshBlog The good beer is brewed in Belgium not in the US. Your Budweiser tastes as bad as Corona, gringo.","mattwalshblog the good beer is brewed in belgium not in the us. your budweiser tastes as bad as corona, gringo.",True,No Emojis,No Hashtags
225,Corona,Untappd,Online Review,2025-07-28,Effe nulletje tussendoor,a little zero in between,False,No Emojis,No Hashtags
240,Corona,Twitter,Social Media,2025-10-17,"Trump is also delivering on his promise in terms of oil prices. Crude oil price fell below the USD 57 per barrel threshold. That is lower than in the pre-corona years, despite a dollar inflation of 26% since 2019. This is going to hurt the Russians a lot.","trump is also delivering on his promise in terms of oil prices. crude oil price fell below the usd 57 per barrel threshold. that is lower than in the pre-corona years, despite a dollar inflation of 26% since 2019. this is going to hurt the russians a lot.",True,No Emojis,No Hashtags
144,Corona,Facebook,Social Media,2025-07-11,"Time for a new promo! \n\nNot two beers, but six beers for the price of five beers. [emoji_clinking_beer_mugs]\n\n#cafeplastron #beers #summer #coronaextra","time for a new promo! not two beers, but six beers for the price of five beers. cafeplastron beers summer coronaextra",True,[[emoji_clinking_beer_mugs]],"[#cafeplastron, #beers, #summer, #coronaextra]"
167,Corona,Facebook,Social Media,2025-05-30,"[emoji_clinking_beer_mugs] Corona makes you dream of sunny beaches and brings the sun closer with every sip! Treat yourself to a nice cold Corona Extra or a Corona Cero 0.0% non-alcoholic with your favorite dish, and taste the sun with every sip.\n\nUntil June 29 at an irresistible summer price [emoji_sun]","corona makes you dream of sunny beaches and brings the sun closer with every sip! treat yourself to a nice cold corona extra or a corona cero 0.0% non-alcoholic with your favorite dish, and taste the sun with every sip. until june 29 at an irresistible summer price",True,"[[emoji_clinking_beer_mugs], [emoji_sun]]",No Hashtags


## Check the relevance of the reviews

In [3]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used for the semantic fallback in is_beer_related.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Embedding of a fixed reference sentence; is_beer_related compares each review
# embedding against it with cosine similarity.
ref_emb = model.encode("This is a review about beer", convert_to_tensor=True)

# NOTE(review): "corona" is an inclusion keyword, so any post that mentions the
# word passes the rule-based check unless an exclusion keyword is also present.
# The sampled output contains pandemic-related posts ("during corona",
# "stay home until corona is over") marked as beer-related - confirm whether
# this keyword should stay.
beer_keywords = ["beer", "brew", "lager", "ipa", "stout", "pils", "ale", "corona",
                 "heineken", "budweiser", "jupiler", "leffe", "craft beer", "bottle", "keg",
                 'taste', 'lime', 'lemon', 'brewery']

exclude_keywords = ["covid", "covid19", "covid-19", "coronavirus", "corona virus",
                    "pandemic", "outbreak", "lockdown", "quarantine", "isolation",
                    "social distancing", "stay at home", "covid test", "tested positive",
                    "positive test", "negative test", "antigen", "pcr", "rapid test",
                    "epidemic", "virus", "viral infection"]


def _compile_keyword_pattern(keywords):
    """Build one case-insensitive regex matching any keyword at the start of a word."""
    if not keywords:
        return re.compile(r"(?!)")  # never matches
    alternatives = "|".join(re.escape(kword) for kword in keywords)
    # Leading \b only: inflections still match ("beers", "brewed", "covid19"),
    # but a keyword buried inside another word does not ("ale" in "sale",
    # "ipa" in "participate", "virus" in "antivirus").
    return re.compile(r"\b(?:" + alternatives + r")", re.IGNORECASE)


# Compiled once when the cell runs; re-run the cell after editing the keyword lists.
_EXCLUDE_PATTERN = _compile_keyword_pattern(exclude_keywords)
_BEER_PATTERN = _compile_keyword_pattern(beer_keywords)


def is_beer_related(text, threshold=0.45):
    """Decide whether a review is about beer rather than the pandemic.

    The checks run in order:
      1. any exclusion keyword at the start of a word -> False
      2. any beer keyword at the start of a word      -> True
      3. otherwise, cosine similarity between the text embedding and
         ``ref_emb`` is compared with ``threshold``.

    Parameters
    ----------
    text : str
        Review text (the notebook passes the lowercase ``review_cleaned``;
        matching is case-insensitive regardless).
    threshold : float, default 0.45
        Minimum cosine similarity for the semantic fallback to return True.

    Returns
    -------
    bool
    """
    # Blank or non-string input carries no signal and would break the matching.
    if not isinstance(text, str) or not text.strip():
        return False

    # Rule-based exclusion
    if _EXCLUDE_PATTERN.search(text):
        return False

    # Rule-based inclusion
    if _BEER_PATTERN.search(text):
        return True

    # Semantic fallback
    emb = model.encode(text, convert_to_tensor=True)
    score = util.cos_sim(emb, ref_emb).item()
    return score >= threshold


# Classify every cleaned review; rows that fall through to the semantic
# fallback are embedded one at a time.
corona_review["is_beer_related"] = corona_review["review_cleaned"].apply(is_beer_related)

# Spot-check a few random rows with their relevance flag.
corona_review.sample(5)

Unnamed: 0,brand,review_platform,review_type,date,review,review_cleaned,is_english,emojis,hashtags,is_beer_related
40,Corona,Facebook,Social Media,2025-04-25,"More sad news this week [emoji_pensive_face] Paul Loyen, also much too young and unexpectedly passed away. Until a few years ago, always lived here on the street. Often you would pop in for a Jupiler Blue or to quickly play some cards. During corona, you helped us a lot with refreshing the caf√©. Still very grateful for that. May you rest well up there. Much strength to family & friends [emoji_folded_hands_medium-light_skin_tone][emoji_black_heart][emoji_dizzy]","more sad news this week paul loyen, also much too young and unexpectedly passed away. until a few years ago, always lived here on the street. often you would pop in for a jupiler blue or to quickly play some cards. during corona, you helped us a lot with refreshing the caf. still very grateful for that. may you rest well up there. much strength to family friends",True,"[[emoji_pensive_face], [emoji_folded_hands_medium-light_skin_tone], [emoji_black_heart], [emoji_dizzy]]",No Hashtags,True
111,Corona,Facebook,Social Media,2025-03-19,"Corona beer 0%, this week still at 1+1 free!","corona beer 0%, this week still at 11 free!",True,No Emojis,No Hashtags,True
248,Corona,Tiktok,Social Media,2025-10-16,PLEASE STAY HOME UNTIL CORONA IS OVER #foryou#belgium[emoji_Belgium] #coronatime#blowthisup#fyp,please stay home until corona is over foryoubelgium coronatimeblowthisupfyp,True,[[emoji_Belgium]],"[#foryou, #belgium, #coronatime, #blowthisup, #fyp]",True
198,Corona,Instagram,Social Media,2025-08-14,"Find me at the beach this Friday to celebrate 100 years @corona with @vilebrequin [emoji_shorts]\n\n[emoji_round_pushpin] Riverwoods Knokke\n[emoji_eight-thirty] 2pm \n\nThis is an open event, everybody is welcome and hope to see you there [emoji_sun_with_face][emoji_umbrella_on_ground]\n\n#advertising #corona100 #thisisliving","find me at the beach this friday to celebrate 100 years corona with vilebrequin riverwoods knokke 2pm this is an open event, everybody is welcome and hope to see you there advertising corona100 thisisliving",True,"[[emoji_shorts], [emoji_round_pushpin], [emoji_eight-thirty], [emoji_sun_with_face], [emoji_umbrella_on_ground]]","[#advertising, #corona100, #thisisliving]",True
74,Corona,Facebook,Social Media,2025-04-30,"1‚Ç¨ for Corona? Yes, yes, you read that right... Come refresh yourself on April 30th and May 1st! [emoji_beer_mug][emoji_fire] #CoronaTime","1‚Ç¨ for corona? yes, yes, you read that right... come refresh yourself on april 30th and may 1st! coronatime",True,"[[emoji_beer_mug], [emoji_fire]]",[#CoronaTime],True


In [4]:
# Optional manual inspection of the rejected reviews (left disabled):
# pd.set_option('display.colheader_justify', 'left')
# corona_review[corona_review['is_beer_related'] == False]['review_cleaned']
# How many reviews were kept (True) vs. rejected (False) by the relevance filter.
corona_review['is_beer_related'].value_counts()

is_beer_related
True     260
False     35
Name: count, dtype: int64

### Remove Non-Beer related reviews

In [5]:
# Keep only the rows flagged as beer-related and renumber the index from 0.
# This rebinds `corona_review`, so the rejected rows are gone from here on.
corona_review = (
    corona_review
    .loc[corona_review['is_beer_related'].eq(True)]
    .reset_index(drop=True)
)

corona_review.sample(5)

Unnamed: 0,brand,review_platform,review_type,date,review,review_cleaned,is_english,emojis,hashtags,is_beer_related
172,Corona,Twitter,Social Media,2025-08-08,"@Bob4Florida @quetzalponk @lang_domin97000 @BoxCarLabs @dinosaurs1969 Corona is not German style because it contains corn. The same goes for Stouts that contain adjuncts. German style contains only water, barley, hops & yeast, nothing more or it isn't German style.\n\nTrappist is not a style, it contains multiple. It only indicates that it is brewed by monks.","bob4florida quetzalponk langdomin97000 boxcarlabs dinosaurs1969 corona is not german style because it contains corn. the same goes for stouts that contain adjuncts. german style contains only water, barley, hops yeast, nothing more or it isnt german style. trappist is not a style, it contains multiple. it only indicates that it is brewed by monks.",True,No Emojis,No Hashtags,True
199,Corona,Instagram,Social Media,2025-08-10,"[emoji_beer_mug] BEGINNER OR EXPERT? WE DON‚ÄôT CARE! [emoji_grinning_face_with_smiling_eyes] Doctor Beer welcomes EVERYONE in its delirious tastings! [emoji_party_popper] [emoji_woman_shrugging] YOU DON‚ÄôT KNOW ANYTHING ABOUT BEER? Perfect! We start with 'that‚Äôs good, that‚Äôs not so good' and presto, magic! [emoji_sparkles] [emoji_nerd_face] ARE YOU A HOPPY GEEK? Cool! Come challenge your taste buds and learn some crazy things! [emoji_exploding_head] [emoji_circus_tent] ATMOSPHERE GUARANTEED: Funny explanations (no hassle!) Incredible beers for all tastes Epic moments between enthusiasts and curious Science fun without complicated blah blah [emoji_backhand_index_pointing_right] WANT TO LAUGH WHILE TASTING? The next session is waiting for you! Come as you are! [emoji_raising_hands] www.doctorbeer.be #beertime #beerlovers #craftbeerlovers #beer #beertastings #doctorbeer #belgianbeer #beertasting #craftbeer #beers","beginner or expert? we dont care! doctor beer welcomes everyone in its delirious tastings! you dont know anything about beer? perfect! we start with thats good, thats not so good and presto, magic! are you a hoppy geek? cool! come challenge your taste buds and learn some crazy things! atmosphere guaranteed funny explanations no hassle! incredible beers for all tastes epic moments between enthusiasts and curious science fun without complicated blah blah want to laugh while tasting? the next session is waiting for you! come as you are! 
www.doctorbeer.be beertime beerlovers craftbeerlovers beer beertastings doctorbeer belgianbeer beertasting craftbeer beers",True,"[[emoji_beer_mug], [emoji_grinning_face_with_smiling_eyes], [emoji_party_popper], [emoji_woman_shrugging], [emoji_sparkles], [emoji_nerd_face], [emoji_exploding_head], [emoji_circus_tent], [emoji_backhand_index_pointing_right], [emoji_raising_hands]]","[#beertime, #beerlovers, #craftbeerlovers, #beer, #beertastings, #doctorbeer, #belgianbeer, #beertasting, #craftbeer, #beers]",True
89,Corona,Twitter,Social Media,2025-02-10,"@dabagcilarr @h0neyyr0se @EfeKaygisiz35 @demarkesports One of them held on, that was Sergen Yal√ßin, if it weren't for Corona, you would have taken the championship with a 1-point difference, and you took it because Galatasaray stumbled.","dabagcilarr h0neyyr0se efekaygisiz35 demarkesports one of them held on, that was sergen yalin, if it werent for corona, you would have taken the championship with a 1-point difference, and you took it because galatasaray stumbled.",True,No Emojis,No Hashtags,True
124,Corona,Twitter,Social Media,2025-02-01,"TR: Bigbrother watches you! This means that this mRNA Corona vaccine (+5G-7G) keeps you monitored 24/7 with your car. The issue here is to make you subconsciously ready for use without knowing its price or quality. Perhaps Turkey is the easiest target for globalists. Because they left nothing called opposition. They play around like elementary school children. They quietly dealt with potentially strong competitors under the guise of FETO, cleaning up within the TSK under this or that guise like an Asian union. In this case, we need people like DELI PETRO. The committee urgently needs the script of the events mentioned in the three-volume book these days. Today or tomorrow, maybe even sooner than tomorrow. Stay safe.","tr bigbrother watches you! this means that this mrna corona vaccine 5g-7g keeps you monitored 24/7 with your car. the issue here is to make you subconsciously ready for use without knowing its price or quality. perhaps turkey is the easiest target for globalists. because they left nothing called opposition. they play around like elementary school children. they quietly dealt with potentially strong competitors under the guise of feto, cleaning up within the tsk under this or that guise like an asian union. in this case, we need people like deli petro. the committee urgently needs the script of the events mentioned in the three-volume book these days. today or tomorrow, maybe even sooner than tomorrow. stay safe.",True,No Emojis,No Hashtags,True
163,Corona,Facebook,Social Media,2025-05-20,"[emoji_fire] ùóôùóúùó°ùóîùóüùóò ùóòùó®ùó•ùó¢ùó£ùóî ùóüùóòùóîùóöùó®ùóò [emoji_soccer_ball]Ô∏è \n\n[emoji_fire] ùóñùóò ùó†ùóòùó•ùóñùó•ùóòùóóùóú ùüÆùü≠/ùü¨ùü± , ùüÆùü≠ùóõùü¨ùü¨\n\n[emoji_right_arrow] Big screen, atmosphere, live commentary, and special promotions during the match!\n\n[emoji_right_arrow] ùó£ùó•ùó¢ùó†ùó¢ ùóóùó®ùó•ùóîùó°ùóß ùóüùóò ùó†ùóîùóßùóñùóõ [emoji_partying_face]\n\n[emoji_backhand_index_pointing_right] 3 DESP√â purchased = 1 free\n[emoji_backhand_index_pointing_right] 3 CORONA purchased = 1 free *\n[emoji_backhand_index_pointing_right] 1 METER of beer = 25‚Ç¨ *\n* also available alcohol-free\n\n#Bowling #bowlingthemis[emoji_bowling] #promo #europaleague #EuropaLeague #foot","/ , big screen, atmosphere, live commentary, and special promotions during the match! 3 desp purchased 1 free 3 corona purchased 1 free 1 meter of beer 25‚Ç¨ also available alcohol-free bowling bowlingthemis promo europaleague europaleague foot",True,"[[emoji_fire], [emoji_soccer_ball], [emoji_fire], [emoji_right_arrow], [emoji_right_arrow], [emoji_partying_face], [emoji_backhand_index_pointing_right], [emoji_backhand_index_pointing_right], [emoji_backhand_index_pointing_right], [emoji_bowling]]","[#Bowling, #bowlingthemis, #promo, #europaleague, #EuropaLeague, #foot]",True


## Stopwords removal

In [6]:
import nltk
from nltk.corpus import stopwords
import re

# Fetch the NLTK stopword corpus; the logged output shows this is a no-op when
# the package is already up to date locally.
nltk.download('stopwords')

# Words that must survive stopword removal (subtracted from the NLTK list).
_KEEP_WORDS = frozenset({
    "not", "no", "never",          # negation is important
    "very", "too", "more",         # intensity important in sentiment
    "good", "bad", "great", "amazing", "terrible",  # sentiment words
    "beer", "ipa", "lager", "stout", "ale", "pils", "corona"  # domain words
})

# Domain-specific words removed in addition to the NLTK list.
_DOMAIN_STOPWORDS = frozenset({
    "drink", "drinking", "drank", "alcohol",
    "brew", "brewing", "brewed",
    "bottle", "can", "canned", "glass", "pint",
    "episode", "review", "reviews",
    "beerlovers", "beerlover"
})

_TOKEN_RE = re.compile(r"\b\w+\b")

# Filled on first use so the NLTK corpus is read once, not once per review.
_stopword_cache = {}


def _final_stopwords():
    """Return the effective stopword set, building it only on the first call."""
    if "english" not in _stopword_cache:
        base_sw = set(stopwords.words("english"))
        _stopword_cache["english"] = (base_sw - _KEEP_WORDS) | _DOMAIN_STOPWORDS
    return _stopword_cache["english"]


def clean_stopwords(text):
    """Lowercase ``text``, split it into ``\\w+`` tokens and drop stopwords.

    The stopword set is NLTK's English list minus the negation / intensity /
    sentiment / domain words in ``_KEEP_WORDS``, plus ``_DOMAIN_STOPWORDS``.

    Parameters
    ----------
    text : str
        Review text. Non-string input (e.g. NaN) yields ``""`` instead of
        raising on ``.lower()``.

    Returns
    -------
    str
        Remaining tokens joined by single spaces (punctuation is not kept,
        since only ``\\w+`` runs are tokens).
    """
    if not isinstance(text, str):
        return ""

    final_stopwords = _final_stopwords()

    # Tokenize, then remove stopwords
    return " ".join(
        w for w in _TOKEN_RE.findall(text.lower()) if w not in final_stopwords
    )

# Overwrites `review_cleaned` with its stopword-free version; the earlier
# cleaned text (with punctuation) is no longer available after this cell.
corona_review["review_cleaned"] = corona_review["review_cleaned"].apply(clean_stopwords)

# Spot-check a few random rows after stopword removal.
corona_review.sample(5)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhij\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,brand,review_platform,review_type,date,review,review_cleaned,is_english,emojis,hashtags,is_beer_related
108,Corona,Facebook,Social Media,2025-01-13,"Dear fashionistas, Are you already keeping SATURDAY, MARCH 8th free in your agenda? With our 10th anniversary, we had to close the day before due to Corona. This was unreal! But now, with our 15th anniversary year, we are going to celebrate double and pamper you extra. We are organizing a VIP afternoon & a VIP evening so that you can all be present. Love, Katleen [emoji_pink_heart] [emoji_pink_heart]SAVE THE DATE [emoji_pink_heart]",dear fashionistas already keeping saturday march 8th free agenda 10th anniversary close day due corona unreal 15th anniversary year going celebrate double pamper extra organizing vip afternoon vip evening present love katleen save date,True,"[[emoji_pink_heart], [emoji_pink_heart], [emoji_pink_heart]]",No Hashtags,True
246,Corona,Twitter,Social Media,2025-09-22,Check out our new limited edition beer! Perfect for the summer vibes. [emoji_clinking_beer_mugs] #SummerBrew #LimitedEdition @BeerLovers,check new limited edition beer perfect summer vibes summerbrew limitededition,True,[[emoji_clinking_beer_mugs]],"[#SummerBrew, #LimitedEdition]",True
87,Corona,Twitter,Social Media,2025-01-01,"@TimeConfessor @pusholder It was taken during the Corona period, otherwise it's impossible.",timeconfessor pusholder taken corona period otherwise impossible,True,No Emojis,No Hashtags,True
20,Corona,Twitter,Social Media,2025-01-07,"Dry January, day 6. Corona Cero is far superior to Heineken 0.0. The folks at @Heineken_col say it resembles the original a lot, but they couldn't be more wrong. So far, the closest to the original is Stella 0.0.",dry january day 6 corona cero far superior heineken 0 0 folks heinekencol say resembles original lot couldnt more wrong far closest original stella 0 0,True,No Emojis,No Hashtags,True
154,Corona,Facebook,Social Media,2025-07-26,[emoji_clinking_beer_mugs] TODAY; Corona and Corona Cero (0.0) NOW; 1+1 FREE [emoji_red_exclamation_mark]Ô∏è[emoji_star-struck],today corona corona cero 0 0 11 free,True,"[[emoji_clinking_beer_mugs], [emoji_red_exclamation_mark], [emoji_star-struck]]",No Hashtags,True
