# Setting up environment

In [None]:
! pip install snorkel

Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.10.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.10.0


In [None]:
pip install camel-tools

# Loading Dataset "after preprocessing the data"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

filePath = '/content/drive/MyDrive/Small Dataset_Cleaned.csv'

df_train = pd.read_csv(filePath)
df_train["label"] = df_train["label"].map({"Positive": 1, "Negative": 0, "Mixed": -1})
df_test = df_train



In [None]:
Y_test = df_test.label.values

In [None]:
Y_test

array([1, 1, 1, ..., 0, 0, 0])

In [None]:
df_train.head(5)

Unnamed: 0,label,text
0,Positive,ممتاز نوع نظافه والموقع والتجهيز والشاطيء مطعم
1,Positive,احد اسباب نجاح امار ان شخص دوله يعشق تراب نحب ...
2,Positive,هادف وقو تنقل صخب شوارع قاهره ال هدوء جبال شيش...
3,Positive,خلص مبديي الل مست ابهار زي فيل ازرق ميقراش احس...
4,Positive,ياسا جلوري جزء يتجز دب ندق متكامل خدم مريح نفس...


In [None]:
df_train['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Positive,1000
Mixed,1000
Negative,1000


In [None]:
df_train.tail(5)

Unnamed: 0,label,text
2995,Negative,ضعيف جدا لاش شي يعجب خاص ممر والغراف
2996,Negative,ضعيف موقع مناسب لشرق جنوب رياضالمطعم نوع جيد ع...
2997,Negative,فكره طبع مش جديد مكنتش عامل حساب انه اخر هتكون...
2998,Negative,ضعيف ال وا فا مواقف سيار
2999,Negative,كتاب اول يوح قارء انه مليء اثاره والتشويق براه...


# Weak Labelling

# Define Labels

In [None]:
# For clarity, we define constants for sentiment labels: Negative = 0, Positive = 1
MIXED = -1
NEGATIVE = 0
POSITIVE = 1

# Define labeling functions

In [None]:
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.analysis import get_label_buckets

In [None]:
from camel_tools.tokenizers.word import simple_word_tokenize
@labeling_function()
def keyword_sentiment(x):
    negative_keywords = ['سيء', 'غير جيد', 'مزعج']
    positive_keywords = ['جيد', 'رائع', 'ممتاز']

    if any(word in simple_word_tokenize(x.text) for word in negative_keywords):
        return NEGATIVE
    elif any(word in simple_word_tokenize(x.text) for word in positive_keywords):
        return POSITIVE
    return MIXED
keyword_sentiment.name = "keyword_sentiment"

In [None]:
# Writing an LF to gauge sentiment - that uses a third-party model
from snorkel.preprocess import preprocessor
from textblob import TextBlob
@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    x.subjectivity = scores.sentiment.subjectivity
    return x

In [None]:
@labeling_function(pre=[textblob_sentiment]) # The pre=[textblob_sentiment] argument indicates that the textblob_sentiment preprocessor should be applied before textblob_polarity is executed.
def textblob_polarity(x):
    return POSITIVE if x.polarity > 0.9 else MIXED

In [None]:
@labeling_function(pre=[textblob_sentiment])
def textblob_subjectivity(x):
    return POSITIVE if x.subjectivity >= 0.5 else MIXED

In [None]:
# Writing an LF using a third-party Arabic sentiment model
from camel_tools.sentiment import SentimentAnalyzer
analyzer = SentimentAnalyzer("CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment")

In [None]:
from snorkel.labeling import LabelingFunction

def keyword_lookup(x, keywords, label):
    if any(word in x.text.lower() for word in keywords):
        return label
    return MIXED

def make_keyword_lf(keywords, label=NEGATIVE):  # Default label is now NEGATIVE
    return LabelingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords, label=label),
    )



"""Spam comments expressing negative sentiment like disappointment or frustration."""
keyword_negative = make_keyword_lf(keywords=["سيء", "مزعج", "فاشل", "ممل", "كريه", "أكره"])

"""Negative comments expressing frustration or dislike."""
keyword_frustration = make_keyword_lf(keywords=["ضايق", "غضب", "قرف", "خائب", "غبي"])

"""Comments expressing dissatisfaction with a product or video."""
keyword_dissatisfaction = make_keyword_lf(keywords=["لن أشتري", "لا أنصح", "لم يعجبني", "سيئ", "غير جيد"])

"""Comments expressing regret or negative emotions."""
keyword_regret = make_keyword_lf(keywords=["ندم", "خسارة", "محرج", "أشعر بالحزن"])

"""Negative reviews or criticisms of content."""
keyword_criticize = make_keyword_lf(keywords=["أنتقد", "مشكلة", "خطأ", "سيء", "غير جيد"])

# Generate labels by Applying LFs

In [None]:
lfs = [
    keyword_sentiment,
    textblob_polarity,
    textblob_subjectivity,
    keyword_negative,
    keyword_frustration,
    keyword_dissatisfaction,
    keyword_regret,
    keyword_criticize
]

In [None]:
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df_train)
L_test = applier.apply(df=df_test)

100%|██████████| 3000/3000 [05:44<00:00,  8.71it/s]
100%|██████████| 3000/3000 [05:36<00:00,  8.92it/s]


In [None]:
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_sentiment,0,"[0, 1]",0.255333,0.098333,0.012667
textblob_polarity,1,[],0.0,0.0,0.0
textblob_subjectivity,2,[],0.0,0.0,0.0
keyword_سيء,3,[0],0.122333,0.094,0.005
keyword_ضايق,4,[0],0.016333,0.005667,0.001
keyword_لن أشتري,5,[0],0.000667,0.000667,0.000333
keyword_ندم,6,[0],0.05,0.017667,0.007667
keyword_أنتقد,7,[0],0.077333,0.077333,0.000333


In [None]:
from snorkel.labeling.model import MajorityLabelVoter

majority_model = MajorityLabelVoter()
preds_train = majority_model.predict(L=L_train)

preds_train

array([ 1, -1, -1, ...,  1, -1, -1])

In [None]:
from snorkel.labeling.model import LabelModel

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=123)

probs_train = label_model.predict_proba(L=L_train)
probs_train

100%|██████████| 500/500 [00:01<00:00, 264.72epoch/s]


array([[0.33767965, 0.66232035],
       [0.5       , 0.5       ],
       [0.5       , 0.5       ],
       ...,
       [0.33767965, 0.66232035],
       [0.5       , 0.5       ],
       [0.5       , 0.5       ]])

In [None]:

# Now calculate Majority Vote accuracy
majority_acc = majority_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

# Calculate Label Model accuracy
label_model_acc = label_model.score(L=L_test, Y=Y_test, tie_break_policy="random")["accuracy"]
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Majority Vote Accuracy:   60.3%
Label Model Accuracy:     60.6%


In [None]:
from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, probs_train_filtered = filter_unlabeled_dataframe(
    X=df_train, y=probs_train, L=L_train
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 5))
X_train = vectorizer.fit_transform(df_train_filtered.text.tolist())
X_test = vectorizer.transform(df_test.text.tolist())

In [None]:
from snorkel.utils import probs_to_preds
preds_train_filtered = probs_to_preds(probs=probs_train_filtered)

In [None]:
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=1e3, solver="liblinear")
sklearn_model.fit(X=X_train, y=preds_train_filtered)

In [None]:
print(f"Test Accuracy: {sklearn_model.score(X=X_test, y=Y_test) * 100:.1f}%")

Test Accuracy: 42.7%


In [None]:
import pandas as pd

In [None]:
# Example Arabic reviews for prediction
new_review = ['قناتي رائعة! هذه أفضل قناة على الإطلاق', 'الفيديو كان سيئاً جداً']

# Create DataFrame from the new reviews
df = pd.DataFrame(new_review, columns=['review'])

# Vectorize the new reviews using the same vectorizer used for training
df_vectorized = vectorizer.transform(df['review'])


results = sklearn_model.predict(df_vectorized)

for i, item in enumerate(results):
    if item == 0:
        print(f'Review#{i+1} is negative')  # 0 for negative sentiment
    else:
        print(f'Review#{i+1} is positive')  # 1 for positive sentiment

Review#1 is negative
Review#2 is negative
