<a href="https://colab.research.google.com/github/Aya11ali/Shouf/blob/main/Comment_Analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Comment Analysis

In [1]:
pip install -q transformers torch langdetect

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m107.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.7 MB/s[0m eta [

In [2]:
from transformers import pipeline
from transformers import MarianMTModel, MarianTokenizer
from langdetect import detect, LangDetectException
import re
from abc import ABC, abstractmethod

In [3]:
class IModelLoader(ABC):
  @abstractmethod
  def load_model(self):
    pass

class English_model(IModelLoader):
  def __init__(self, toxicity_model="unitary/toxic-bert"):
    self.toxicity_model = toxicity_model
    self.sentiment_analyzer = None
    self.toxic_analyzer = None

  def load_model(self):
      # English models
      self.sentiment_analyzer = pipeline("sentiment-analysis")
      self.toxic_analyzer = pipeline("text-classification", model=self.toxicity_model)

class Arabic_model(IModelLoader):
  def __init__(self,sentiment_model="CAMeL-Lab/bert-base-arabic-camelbert-da-sentiment",
               toxic_model="Hate-speech-CNERG/dehatebert-mono-arabic"):
    self.sentiment_model = sentiment_model
    self.toxic_model = toxic_model
    self.sentiment_analyzer = None
    self.toxic_analyzer = None

  def load_model(self):
      # Arabic models
      self.sentiment_analyzer = pipeline("text-classification", model=self.sentiment_model)
      self.toxic_analyzer = pipeline("text-classification", model=self.toxic_model)


In [4]:
class ModelConfig:
    def __init__(self,toxicity_threshold=0.7):
      self.toxicity_threshold = toxicity_threshold

In [29]:
# Interface
class ITranslator(ABC):
    @abstractmethod
    def load(self):
        pass

    @abstractmethod
    def translate(self, text: str) -> str:
        pass

class Translator(ITranslator):
    def __init__(self,model_name='Helsinki-NLP/opus-mt-ar-en'):
        self.model_name = model_name
        self.model = None
        self.tokenizer = None

    def load(self):
        self.tokenizer = MarianTokenizer.from_pretrained(self.model_name)
        self.model = MarianMTModel.from_pretrained(self.model_name)

    def translate(self, text: str) -> str:
        inputs = self.tokenizer(text, return_tensors="pt", padding=True)
        translated = self.model.generate(**inputs)
        return self.tokenizer.decode(translated[0], skip_special_tokens=True)

In [30]:
class Iclean(ABC):
    @abstractmethod
    def clean_text(self, text: str) -> str:
        pass

class Clean(Iclean):
    def clean_text(self, text: str) -> str:
        text = text.lower()
        text = re.sub(r"http\S+", "", text)  # remove URLs
        text = re.sub(r"@\w+", "", text)     # remove mentions
        text = re.sub(r"#[A-Za-z0-9_]+", "", text)  # remove hashtags
        text = re.sub(r"[^a-zA-Zأ-ي\s]", "", text)  # remove symbols except Arabic
        return text.strip()

In [31]:
class ILanguageDetector(ABC):
  @abstractmethod
  def detect_language(self, text: str) -> str:
    pass

class LanguageDetector(ILanguageDetector):
  def detect_language(self, text: str) -> str:
    try:
      return detect(text)
    except LangDetectException:
      return None

In [32]:
class IAnalyzer(ABC):
    @abstractmethod
    def analyze(self, text: str) -> dict:
        pass

class CommentAnalyzer(IAnalyzer):
    def __init__(self, model_loader: IModelLoader, translator: ITranslator = None,config:ModelConfig = None):
        self.model_loader = model_loader
        self.translator = translator
        self.config=config
        self.model_loader.load_model()

    def analyze(self, text: str) -> dict:
        # self.model_loader.load_model()
        temp_text = text
        if self.translator:
            self.translator.load()
            temp_text = self.translator.translate(temp_text)
        sentiment = self.model_loader.sentiment_analyzer(temp_text)[0]
        toxicity = self.model_loader.toxic_analyzer(temp_text)[0]

        toxicity_score = round(toxicity["score"], 3)
        toxicity_label = "TOXIC" if toxicity_score > self.config.toxicity_threshold else "NON_TOXIC"

        return {
            "comment" : text,
            "sentiment": sentiment["label"],
            "toxicity_score": toxicity_score,
            "toxicity_label": toxicity_label
        }

In [33]:
class IBatch(ABC):
  @abstractmethod
  def comments_batch(self,comments:list)->list:
    pass

class Batch(IBatch):
  def __init__(self, analyzer: IAnalyzer):
    self.analyzer = analyzer

  def comments_batch(self,comments:list)->list:
    return [self.analyzer.analyze(comment) for comment in comments]


In [34]:
class CommentProcessingPipeline:
  def __init__ (self, config=ModelConfig(toxicity_threshold=0.7)):
    self.config = config
    self.language_detector = LanguageDetector()
    self.cleaner = Clean()

    self.english_model = English_model()
    self.arabic_model = Arabic_model()
    self.translator = Translator()

    self.english_analyzer = CommentAnalyzer(self.english_model,None,self.config)
    self.arabic_analyzer = CommentAnalyzer(self.english_model, self.translator,self.config)
    # self.batch = Batch()

  # def load(self):
  #   self.translator.load()

  def process_comment(self, comment: str) -> dict:
    cleaned = self.cleaner.clean_text(comment)
    lang = self.language_detector.detect_language(cleaned)

    if lang == "ar":
        analyzer = self.arabic_analyzer
    elif lang == "en":
        analyzer = self.english_analyzer
    else:
        return {"error": "Unsupported or undetectable language"}

    return analyzer.analyze(cleaned)

  def process_comments(self, comments: list) -> list:
      return [self.process_comment(comment) for comment in comments]



# Final Functions


In [37]:
from transformers import logging
logging.set_verbosity_error()  # Only errors will be shown


In [38]:
if __name__ == "__main__":
    comment_pipeline = CommentProcessingPipeline()

    comments = [
        "This is terrible!",
        "هذا تعليق سام",
        "I love this product!",
        "منتج فظيع",
        "أنت غبي وما تفهم أي شيء، مكانك في الزبالة"
    ]

    results = comment_pipeline.process_comments(comments)
    for res in results:
        print(res)


{'comment': 'this is terrible', 'sentiment': 'NEGATIVE', 'toxicity_score': 0.062, 'toxicity_label': 'NON_TOXIC'}
{'comment': 'هذا تعليق سام', 'sentiment': 'POSITIVE', 'toxicity_score': 0.001, 'toxicity_label': 'NON_TOXIC'}
{'comment': 'i love this product', 'sentiment': 'POSITIVE', 'toxicity_score': 0.001, 'toxicity_label': 'NON_TOXIC'}
{'comment': 'منتج فظيع', 'sentiment': 'NEGATIVE', 'toxicity_score': 0.021, 'toxicity_label': 'NON_TOXIC'}
{'comment': 'أنت غبي وما تفهم أي شي مكانك في الزبالة', 'sentiment': 'NEGATIVE', 'toxicity_score': 0.987, 'toxicity_label': 'TOXIC'}
