<a href="https://colab.research.google.com/github/Artem1s1337/nlp_amazon_reviews/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
import re
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from typing import List, Dict

class SimpleSentimentAnalyzer:
    """TF-IDF + logistic-regression classifier for fastText-formatted text.

    Intended flow: ``parse_fasttext_data`` -> ``prepare_data`` -> ``train``
    -> ``evaluate`` / ``predict``.
    """

    # One record per line: "__label__<digits>", one whitespace char, the text.
    # Compiled once here so the per-line loop does not repeat the lookup.
    _LINE_PATTERN = re.compile(r'__label__(\d+)\s(.+)')

    def __init__(self):
        # Fitted sklearn Pipeline; stays None until train() has been called.
        self.model = None

    def _require_model(self):
        """Return the fitted pipeline, or raise if train() has not run yet."""
        if self.model is None:
            raise RuntimeError("Model is not trained; call train() first")
        return self.model

    def parse_fasttext_data(self, file_path: str) -> pd.DataFrame:
        """Read a ``__label__N text`` file into a DataFrame.

        Lines that do not match the expected format are reported with
        ``print`` (including their 1-based line number) and skipped.

        Args:
            file_path: Path to a UTF-8 text file with one example per line.

        Returns:
            DataFrame with a ``text`` column (str) and a ``label`` column (int).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file contains no valid examples.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"Файл {file_path} не найден")

        texts, labels = [], []

        with open(file_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                match = self._LINE_PATTERN.match(line.strip())
                if match is None:
                    print(f"⚠️  Неверный формат в строке {line_num}")
                    continue
                text = match.group(2).strip()
                if text:
                    texts.append(text)
                    labels.append(int(match.group(1)))

        if not texts:
            raise ValueError("Нет валидных данных")

        print(f"Загружено {len(texts)} примеров")
        return pd.DataFrame({'text': texts, 'label': labels})

    def preprocess_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces, trim, and lowercase."""
        text = re.sub(r'\s+', ' ', text)
        return text.strip().lower()

    def prepare_data(self, df: pd.DataFrame):
        """Normalize the texts and split them into train and test parts.

        The caller's DataFrame is left untouched: the cleaned texts live in a
        new Series instead of overwriting ``df["text"]``.

        Args:
            df: DataFrame with ``text`` and ``label`` columns.

        Returns:
            ``X_train, X_test, y_train, y_test`` as returned by
            ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
        """
        cleaned = df["text"].apply(self.preprocess_text)
        return train_test_split(cleaned, df["label"], test_size=0.2, random_state=42)

    def train(self, X_train, y_train):
        """Build and fit the TF-IDF + logistic-regression pipeline.

        Args:
            X_train: Training texts.
            y_train: Labels aligned with ``X_train``.
        """
        self.model = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 2))),
            ('clf', LogisticRegression(max_iter=1000))
        ])
        print("Обучение модели...")
        self.model.fit(X_train, y_train)
        print("Обучение завершено.")

    def evaluate(self, X_test, y_test):
        """Print a classification report for the held-out data.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        model = self._require_model()
        print("Оценка модели:")
        preds = model.predict(X_test)
        print(classification_report(y_test, preds))

    def predict(self, texts: List[str]) -> List[Dict]:
        """Classify raw texts and report the top class probability.

        Args:
            texts: Raw input strings; they are preprocessed the same way as
                the training data before being passed to the model.

        Returns:
            One dict per input with keys ``text`` (the original string),
            ``predicted_label`` and ``confidence`` (highest class
            probability). An empty input yields an empty list.

        Raises:
            RuntimeError: If the model has not been trained.
        """
        model = self._require_model()

        # Materialize once: the inputs are iterated twice below, and this
        # also makes the emptiness check safe for non-list iterables.
        texts = list(texts)
        if not texts:
            return []

        processed = [self.preprocess_text(t) for t in texts]
        predictions = model.predict(processed)
        probs = model.predict_proba(processed)

        return [
            {
                "text": text,
                # Unwrap NumPy scalars to plain Python values so the result
                # can be serialized (e.g. with json) without extra handling.
                "predicted_label": label.item() if hasattr(label, "item") else label,
                "confidence": float(max(prob)),
            }
            for text, label, prob in zip(texts, predictions, probs)
        ]

def main(data_path: str = "/content/train.ft.txt"):
    """Train on a fastText-formatted file, then print metrics and samples.

    Loads the data, splits it, fits the analyzer, prints the evaluation
    report, and finally prints predictions for a few hand-written texts.

    Args:
        data_path: Path to the labelled training file. Defaults to the
            location that was previously hard-coded, so ``main()`` with no
            arguments behaves as before.
    """
    analyzer = SimpleSentimentAnalyzer()
    df = analyzer.parse_fasttext_data(data_path)

    X_train, X_test, y_train, y_test = analyzer.prepare_data(df)

    analyzer.train(X_train, y_train)
    analyzer.evaluate(X_test, y_test)

    # Hand-written smoke-test inputs: negative, positive, and lukewarm.
    texts = [
        "Very bad product",
        "Absolutely wonderful! I’ll order again",
        "It's okay, but I expected more",
    ]

    preds = analyzer.predict(texts)
    print("\n🔮 Предсказания:")
    for p in preds:
        print(f"Текст: {p['text']}")
        print(f"Оценка: {p['predicted_label']} (уверенность {p['confidence']:.2f})")
        print("-" * 30)

# Run the demo pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Загружено 3040000 примеров
🔧 Обучение модели...
✅ Обучение завершено.
📊 Оценка модели:
              precision    recall  f1-score   support

           1       0.67      0.80      0.73     51181
           2       0.47      0.21      0.29     27267
           3       0.49      0.42      0.45     43163
           4       0.61      0.35      0.44     79027
           5       0.87      0.96      0.91    407362

    accuracy                           0.80    608000
   macro avg       0.62      0.55      0.56    608000
weighted avg       0.77      0.80      0.78    608000


🔮 Предсказания:
Текст: Very bad product
Оценка: 1 (уверенность 0.93)
------------------------------
Текст: Absolutely wonderful! I’ll order again
Оценка: 5 (уверенность 0.99)
------------------------------
Текст: It's okay, but I expected more
Оценка: 3 (уверенность 0.83)