In [10]:
!pip install scikit-learn
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np



In [38]:
df = pd.read_csv('Reviews.csv')

In [39]:
df['Score'].value_counts()

Score
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64

In [40]:
print(df['Text'].isnull().sum(),
        df['Score'].isnull().sum())

0 0


In [41]:
df[['Score','Text']].dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[['Score','Text']].dropna(inplace=True)


In [42]:
def convert_score_to_sentiment(score):
    if score <= 2:
        return 0 # negatywny
    elif score == 3:
        return 1 # neutralny
    else:
        return 2 # pozytywny

In [43]:
df['sentiment'] = df['Score'].apply(convert_score_to_sentiment)

In [44]:
import re
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text

In [45]:
df['Text'] = df['Text'].apply(clean_text)

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english',max_features=1000)
X = vectorizer.fit_transform(df['Text']).toarray()
y = df['sentiment']

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(y_train.shape, y_test.shape, X_test.shape)

(454763,) (113691,) (113691, 1000)


In [77]:

# Konwersja danych na tensory Pytorch
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long) 


In [78]:
class SentimentClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(SentimentClassifier, self).__init__()
        # Warstwa wejściowa -> Warstwa ukryta
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        # Warstwa wyjściowa (3 klasy)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.softmax(x)

In [79]:
# Inicjalizacja modelu
input_dim = X_train.shape[1]  # Liczba cech (wymiar wektora TF-IDF)
hidden_dim = 128  # Liczba jednostek w warstwie ukrytej
output_dim = 3    # Liczba klas (pozytywny, neutralny, negatywny)

model = SentimentClassifier(input_dim, hidden_dim, output_dim)

# Ustalamy funkcję straty i optymalizator
criterion = nn.CrossEntropyLoss()  # Funkcja straty dla klasyfikacji wieloklasowej
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [86]:
def train_model(model, X_train, y_train, criterion, optimizer, epochs=25):
    model.train()  # Ustawiamy model w tryb treningowy
    for epoch in range(epochs):
        optimizer.zero_grad()  # Zerujemy gradienty
        output = model(X_train)  # Przekazujemy dane przez model
        loss = criterion(output, y_train)  # Obliczamy stratę
        loss.backward()  # Obliczamy gradienty
        optimizer.step()  # Wykonujemy krok optymalizacji
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

# Trenujemy model
train_model(model, X_train_tensor, y_train_tensor, criterion, optimizer, epochs=150)

Epoch [1/150], Loss: 0.7287
Epoch [2/150], Loss: 0.7283
Epoch [3/150], Loss: 0.7279
Epoch [4/150], Loss: 0.7275
Epoch [5/150], Loss: 0.7271
Epoch [6/150], Loss: 0.7268
Epoch [7/150], Loss: 0.7264
Epoch [8/150], Loss: 0.7261
Epoch [9/150], Loss: 0.7258
Epoch [10/150], Loss: 0.7254
Epoch [11/150], Loss: 0.7251
Epoch [12/150], Loss: 0.7248
Epoch [13/150], Loss: 0.7245
Epoch [14/150], Loss: 0.7242
Epoch [15/150], Loss: 0.7239
Epoch [16/150], Loss: 0.7237
Epoch [17/150], Loss: 0.7234
Epoch [18/150], Loss: 0.7231
Epoch [19/150], Loss: 0.7229
Epoch [20/150], Loss: 0.7226
Epoch [21/150], Loss: 0.7224
Epoch [22/150], Loss: 0.7221
Epoch [23/150], Loss: 0.7219
Epoch [24/150], Loss: 0.7217
Epoch [25/150], Loss: 0.7214
Epoch [26/150], Loss: 0.7212
Epoch [27/150], Loss: 0.7210
Epoch [28/150], Loss: 0.7208
Epoch [29/150], Loss: 0.7206
Epoch [30/150], Loss: 0.7204
Epoch [31/150], Loss: 0.7202
Epoch [32/150], Loss: 0.7200
Epoch [33/150], Loss: 0.7198
Epoch [34/150], Loss: 0.7196
Epoch [35/150], Loss: 0

In [87]:
def evaluate_model(model, X_test, y_test):
    model.eval()  # Ustawiamy model w tryb ewaluacji
    with torch.no_grad():  # Nie obliczamy gradientów podczas ewaluacji
        output = model(X_test)
        _, predicted = torch.max(output, 1)  # Wybieramy klasę o najwyższym prawdopodobieństwie
        accuracy = accuracy_score(y_test, predicted)
        print(f"Accuracy on test set: {accuracy * 100:.2f}%")

# Ewaluacja modelu
evaluate_model(model, X_test_tensor, y_test_tensor)

Accuracy on test set: 84.16%


In [88]:
torch.save(model.state_dict(), 'sentiment_classifier.pth')

In [96]:
def predict_sentiment(text, model, vectorizer):
    # Wektoryzacja tekstu
    text_vector = vectorizer.transform([text]).toarray()  # Konwertujemy na wektor TF-IDF
    
    # Konwersja na tensor Pytorch
    text_tensor = torch.tensor(text_vector, dtype=torch.float32)
    
    # Przewidywanie za pomocą modelu
    model.eval()  # Ustawiamy model w tryb ewaluacji
    with torch.no_grad():
        output = model(text_tensor)
        _, predicted = torch.max(output, 1)  # Wybieramy klasę o najwyższym prawdopodobieństwie
    
    # Mapowanie numeru klasy na nazwę sentymentu
    sentiment_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
    predicted_sentiment = sentiment_map[predicted.item()]  # Pobieramy wartość z tensora
    
    return predicted_sentiment

# Przykładowe dane do przetestowania
text = "This product sucks"

# Testowanie funkcji na przykładowym tekście
predicted_sentiment = predict_sentiment(text, model, vectorizer)
print(f'The sentiment of the text is: {predicted_sentiment}')

The sentiment of the text is: Negative
