In [22]:
import pandas as pd
import numpy as np
import re
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Yelp Reviews Dataset užkrovimas
df = pd.read_csv('yelp.csv')

print(df.head())

              business_id        date               review_id  stars  \
0  9yKzy9PApeiPPOUJEtnvkg  2011-01-26  fWKvX83p0-ka4JS3dc6E5A      5   
1  ZRJwVLyzEJq1VAihDhYiow  2011-07-27  IjZ33sJrzXqU-0X6U8NwyA      5   
2  6oRAC4uyJCsJl1X0WZpVSA  2012-06-14  IESLBzqUCLdSzSqm0eCSxQ      4   
3  _1QQZuf4zZOyFCvXc0o6Vg  2010-05-27  G-WvGaISbqqaMHlNnByodA      5   
4  6ozycU1RpktNG2-1BroVtw  2012-01-05  1uJFq2r5QfJG_6ExMRCaGw      5   

                                                text    type  \
0  My wife took me here on my birthday for breakf...  review   
1  I have no idea why some people give bad review...  review   
2  love the gyro plate. Rice is so good and I als...  review   
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review   
4  General Manager Scott Petello is a good egg!!!...  review   

                  user_id  cool  useful  funny  
0  rLtl8ZkDX5vH5nAx9C3q5Q     2       5      0  
1  0a2KyEL0d3Yb1V6aivbIuQ     0       0      0  
2  0hT2KtfLiobPvh6cDC8JQg     0    

In [23]:
# Nereikalingu duomenu pasalinimas
df = df[['review_id', 'text', 'stars']]

# Teksto paruošimo funkcija
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.strip()
    return text

df['cleaned_text'] = df['text'].apply(clean_text)
df.dropna(inplace=True)

# Zvaigzduciu konvertavimas i sentimenta kur 1 (pozityvus atsiliepimas) jeigu 4 arba 5 zvaigzdes ir 0 jeigu maziau
# Reikalinga modeliu mokymui
df['sentiment'] = df['stars'].apply(lambda x: 1 if x >= 4 else 0 if x <= 2 else np.nan)
df.dropna(subset=['sentiment'], inplace=True)
y = df['sentiment']
df = df[['review_id', 'cleaned_text', 'sentiment']]

print(df.head())

                review_id                                       cleaned_text  \
0  fWKvX83p0-ka4JS3dc6E5A  my wife took me here on my birthday for breakf...   
1  IjZ33sJrzXqU-0X6U8NwyA  i have no idea why some people give bad review...   
2  IESLBzqUCLdSzSqm0eCSxQ  love the gyro plate rice is so good and i also...   
3  G-WvGaISbqqaMHlNnByodA  rosie dakota and i love chaparral dog park its...   
4  1uJFq2r5QfJG_6ExMRCaGw  general manager scott petello is a good egg no...   

   sentiment  
0        1.0  
1        1.0  
2        1.0  
3        1.0  
4        1.0  


In [24]:
# TF-IDF vektorizavimas
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()

In [25]:
def train_test_split_with_indices(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    train_index, test_index = train_test_split(np.arange(len(X)), test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test, train_index, test_index

# Duomenu padalinimas i mokymo ir testavimo rinkinius
X_train, X_test, y_train, y_test, train_index, test_index = train_test_split_with_indices(X, y, test_size=0.2, random_state=42)

# Modeliu inicializavimas
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100),
    'Neural Network': MLPClassifier(max_iter=300)
}

# Modeliu mokymas
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{name} - Accuracy: {accuracy_score(y_test, y_pred)}, Recall: {recall_score(y_test, y_pred)}, F1: {f1_score(y_test, y_pred)}")

    # Modelio issaugojimas
    joblib.dump(model, f'{name.replace(" ", "_").lower()}_model.pkl')

Logistic Regression - Accuracy: 0.8881733021077284, Recall: 0.9919117647058824, F1: 0.9338871581862236
Random Forest - Accuracy: 0.8284543325526932, Recall: 0.9970588235294118, F1: 0.902495840266223
Neural Network - Accuracy: 0.9104215456674473, Recall: 0.9647058823529412, F1: 0.9449045732805185


In [26]:
# Paliginus apmokytus modelius geriausiai save parode neuroninis tinklas
best_model = joblib.load('neural_network_model.pkl')

# Sentimentu nustatymas testavimo atsiliepimuose
df_test = df.iloc[test_index]
df_test['predicted_sentiment'] = best_model.predict(X_test)

# Funkcija, nustatanti kurie atsiliepimai reikalauja reakcijos
# Yra galimybe nustatyti raktinius zodsius, kuriuos radus atsiliepimas taps reakcijos reikalaujanciu
# Taip pat atsiliepimas taps reakcijos reikalaujanciu jeigu medelio atsiliepimo sentimento analizes rezultatas bus lygus 0.
def identify_responses(review, predicted_sentiment):
    keywords = ['urgent', 'complaint', 'issue', 'problem', 'bad', 'horrible', 'worst']
    if predicted_sentiment == 0 or any(word in review for word in keywords):
        return True
    return False

# Funkcijos iskvietimas
df_test['needs_response'] = df_test.apply(lambda row: identify_responses(row['cleaned_text'], row['predicted_sentiment']), axis=1)

# Reakcijos reikalaujanciu atsiliepimu isvedimas
print(df_test[df_test['needs_response'] == True].head())

                   review_id  \
2486  fBh8nGMOjsZ1DOYQKNxfCQ   
7261  _ya44j3weaEcU8z21jeRNw   
5296  9yc5vhQCCPgHWPWcM72xJw   
3473  jN-O8-qaVLIslx-dc_s-CA   
733   BK4FpIlwDx2mAhk9qsK3sQ   

                                           cleaned_text  sentiment  \
2486  f \n\nshe didnt even look at me didnt bother a...        0.0   
7261  the bad the bar closes at pm  it seems like th...        1.0   
5296  this place is essentially a copy of the old fa...        0.0   
3473  skip the reviews that reference anything about...        1.0   
733   bland and lackluster food that is not worth th...        0.0   

      predicted_sentiment  needs_response  
2486                  0.0            True  
7261                  1.0            True  
5296                  1.0            True  
3473                  1.0            True  
733                   0.0            True  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predicted_sentiment'] = best_model.predict(X_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['needs_response'] = df_test.apply(lambda row: identify_responses(row['cleaned_text'], row['predicted_sentiment']), axis=1)
