In [93]:
import pandas as pd
from textblob import TextBlob
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [113]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

Данные взяты с платформы kaggle (https://www.kaggle.com/competitions/product-reviews-sentiment-analysis-light/data?select=products_sentiment_train.tsv). В датасете собраны отзывы на товары, необходимо классифицировать все отзывы на 2 категории 1 - положительные, 0 - отрицательные. В файле train.csv отзывы уже размечены. Разделим датасет на несколько частей, в первой части будут отзывы с данными отметками, вторую часть разметим с помощью библиотеки textblob, третью часть в Label Studio, четвертую на основе заданных правил.

In [52]:
df = pd.read_csv('products_sentiment_train.tsv', delimiter='\t')

In [53]:
df.head()

Unnamed: 0,"2 . take around 10,000 640x480 pictures .",1
0,i downloaded a trial version of computer assoc...,1
1,the wrt54g plus the hga7t is a perfect solutio...,1
2,i dont especially like how music files are uns...,0
3,i was using the cheapie pail ... and it worked...,1
4,"you can manage your profile , change the contr...",1


In [54]:
df.columns

Index(['2 . take around 10,000 640x480 pictures .', '1'], dtype='object')

In [55]:
df.shape

(1999, 2)

In [57]:
df.rename(columns={'2 . take around 10,000 640x480 pictures .': 'review', '1': 'mark'}, inplace=True)
df.head()

Unnamed: 0,review,mark
0,i downloaded a trial version of computer assoc...,1
1,the wrt54g plus the hga7t is a perfect solutio...,1
2,i dont especially like how music files are uns...,0
3,i was using the cheapie pail ... and it worked...,1
4,"you can manage your profile , change the contr...",1


In [58]:
# Определение размеров каждой части
size1 = int(len(df) * 0.75)  # Примерно 75% от общего размера
size2 = int(len(df) * 0.10)  # Примерно 10% от общего размера
size3 = int(len(df) * 0.05)  # Примерно 5% от общего размера

In [59]:
part1 = df.iloc[:size1] # часть данных с отметкой
part2 = df.iloc[size1:size1+size2] # 10% для разметки c textblob
part3 = df.iloc[size1+size2:size1+size2+size2] # 10% для разметки по правилам
part4 = df.iloc[size1+size2+size2:] # 5% для разметки в Label Studio

In [60]:
part4.head()

Unnamed: 0,review,mark
1897,"after installation , which was very simple ......",1
1898,no one has ever come to my house and noticed '...,1
1899,it is hot enough to be annoying .,0
1900,the nomad explorer provide most of the functio...,1
1901,this program enables the user to batch convert...,1


In [61]:
part4.tail()

Unnamed: 0,review,mark
1994,"speaker phone quality is good , and poping in ...",1
1995,"the "" movies "" last about 5 seconds .",0
1996,overall i like it .,1
1997,i began taking pics as soon as i got this came...,1
1998,"even after reading some of the instructions , ...",0


In [64]:
part1.shape[0], part2.shape[0], part3.shape[0], part4.shape[0]

(1499, 199, 199, 102)

In [66]:
part2 = part2[['review']]
part2.head()

Unnamed: 0,review
1499,"the possibilities with auto , manual , and the..."
1500,"of course , if you try to reach symantec custo..."
1501,i am impressed with how easy this camera is to...
1502,"i was looking for a compact , rugged ( i carry..."
1503,zennx 's replaceable battery was the deciding ...


In [67]:
def get_sentiment(review):
    blob = TextBlob(review)
    sentiment = blob.sentiment.polarity
    return sentiment

def get_mark_textblob(sentiment):
    if sentiment >= 0:
        return 1
    return 0

In [70]:
part2['sentiment'] = part2.review.apply(get_sentiment)
part2['mark'] = part2.sentiment.apply(get_mark_textblob)
part2.head()

Unnamed: 0,review,sentiment,mark
1499,"the possibilities with auto , manual , and the...",0.058333,1
1500,"of course , if you try to reach symantec custo...",-0.5,0
1501,i am impressed with how easy this camera is to...,0.716667,1
1502,"i was looking for a compact , rugged ( i carry...",0.433333,1
1503,zennx 's replaceable battery was the deciding ...,0.0,1


In [71]:
part2.drop(columns=['sentiment'], inplace=True)
part2.head()

Unnamed: 0,review,mark
1499,"the possibilities with auto , manual , and the...",1
1500,"of course , if you try to reach symantec custo...",0
1501,i am impressed with how easy this camera is to...,1
1502,"i was looking for a compact , rugged ( i carry...",1
1503,zennx 's replaceable battery was the deciding ...,1


In [80]:
part4 = part4[['review']]
part4.head()

Unnamed: 0,review
1897,"after installation , which was very simple ......"
1898,no one has ever come to my house and noticed '...
1899,it is hot enough to be annoying .
1900,the nomad explorer provide most of the functio...
1901,this program enables the user to batch convert...


In [79]:
part4.to_csv('part4.csv', index=False)

![photo1](photo1.jpg)
![photo2](photo2.jpg)
![photo3](photo3.jpg)

In [87]:
part4_ls = pd.read_csv('project.csv')
part4 = part4_ls[['review', 'sentiment']]
part4.head()

Unnamed: 0,review,sentiment
0,"after installation , which was very simple ......",1
1,no one has ever come to my house and noticed '...,1
2,it is hot enough to be annoying .,0
3,the nomad explorer provide most of the functio...,1
4,this program enables the user to batch convert...,1


In [88]:
part4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102 entries, 0 to 101
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     102 non-null    object
 1   sentiment  102 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 1.7+ KB


In [104]:
part4.rename(columns={'sentiment': 'mark'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  part4.rename(columns={'sentiment': 'mark'}, inplace=True)


In [105]:
part4.head()

Unnamed: 0,review,mark
0,"after installation , which was very simple ......",1
1,no one has ever come to my house and noticed '...,1
2,it is hot enough to be annoying .,0
3,the nomad explorer provide most of the functio...,1
4,this program enables the user to batch convert...,1


In [89]:
part3.head()

Unnamed: 0,review,mark
1698,camera quality isn 't bad for the low resoluti...,1
1699,1 ) fragile ( i broke the 1st one within 10 da...,0
1700,the smooth operation variable speed and a set ...,1
1701,i 've had nothing but good luck with t-mobile .,1
1702,"es , the interface took a little getting used ...",0


В этой части датасета проставим метки,  исходя из наличия в отзывах положительных слов. Если данных слов в отзыве нет, то отзыв либо нейтральный, либо отрицательный, ему будет присвоена метка - 0.

In [97]:
positive_words = ['good', 'greate', 'amazing', 'recommend', 'creative', 'fine', 'comfortable', 'pretty', 
                 'as well', 'fantastically', 'terrific', 'love', 'like',  'easy', 'winner', 'perfect', 'nice',
                 'flawlessly', 'happy', 'fast', 'marvel', 'powerful', 'solve', 'bright', 'suprisingly']

In [98]:
def get_sentiment_by_rule(review):
    words = nltk.word_tokenize(review.lower())
    num_positive = sum([1 for word in words if word in positive_words])
    if num_positive > 0:
        return 1
    return 0

In [99]:
part3 = part3[['review']]
part3.head()

Unnamed: 0,review
1698,camera quality isn 't bad for the low resoluti...
1699,1 ) fragile ( i broke the 1st one within 10 da...
1700,the smooth operation variable speed and a set ...
1701,i 've had nothing but good luck with t-mobile .
1702,"es , the interface took a little getting used ..."


In [100]:
part3['mark'] = part3.review.apply(get_sentiment_by_rule)
part3.head()

Unnamed: 0,review,mark
1698,camera quality isn 't bad for the low resoluti...,0
1699,1 ) fragile ( i broke the 1st one within 10 da...,0
1700,the smooth operation variable speed and a set ...,1
1701,i 've had nothing but good luck with t-mobile .,1
1702,"es , the interface took a little getting used ...",0


In [106]:
data = pd.concat([part1, part2, part3, part4])

In [107]:
data.shape

(1999, 2)

In [110]:
data.tail()

Unnamed: 0,review,mark
97,"speaker phone quality is good , and poping in ...",1
98,"the "" movies "" last about 5 seconds .",0
99,overall i like it .,1
100,i began taking pics as soon as i got this came...,1
101,"even after reading some of the instructions , ...",0


In [111]:
data.reset_index(drop=True, inplace=True)

In [112]:
data.tail()

Unnamed: 0,review,mark
1994,"speaker phone quality is good , and poping in ...",1
1995,"the "" movies "" last about 5 seconds .",0
1996,overall i like it .,1
1997,i began taking pics as soon as i got this came...,1
1998,"even after reading some of the instructions , ...",0


In [114]:
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['mark'], test_size=0.3, random_state=42)

In [115]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])

In [116]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', LogisticRegression(max_iter=1000))])

In [117]:
predictions = pipeline.predict(X_test)

In [118]:
accuracy = accuracy_score(y_test, predictions)
accuracy

0.7133333333333334

In [119]:
report = classification_report(y_test, predictions)
report

'              precision    recall  f1-score   support\n\n           0       0.75      0.35      0.48       225\n           1       0.71      0.93      0.80       375\n\n    accuracy                           0.71       600\n   macro avg       0.73      0.64      0.64       600\nweighted avg       0.72      0.71      0.68       600\n'

Построим модель для данного датасета с проставленными метками:

In [120]:
df.head()

Unnamed: 0,review,mark
0,i downloaded a trial version of computer assoc...,1
1,the wrt54g plus the hga7t is a perfect solutio...,1
2,i dont especially like how music files are uns...,0
3,i was using the cheapie pail ... and it worked...,1
4,"you can manage your profile , change the contr...",1


In [121]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['mark'], test_size=0.3, random_state=42)

pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')), 
                     ('clf', LogisticRegression(max_iter=1000))])
pipeline.fit(X_train, y_train)
predictions = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
accuracy

0.73

Данная модель немного лучше. Разметка данных может значительно влиять на качество модели машинного обучения. Корректная и информативная разметка данных является ключевым элементом для построения эффективной модели. Таким образом, произведенная разметка с помощью заданного мною правила и библиотеки textblob повлияла на эффективность модели, разметка в Label Studio более точная, но и более затратная по времени.