<a href="https://colab.research.google.com/github/DawidJag/NLP_sentiment_analysis_polish/blob/main/NLP_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Our goal is to prepare model for sentiment analysis in texts written in Polish. As a base we will use srapped data (reviews) from allegro.pl website. 

In [1]:
# importing neccessary libriaries
import requests
import pandas as pd
from bs4 import BeautifulSoup


In [2]:
# Create a function to scrape the data
def scrape_data(page_number, sentiment='negative'):

  if sentiment == 'negative':
    sentiment_bool = 'false'
  else:
    sentiment_bool = 'true'

  URL = 'https://allegro.pl/uzytkownik/OleOlepl/oceny?recommend=' + sentiment_bool + '&page=' + str(page_number)
  page = requests.get(URL)
  soup = BeautifulSoup(page.text, 'html.parser')

  rows = soup.find_all('div', attrs={'class':"_4lqfj"})

  for i in rows:
    row = i.find('p', attrs={'class':'_1h7wt _9hx3e'})

    text.append(row.text.strip())
    sentiment_list.append(sentiment)

  return (text, sentiment_list)

In [None]:
# Extracting all data
no_of_pages = 100

# clearing the lists
text = []
sentiment_list = []

for page in range(1,no_of_pages+1):
  (text, sentiment_list) = scrape_data(page, sentiment='negative')

for page in range(201,5*no_of_pages+1):
  (text, sentiment_list) = scrape_data(page, sentiment='positive')

In [None]:
# Create a DataFrame with all data
df = pd.DataFrame(columns=['text', 'sentiment'])
df['text'] = text
df['sentiment'] = sentiment_list

df.head(20)

Unnamed: 0,text,sentiment
0,"""Nie polecam.Lodówka z tyłu podklejonataśmą.Ja...",negative
1,"""Lodówka super (Niestety obsługa w sklepie nie...",negative
2,"""W ofercie sprzedaży sprzedający wyraźnie wska...",negative
3,"""Skandaliczna obsługa dostawców - co ważne są ...",negative
4,"""Otrzymałem zamówiony dzbanek filtrujący oznac...",negative
5,"""bardzo długi czas oczekiwania """,negative
6,"""Tydzień oczekiwania na przesyłkę, która nie p...",negative
7,"""Dostawa darmowa, ale już o tym że wniesienie ...",negative
8,"""Absolutnie zalose w jaki sposob towar zostal ...",negative
9,"""Otrzymałem uszkodzony towar. Zwrot deski do ...",negative


In [None]:
df.to_csv('scrapped_data.csv')

In [3]:
# Additional cell with code which should be run only if we use already scrapped data from csv file ('scrapped_data.csv')

df = pd.read_csv('scrapped_data.csv', )
df = df.drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,text,sentiment
0,"""Nie polecam.Lodówka z tyłu podklejonataśmą.Ja...",negative
1,"""Lodówka super (Niestety obsługa w sklepie nie...",negative
2,"""W ofercie sprzedaży sprzedający wyraźnie wska...",negative
3,"""Skandaliczna obsługa dostawców - co ważne są ...",negative
4,"""Otrzymałem zamówiony dzbanek filtrujący oznac...",negative


In [5]:
df.shape

(3389, 2)

In [6]:
df['sentiment'].value_counts()

positive    1726
negative    1663
Name: sentiment, dtype: int64

In [4]:
# Dataset preprocessing 
df['text'].replace(regex=True,inplace=True,to_replace=r'[\,"-()]',value=r'')
df['text'].replace(regex=True,inplace=True,to_replace=r'[\.]',value=r' ')

df['text'] = df['text'].str.lower()
df.head(10)

Unnamed: 0,text,sentiment
0,nie polecam lodówka z tyłu podklejonataśmą jak...,negative
1,lodówka super niestety obsługa w sklepie nieko...,negative
2,w ofercie sprzedaży sprzedający wyraźnie wskaz...,negative
3,skandaliczna obsługa dostawców - co ważne są o...,negative
4,otrzymałem zamówiony dzbanek filtrujący oznacz...,negative
5,bardzo długi czas oczekiwania,negative
6,tydzień oczekiwania na przesyłkę która nie prz...,negative
7,dostawa darmowa ale już o tym że wniesienie od...,negative
8,absolutnie zalose w jaki sposob towar zostal z...,negative
9,otrzymałem uszkodzony towar zwrot deski do p...,negative


## **Model**

---



In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Preparation of the data
X = df['text'].values
y = df['sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

Bag of words vectorization

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# vectorizer = CountVectorizer()
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, ngram_range=(1,2))
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)


In [8]:
print(X_train_vectors.shape)
print(X_test_vectors.shape)
print(y_train.shape)
print(y_test.shape)

(2711, 40734)
(678, 40734)
(2711,)
(678,)


## Classification

We will check 4 models:


*   SVC
*   Logistic regression
*   Decision tree classifier
*   Random forest classifier





In [9]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


clf_svm = SVC(kernel='linear', random_state=42)
clf_svm.fit(X_train_vectors, y_train)
y_pred_svm = clf_svm.predict(X_test_vectors)

clf_log = LogisticRegression(random_state=42)
clf_log.fit(X_train_vectors, y_train)
y_pred_log = clf_log.predict(X_test_vectors)

clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(X_train_vectors, y_train)
y_pred_tree = clf_tree.predict(X_test_vectors)

clf_forest = RandomForestClassifier(random_state=42)
clf_forest.fit(X_train_vectors, y_train)
y_pred_forest = clf_forest.predict(X_test_vectors)

Evaluation of models

In [10]:
print(clf_svm.score(X_test_vectors, y_test))
print(clf_log.score(X_test_vectors, y_test))
print(clf_tree.score(X_test_vectors, y_test))
print(clf_forest.score(X_test_vectors, y_test))

0.9380530973451328
0.9365781710914455
0.8849557522123894
0.9100294985250738


In [11]:
from sklearn.metrics import f1_score

print(f1_score(y_test, y_pred_svm, average=None))
print(f1_score(y_test, y_pred_log, average=None))
print(f1_score(y_test, y_pred_tree, average=None))
print(f1_score(y_test, y_pred_forest, average=None))

[0.94150418 0.93416928]
[0.94133697 0.93097913]
[0.88323353 0.88662791]
[0.91223022 0.90771558]


As we can see our models reached quite high accuracy. Let's check how they will perform with manually created examples.

In [16]:
# Checking predictions on randomly created examples
test_set = ['ta książka jest słaba.', 'lampka przyszła zniszczona', 'sprzęt godny polecenia', 'naprawdę dobry sprzęt']
new_set = vectorizer.transform(test_set)

print(clf_svm.predict(new_set))
print(clf_log.predict(new_set))
print(clf_tree.predict(new_set))
print(clf_forest.predict(new_set))

['negative' 'positive' 'positive' 'positive']
['negative' 'negative' 'positive' 'positive']
['positive' 'positive' 'positive' 'positive']
['positive' 'positive' 'positive' 'positive']


Results: 
only logistic regression predict correctly sentiment. We will try hyperparameter tuning to see if we can improve these results.

### Hyperparameters tunning

In [28]:
# Preparation of param grids for all classifiers
svm_grid = {'kernel': ['linear', 'rbf', 'poly'], 'C': [1,4,8,16,32], 'random_state': [42]}
log_grid = {'penalty': ['l1', 'l2'], 'C': [1,4,8,16,32], 'solver': ['lbfgs', 'liblinear'], 'random_state': [42]}
tree_grid = {'criterion': ['gini', 'entropy'], 'max_depth': [2,4,6,8,10], 'min_samples_leaf': [4,5,6,7,8,9], 'random_state': [42]}
forest_grid = {'n_estimators': [100,150,200], 'criterion': ['entropy'], 'max_depth': [6,8,10,12,14], 'min_samples_leaf': [4,5,6], 'random_state': [42]}

In [14]:
from sklearn.model_selection import GridSearchCV

gs_svm = GridSearchCV(SVC(), param_grid = svm_grid, cv=5, verbose=True)
gs_svm.fit(X_train_vectors, y_train)

gs_svm.score(X_test_vectors, y_test)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  1.4min finished


0.93952802359882

In [15]:
gs_log = GridSearchCV(LogisticRegression(), param_grid = log_grid, cv=5, verbose=True)
gs_log.fit(X_train_vectors, y_train)

gs_log.score(X_test_vectors, y_test)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   16.1s finished


0.93952802359882

In [16]:
gs_tree = GridSearchCV(DecisionTreeClassifier(), param_grid = tree_grid, cv=5, verbose=True)
gs_tree.fit(X_train_vectors, y_train)

gs_tree.score(X_test_vectors, y_test)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 300 out of 300 | elapsed:   45.8s finished


0.8716814159292036

In [29]:
gs_forest = GridSearchCV(RandomForestClassifier(), param_grid = forest_grid, cv=5, verbose=True, n_jobs=-1)
gs_forest.fit(X_train_vectors, y_train)

gs_forest.score(X_test_vectors, y_test)

Fitting 5 folds for each of 45 candidates, totalling 225 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 225 out of 225 | elapsed:  2.5min finished


0.8569321533923304

In [31]:
print(f'SVM score: {clf_svm.score(X_test_vectors, y_test):.4f} vs Grid Search score: {gs_svm.score(X_test_vectors, y_test):.4f}')
print(f'Logistic regression score: {clf_log.score(X_test_vectors, y_test):.4f} vs Grid Search score: {gs_log.score(X_test_vectors, y_test):.4f}')
print(f'Decision tree score: {clf_tree.score(X_test_vectors, y_test):.4f} vs Grid Search score: {gs_tree.score(X_test_vectors, y_test):.4f}')
print(f'Random forest score: {clf_forest.score(X_test_vectors, y_test):.4f} vs Grid Search score: {gs_forest.score(X_test_vectors, y_test):.4f}')

SVM score: 0.9381 vs Grid Search score: 0.9395
Logistic regression score: 0.9366 vs Grid Search score: 0.9395
Decision tree score: 0.8850 vs Grid Search score: 0.8717
Random forest score: 0.9100 vs Grid Search score: 0.8569


Conclusions:
as we can see in this case grid search hasn't brought much improvement to our models. SVM model and logistic regression performed almost on the same level of accuracy (~93%). Next steps would be to go further with hyperparameters tuning for those two models and attempt to obtain more data (especially from different sources to have wider range of words).


---


To have bigger dataset we have downloaded data available on the following gitHub profile: https://github.com/Ermlab/pl-sentiment-analysis

As author of this dataset wrote on the profile data has been collected from few sources:

1. Opineo - Polish service with all reviews from online shops
2. Twitter - Polish current top hashtags from political news and Polish Election Campaign 2015
3. Polish Academy of Science HateSpeech project
4. YouTube - comments from various videos







In [174]:
data = pd.read_csv('Kopia polish_sentiment_dataset.csv')
# data = df.drop('Unnamed: 0', axis=1)
data.shape

(936883, 3)

In [175]:
data.head()

Unnamed: 0,description,length,rate
0,Polecam nie pierwszy i nie ostatni raz!,39.0,1.0
1,Bardzo dobra komunikacja sms i telefoniczna. Z...,121.0,1.0
2,Polecam zakupy w tym sklepie. Są dostępne częś...,87.0,1.0
3,0,0.0,0.0
4,Jestem w pełni zadowolona z przebiegu transakcji,48.0,1.0


In [178]:
# Let's check if our data is balanced
data['rate'].value_counts()

 1.0    734250
-1.0    184020
 0.0     18547
Name: rate, dtype: int64

In [187]:
# As we have imbalanced data we will do undersamplig
data_positive = data[data['rate'] == 1].sample(n=184020, random_state=42)
data_negative = data[data['rate'] == -1]

data_new = data_positive.append(data_negative, ignore_index=True)
data_new['text'] = data_new['description']
data_new['sentiment'] = data_new['rate'].map({1: 'positive', -1: 'negative'})
data_new = data_new.drop(['description', 'rate', 'length'], axis=1)
data_new

Unnamed: 0,text,sentiment
0,"Wszystko w porządku! Sprawnie, dostałem to, co...",positive
1,Polecam serdecznie . Korzystam często,positive
2,Bardzo miła i szybka obsługa pomimo mojej pomy...,positive
3,"Wiarygodny, rzetelny,. bardzo szybka realizacj...",positive
4,Ogólna Ocena na 5 . Niestety nie było wszystki...,positive
...,...,...
368035,Coraz lepiej wygląda,negative
368036,JA SRAM NA TEN PIERSCIONEK I NA CIEBIE CHWILE ...,negative
368037,Rafatus do Marleny Ty kurwo bez honoru ...,negative
368038,matka Marleny prosi o pomoc,negative


In [189]:
# Data preprocessing
data_new['text'].replace(regex=True,inplace=True,to_replace=r'[\,"-()!]',value=r'')
data_new['text'].replace(regex=True,inplace=True,to_replace=r'[\.]',value=r' ')

data_new['text'] = data_new['text'].str.lower()
data_new.head(10)

Unnamed: 0,text,sentiment
0,wszystko w porządku sprawnie dostałem to co za...,positive
1,polecam serdecznie korzystam często,positive
2,bardzo miła i szybka obsługa pomimo mojej pomy...,positive
3,wiarygodny rzetelny bardzo szybka realizacja ...,positive
4,ogólna ocena na 5 niestety nie było wszystki...,positive
5,transakcja bez zastrzeżeń dość szybko i najważ...,positive
6,chyba najlepszy sklep internetowy naprawdę god...,positive
7,wszystko sprawnie ekspresowa dostawa polecam,positive
8,korzystałam z usług uwolnijkolory pl już kilka...,positive
9,obsługa na najwyższym poziomie szybko towar w...,positive


In [190]:
# Adding new data to our dataset scrapped from allegro.pl
df_extended = df.append(data_new, ignore_index=True)
df_extended

Unnamed: 0,text,sentiment
0,nie polecam lodówka z tyłu podklejonataśmą jak...,negative
1,lodówka super niestety obsługa w sklepie nieko...,negative
2,w ofercie sprzedaży sprzedający wyraźnie wskaz...,negative
3,skandaliczna obsługa dostawców - co ważne są o...,negative
4,otrzymałem zamówiony dzbanek filtrujący oznacz...,negative
...,...,...
371424,coraz lepiej wygląda,negative
371425,ja sram na ten pierscionek i na ciebie chwile ...,negative
371426,rafatus do marleny ty kurwo bez honoru ...,negative
371427,matka marleny prosi o pomoc,negative


In [25]:
# df_extended.to_csv('data_extended.csv')

In [34]:
# Additional cell with code which should be run only if we use already scrapped data from csv file ('scrapped_data.csv')
df_extended = pd.read_csv('data_extended_NLP_sentiment_analysis.csv')
df_extended = df_extended.drop('Unnamed: 0', axis=1)
df_extended

Unnamed: 0,text,sentiment
0,nie polecam lodówka z tyłu podklejonataśmą jak...,negative
1,lodówka super niestety obsługa w sklepie nieko...,negative
2,w ofercie sprzedaży sprzedający wyraźnie wskaz...,negative
3,skandaliczna obsługa dostawców - co ważne są o...,negative
4,otrzymałem zamówiony dzbanek filtrujący oznacz...,negative
...,...,...
371441,coraz lepiej wygląda,negative
371442,ja sram na ten pierscionek i na ciebie chwile ...,negative
371443,rafatus do marleny ty kurwo bez honoru ...,negative
371444,matka marleny prosi o pomoc,negative


In [35]:
# Checking if we have NA values in new dataset
df_extended[df_extended['text'].isna() == True]

Unnamed: 0,text,sentiment
50912,,
54310,,
71441,,
79253,,
82035,,
...,...,...
371108,,negative
371167,,negative
371334,,negative
371348,,negative


In [36]:
# Removing NA values
df_extended = df_extended.dropna()
df_extended[df_extended['text'].isna() == True]

Unnamed: 0,text,sentiment


Let's train our models again with larger dataset

In [61]:
# Preparation of the data
data_tmp = df_extended.sample(frac=0.2, replace=False, random_state=42) # big data set => first training on smaller sample

X_ext = data_tmp['text'].values
y_ext = data_tmp['sentiment'].values

X_train_ext, X_test_ext, y_train_ext, y_test_ext = train_test_split(X_ext, y_ext, test_size=0.2, shuffle=True, random_state=42)

vectorizer_ext = TfidfVectorizer(sublinear_tf=True, max_df=0.5, ngram_range=(1,2))
X_train_ext_vectors = vectorizer_ext.fit_transform(X_train_ext)
X_test_ext_vectors = vectorizer_ext.transform(X_test_ext)


In [62]:
print(X_train_ext_vectors.shape)
print(X_test_ext_vectors.shape)
print(y_train_ext.shape)
print(y_test_ext.shape)

(59325, 485006)
(14832, 485006)
(59325,)
(14832,)


In [63]:
clf_svm_ext = SVC(kernel='linear', random_state=42)
clf_svm_ext.fit(X_train_ext_vectors, y_train_ext)
# y_pred_svm_ext = clf_svm_ext.predict(X_test_ext_vectors)

print(clf_svm_ext.score(X_test_ext_vectors, y_test_ext))

0.9837513484358145


In [54]:
clf_log_ext = LogisticRegression(random_state=42)
clf_log_ext.fit(X_train_ext_vectors, y_train_ext)
# y_pred_log_ext = clf_log_ext.predict(X_test_ext_vectors)

print(clf_log_ext.score(X_test_ext_vectors, y_test_ext))

0.9777498089630062


In [55]:
clf_tree_ext = DecisionTreeClassifier(random_state=42)
clf_tree_ext.fit(X_train_ext_vectors, y_train_ext)
# y_pred_tree_ext = clf_tree_ext.predict(X_test_ext_vectors)

print(clf_tree_ext.score(X_test_ext_vectors, y_test_ext))

0.962197150177552


In [56]:
clf_forest_ext = RandomForestClassifier(random_state=42)
clf_forest_ext.fit(X_train_ext_vectors, y_train_ext)
# y_pred_forest_ext = clf_forest_ext.predict(X_test_ext_vectors)

print(clf_forest_ext.score(X_test_ext_vectors, y_test_ext))

0.9784690070571312


In [59]:
# Checking predictions on randomly created examples
test_set_2 = ['ta trasa nie będzie dla nas dobra', 'to ciasto było za słone', 'gra jest zabawna', 
              'produkt wart swojej ceny', 'towar do dupy', 'bardzo fajny sprzedawca', 'w miarę szybka przesyłka']
new_set_2 = vectorizer_ext.transform(test_set_2)

print(clf_svm_ext.predict(new_set_2))
print(clf_log_ext.predict(new_set_2))
print(clf_tree_ext.predict(new_set_2))
print(clf_forest_ext.predict(new_set_2))

['negative' 'negative' 'negative' 'positive' 'negative' 'positive'
 'positive']
['negative' 'negative' 'negative' 'positive' 'negative' 'positive'
 'positive']
['negative' 'negative' 'negative' 'positive' 'positive' 'positive'
 'positive']
['negative' 'negative' 'negative' 'positive' 'negative' 'positive'
 'positive']
