<a href="https://colab.research.google.com/github/DaruHashida/MyPyTorch/blob/main/text_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
### 1. Make some data and get ready

import sklearn
from sklearn.datasets import make_circles

In [2]:
import numpy as np
import pandas as pd

In [3]:
##Загружаем данные
data = pd.read_csv('./IMDB Dataset.csv')

In [4]:
## Проверяем формат данных
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [5]:
##Заменяем значение labels на boolean
data['sentiment'] = data['sentiment'].replace(['positive', 'negative'],['1','0'])
## Смотрим, что получилось
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [6]:
#Импортируем токенизатор
from torchtext.data import get_tokenizer

# Initialize object and tokenize each line
pytorch_tokenizer = get_tokenizer("basic_english")

In [7]:
from bs4 import BeautifulSoup
###Убираем html из комментариев
def remove_html (html):
  soup = BeautifulSoup(html)
  return soup.get_text()

data['review']=data['review'].map(lambda com: remove_html(com))

  soup = BeautifulSoup(html)


In [8]:
## Убираем пунктуацию и цифры из комментариев
import re
def remove_punct_dig (text):
  text = re.sub(r'[^\w\s]', ' ',text)
  text = re.sub(r'\b\d+\b', ' digit ', text)
  text = re.sub(r'\bbr\b', '', text)
  text = re.sub(r'\bs\b', '', text)
  return text
data['review']=data['review'].map(lambda com: remove_punct_dig(com))
##Токенизируем текст комментария
data['review'] = data['review'].map(lambda com: pytorch_tokenizer(com))
###Смотрим, что получилось
data.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",1
1,"[a, wonderful, little, production, the, filmin...",1
2,"[i, thought, this, was, a, wonderful, way, to,...",1
3,"[basically, there, a, family, where, a, little...",0
4,"[petter, mattei, love, in, the, time, of, mone...",1


In [9]:
data.head()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",1
1,"[a, wonderful, little, production, the, filmin...",1
2,"[i, thought, this, was, a, wonderful, way, to,...",1
3,"[basically, there, a, family, where, a, little...",0
4,"[petter, mattei, love, in, the, time, of, mone...",1


In [10]:
type(data)

pandas.core.frame.DataFrame

###Удаляем стоп-слова и лемматизируем наш токенизированный текст!


In [14]:
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

##Удаляем стоп-слова
stop_words = stopwords.words('english');
def remove_stop_words(words):
  return list(filter(lambda x: x not in stop_words,words))
data['review'] = data['review'].map(lambda words_list: remove_stop_words(words_list))

def lemmatize (text):
  lem = WordNetLemmatizer()
  for part_of_speech in ['n','v','a','r','s']:
    lemmated = list(map(lambda token: lem.lemmatize(token,part_of_speech),text))
    return lemmated
data['review'] = data['review'].map(lambda words_list: lemmatize(words_list))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
data.head()

Unnamed: 0,review,sentiment
0,"[one, reviewer, mentioned, watching, digit, oz...",1
1,"[wonderful, little, production, filming, techn...",1
2,"[thought, wonderful, way, spend, time, hot, su...",1
3,"[basically, family, little, boy, jake, think, ...",0
4,"[petter, mattei, love, time, money, visually, ...",1


###ФОРМИРУЕМ ТРЕНИРОВОЧНУЮ И ТЕСТОВУЮ ВЫБОРКИ

In [21]:
y = data['sentiment']
X = data['review']
proportion = int(len(X)*0.8)
X_train, y_train = X[:proportion],y[:proportion]
X_test, y_test = X[proportion:],y[proportion:]

###ПРЕВРАЩАЕМ НАШИ ТЕКСТЫ КОММЕНТАРИЕВ В ТЕНЗОРЫ!

In [24]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(tokenizer=lambda a: a, use_idf=True, ngram_range=(1,1), lowercase=False)
train_features = tfidf.fit_transform(X_train)
test_features = tfidf.transform(X_test)
print(train_features.shape,test_features.shape)



(40000, 84888) (10000, 84888)


## Смотрим, что вышло

In [25]:
##Смотрим, что получилось
tfidf.vocabulary_

{'one': 53358,
 'reviewer': 62688,
 'mentioned': 47847,
 'watching': 81735,
 'digit': 20496,
 'oz': 54475,
 'episode': 24747,
 'hooked': 35410,
 'right': 63022,
 'exactly': 25483,
 'happened': 33391,
 'first': 27586,
 'thing': 75128,
 'struck': 71892,
 'brutality': 10502,
 'unflinching': 78650,
 'scene': 65390,
 'violence': 80827,
 'set': 66685,
 'word': 83323,
 'go': 31160,
 'trust': 77246,
 'show': 67629,
 'faint': 26256,
 'hearted': 34014,
 'timid': 75613,
 'pull': 59425,
 'punch': 59468,
 'regard': 61505,
 'drug': 22460,
 'sex': 66763,
 'hardcore': 33445,
 'classic': 14245,
 'use': 79715,
 'called': 11404,
 'nickname': 51753,
 'given': 30889,
 'oswald': 53881,
 'maximum': 46971,
 'security': 66167,
 'state': 71035,
 'penitentary': 55766,
 'focus': 28165,
 'mainly': 45674,
 'emerald': 24007,
 'city': 14115,
 'experimental': 25810,
 'section': 66154,
 'prison': 58647,
 'cell': 12585,
 'glass': 30953,
 'front': 29153,
 'face': 26158,
 'inwards': 38580,
 'privacy': 58660,
 'high': 3469

###Импортируем loss - функцию и тренируем модель

In [28]:
clf = sklearn.linear_model.LogisticRegression()
clf.fit(train_features, y_train)

###Тестируем модель

In [30]:
predictions = clf.predict(test_features)

### Проверяем точность

In [31]:
print(sklearn.metrics.classification_report(y_test, predictions, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.90      0.89      0.89      4993
    Positive       0.89      0.90      0.90      5007

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [34]:
print(sklearn.metrics.confusion_matrix(y_test, predictions,labels=['0','1']))

[[4433  560]
 [ 481 4526]]
