In [64]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [65]:
df = pd.read_csv('/Users/andrejsmirnov/PycharmProjects/ML2/ML_2_mod/ML3/labeledTrainData.tsv', delimiter='\t')
df = df.drop('id', axis=1)
df

Unnamed: 0,sentiment,review
0,1,With all this stuff going down at the moment w...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,0,The film starts with a manager (Nicholas Bell)...
3,0,It must be assumed that those who praised this...
4,1,Superbly trashy and wondrously unpretentious 8...
...,...,...
24995,0,It seems like more consideration has gone into...
24996,0,I don't believe they made this film. Completel...
24997,0,"Guy is a loser. Can't get girls, needs to buil..."
24998,0,This 30 minute documentary Buñuel made in the ...


In [66]:
# Предобработка текста
tfidf_vectorizer = CountVectorizer()
X = tfidf_vectorizer.fit_transform(df['review'])

X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8838


In [67]:
# Предобработка текста
tfidf_vectorizer = TfidfVectorizer()  # TF-IDF векторизатор
X = tfidf_vectorizer.fit_transform(df['review'])

X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8936


## Предобработка

In [68]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

stop_words = set(stopwords.words('english'))  # Стоп-слова на английском языке
lemmatizer = WordNetLemmatizer()  # Лемматизатор
translator = str.maketrans('', '', string.punctuation)  # Удаление пунктуации


def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Токенизация и приведение к нижнему регистру
    tokens = [word.translate(translator) for word in tokens]  # Удаление пунктуации
    tokens = [word for word in tokens if word not in stop_words]  # Удаление стоп-слов
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Лемматизация слов
    return ' '.join(tokens)  # Объединение слов обратно в текст

# Применение предобработки ко всем текстовым обзорам
df['clean_review'] = df['review'].apply(preprocess_text)

In [69]:
df

Unnamed: 0,sentiment,review,clean_review
0,1,With all this stuff going down at the moment w...,stuff going moment mj started listening music ...
1,1,"\The Classic War of the Worlds\"" by Timothy Hi...",classic war world timothy hines entertaining ...
2,0,The film starts with a manager (Nicholas Bell)...,film start manager nicholas bell giving welc...
3,0,It must be assumed that those who praised this...,must assumed praised film greatest filmed ope...
4,1,Superbly trashy and wondrously unpretentious 8...,superbly trashy wondrously unpretentious 80 ex...
...,...,...,...
24995,0,It seems like more consideration has gone into...,seems like consideration gone imdb review film...
24996,0,I don't believe they made this film. Completel...,nt believe made film completely unnecessary ...
24997,0,"Guy is a loser. Can't get girls, needs to buil...",guy loser ca nt get girl need build picked ...
24998,0,This 30 minute documentary Buñuel made in the ...,30 minute documentary buñuel made early 1930 o...


In [70]:
# Предобработка текста
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df['clean_review'])

X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.892
