# ДЗ 6
Выполнили: Добрынина Анастасия, Тринихина Таисия
## Задача
Нужно решить задачу классификации отзывов о фильмах на положительные и отрицательные. Цель - получить как можно более высокое качество ответов.

In [1]:
import pandas as pd
import numpy as np
import pymorphy2
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score

In [27]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [28]:
test.head()

Unnamed: 0,id,text
0,0,I fail to see the appeal of this series (which...
1,1,According to the budget information given on t...
2,2,I was looking forward to seeing Amanda Peet in...
3,3,This movie is very disappointing for one who h...
4,4,There is absolutely no doubt that this version...


In [29]:
train.head()

Unnamed: 0,id,text,answer
0,0,What a disappointment... admittedly the best o...,0
1,1,This is a pale imitation of the Die Hard franc...,0
2,2,"This good-guy-vs-the-evil-tyrant story, set in...",0
3,3,This is a documentary I came across by chance ...,1
4,4,This installment of Masters of Horror was terr...,0


In [30]:
train.info()
# убедимся, что нет нулевых данных

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      25000 non-null  int64 
 1   text    25000 non-null  object
 2   answer  25000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 586.1+ KB


# TF-IDF

In [6]:
morph = pymorphy2.MorphAnalyzer()

In [7]:
def preprocess_text(text):
    lemmas = ''
    for word in nltk.word_tokenize(text):
        if word.isalpha():
            word = morph.parse(word.lower())[0]
            lemma = word.normal_form
            if lemma not in stops:
                lemmas += lemma + ' '
    return lemmas

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\79998\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# задаем параметры (добавим стоп-слова)
stops = stopwords.words("english")

tfidf = TfidfVectorizer(
    max_features=1000, # в случае плохого качества можно увеличить
    min_df=5,
    analyzer="word", # анализировать по словам или по символам (char)
    stop_words=stops # передаём список стоп-слов из NLTK
)


In [31]:
lemmas = []
for i in train['text']:
    lemmas.append(preprocess_text(i))

In [32]:
train['lemmas'] = lemmas

In [33]:
train_lemmas = train.drop('text',axis=1)

In [34]:
train_lemmas

Unnamed: 0,id,answer,lemmas
0,0,0,disappointment admittedly best prequels story ...
1,1,0,pale imitation die hard franchise sucks low am...
2,2,0,story set century russia may attempt extend st...
3,3,1,documentary came across chance uk tv channel s...
4,4,0,installment masters horror terrible apparently...
...,...,...,...
24995,24995,0,horrible script apparently directed one marine...
24996,24996,1,five years tenko survivors returning home mari...
24997,24997,1,understand critic evaluating quality acting fi...
24998,24998,0,movie pretentious foppish right funny filming ...


In [35]:
# обучаем TF-IDF
X = tfidf.fit_transform(train_lemmas['lemmas']).todense()


new_cols=tfidf.get_feature_names_out()

print(X.shape)

(25000, 1000)


In [36]:
train_tfidf = train_lemmas.drop('lemmas',axis=1)
#присоединяет tf-idf в датасет
train_tfidf = train_tfidf.join(pd.DataFrame(X, columns=new_cols))


In [37]:
X = train_tfidf.dropna() #кажется, ничего не изменилось
X

Unnamed: 0,id,answer,able,absolutely,across,act,acted,acting,action,actor,...,wrote,yeah,year,years,yes,yet,york,young,younger,zombie
0,0,0,0.000000,0.0,0.000000,0.0,0.0000,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.123399,0.0,0.000000,0.0,0.000000,0.0,0.0
1,1,0,0.000000,0.0,0.000000,0.0,0.0000,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
2,2,0,0.000000,0.0,0.000000,0.0,0.0000,0.000000,0.177493,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.171692,0.0,0.000000,0.0,0.0
3,3,1,0.125887,0.0,0.134343,0.0,0.0000,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
4,4,0,0.000000,0.0,0.000000,0.0,0.0000,0.000000,0.103645,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995,0,0.000000,0.0,0.000000,0.0,0.0000,0.000000,0.000000,0.0,...,0.0,0.0,0.162739,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
24996,24996,1,0.000000,0.0,0.000000,0.0,0.1598,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.196883,0.0,0.000000,0.0,0.000000,0.0,0.0
24997,24997,1,0.000000,0.0,0.000000,0.0,0.0000,0.122382,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0
24998,24998,0,0.000000,0.0,0.000000,0.0,0.0000,0.000000,0.000000,0.0,...,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0


подготовтим аналогично тестовые данные

In [38]:
lemmas_test = []
for i in test['text']:
    lemmas_test.append(preprocess_text(i))

In [39]:
test['lemmas'] = lemmas_test

In [40]:
test_lemmas = test.drop('text',axis=1)

In [41]:
# обучаем TF-IDF
X1 = tfidf.fit_transform(test_lemmas['lemmas']).todense()


new_cols=tfidf.get_feature_names_out()

print(X1.shape)

(25000, 1000)


In [42]:
test_tfidf = test_lemmas.drop('lemmas',axis=1)
#присоединяет tf-idf в датасет
test_tfidf = test_tfidf.join(pd.DataFrame(X1, columns=new_cols))

In [46]:
X_test = test_tfidf.dropna() #кажется, ничего не изменилось
X_test

Unnamed: 0,id,able,absolutely,accent,across,act,acted,acting,action,actor,...,wrong,wrote,yeah,year,years,yes,yet,york,young,zombie
0,0,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
1,1,0.0,0.000000,0.0,0.0,0.000000,0.0,0.158219,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.100236,0.0
2,2,0.0,0.000000,0.0,0.0,0.290504,0.0,0.044246,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
3,3,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.146824,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
4,4,0.0,0.193430,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,24995,0.0,0.000000,0.0,0.0,0.179958,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
24996,24996,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
24997,24997,0.0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
24998,24998,0.0,0.161324,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.123419,0.0,0.0,0.0,0.000000,0.0


Создание модели

In [43]:
y = np.array(X['answer'] == 0)
X_train = X.drop('answer',axis=1)

In [44]:
model = LogisticRegression()

model.fit(X_train, y)

LogisticRegression()

In [45]:
model.predict(X_test)[:5]

Feature names unseen at fit time:
- accent
- adult
- barely
- bought
- cat
- ...
Feature names seen at fit time, yet now missing:
- alien
- band
- ben
- charles
- christmas
- ...



array([ True,  True,  True,  True,  True])

Проверка качества

In [47]:
y_pred = model.predict(X_test)

Feature names unseen at fit time:
- accent
- adult
- barely
- bought
- cat
- ...
Feature names seen at fit time, yet now missing:
- alien
- band
- ben
- charles
- christmas
- ...



In [48]:
accuracy_score(X_test, y_pred)

ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets

In [None]:
f1_score(X_test, y_pred)

In [112]:
X_test.columns == X_train.columns

array([ True,  True,  True, ..., False, False,  True])

Нужно стандартизировать (наверное)

Что происходит дальше, я не знаю

In [31]:
X.describe()

Unnamed: 0,id,10,15,20,30,80,able,absolutely,across,act,...,wrote,yeah,year,years,yes,yet,york,young,younger,zombie
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,12499.5,0.01476,0.00281,0.003531,0.003184,0.003452,0.005526,0.006985,0.004308,0.005381,...,0.002887,0.00268,0.008403,0.014187,0.006313,0.009352,0.003438,0.011615,0.002769,0.002772
std,7217.022701,0.044001,0.022137,0.02343,0.022483,0.026784,0.027486,0.030999,0.024725,0.028057,...,0.021672,0.021677,0.0336,0.039323,0.029511,0.03272,0.02565,0.039327,0.021301,0.03143
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6249.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,12499.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,18749.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,24999.0,0.626743,0.43469,0.48664,0.420091,0.741168,0.463374,0.406573,0.490208,0.657187,...,0.487124,0.500716,0.447055,0.500028,0.466081,0.489667,0.678546,0.591885,0.492987,0.872346


In [32]:
sc = StandardScaler()
sc.fit(X.select_dtypes("number"))

StandardScaler()

In [36]:
X = sc.transform(X.select_dtypes("number"))
X_train = pd.DataFrame(X)
X_train.columns = X.select_dtypes("number").columns

In [37]:
X_train.describe()

Unnamed: 0,id,10,15,20,30,80,able,absolutely,across,act,...,wrote,yeah,year,years,yes,yet,york,young,younger,zombie
count,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,...,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0,25000.0
mean,-6.116885e-17,8.666667e-16,4.307132e-16,1.14021e-15,-6.262546000000001e-17,-1.919913e-15,-4.298983e-16,6.954415e-16,-6.350676e-16,-2.079277e-15,...,-3.933009e-16,5.111966e-16,2.106182e-16,6.82423e-16,-9.303203e-16,-8.339951e-16,-2.711003e-15,1.051639e-15,-2.090579e-15,5.891365e-16
std,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,...,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002,1.00002
min,-1.731982,-0.3354625,-0.1269367,-0.1507177,-0.1416245,-0.1288965,-0.2010529,-0.2253249,-0.1742366,-0.1918075,...,-0.1332262,-0.1236582,-0.2500917,-0.3607842,-0.2139107,-0.285824,-0.1340179,-0.2953617,-0.1299937,-0.08818208
25%,-0.8659908,-0.3354625,-0.1269367,-0.1507177,-0.1416245,-0.1288965,-0.2010529,-0.2253249,-0.1742366,-0.1918075,...,-0.1332262,-0.1236582,-0.2500917,-0.3607842,-0.2139107,-0.285824,-0.1340179,-0.2953617,-0.1299937,-0.08818208
50%,0.0,-0.3354625,-0.1269367,-0.1507177,-0.1416245,-0.1288965,-0.2010529,-0.2253249,-0.1742366,-0.1918075,...,-0.1332262,-0.1236582,-0.2500917,-0.3607842,-0.2139107,-0.285824,-0.1340179,-0.2953617,-0.1299937,-0.08818208
75%,0.8659908,-0.3354625,-0.1269367,-0.1507177,-0.1416245,-0.1288965,-0.2010529,-0.2253249,-0.1742366,-0.1918075,...,-0.1332262,-0.1236582,-0.2500917,-0.3607842,-0.2139107,-0.285824,-0.1340179,-0.2953617,-0.1299937,-0.08818208
max,1.731982,13.90879,19.50984,20.61985,18.54347,27.54411,16.65766,12.89073,19.65274,23.23167,...,22.34428,22.97604,13.05537,12.35532,15.57978,14.67985,26.32046,14.75529,23.01481,27.66763


In [47]:
X_test = sc.transform(test.select_dtypes("number"))
X_test = pd.DataFrame(X_test)
X_test.columns = test.select_dtypes("number").columns

Feature names seen at fit time, yet now missing:
- answer



ValueError: X has 1 features, but StandardScaler is expecting 2 features as input.

In [46]:
X.describe().round(3)

Unnamed: 0,id,answer
count,25000.0,25000.0
mean,-0.0,0.0
std,1.0,1.0
min,-1.732,-1.0
25%,-0.866,-1.0
50%,0.0,0.0
75%,0.866,1.0
max,1.732,1.0


Среднее равно 0, все отлично

План работы:
1. нам нужно классифицировать, поэтому самая очевидная модель для нашей задачи - логистическая регрессия 
* сделать tf-idf
* построить логистическую регрессию на основе tf-idf
2. Другой вариант - дерево решений
* сделать 1 дерево
* сделать случайный лес
3. Попробовать метод К ближайших средних
4. сравниь результаты всех подходов и выбрать лучший

(Какая-то странная модель без тф-идф)

In [52]:
X_test = np.array(test)

In [53]:
X_test

array([[0,
        'I fail to see the appeal of this series (which is supposed to be sci-fi). It\'s really just "let\'s see what soap operatically happens this week" and oh, the Cylons are involved through flashbacks.<br /><br />The Cylon "babe" that keeps nailing the other guy is pretty lame, it\'s pretty obvious that T&A was added to the show. Every time she pops up I\'m bewildered as to WTF is supposed to be going on. And don\'t even try to bullsh*t me about "story arcs".<br /><br />It\'s a soap opera with some CGI thrown-in. This is not science fiction aside from the original premise.<br /><br />This series is not everything it\'s worked-up to be. If you like trendy, edgy, dodgy, jumpy, vague editor-on-crack camera work, this show might be for you. Since nerds seem to be raving about this show, it\'s a clear indication that vocal nerds\' opinions have been changed from Picard\'s TNG.'],
       [1,
        "According to the budget information given on this web site Dark Harvest had 

In [51]:
model.predict(X_test)[:5]

Feature names unseen at fit time:
- text
Feature names seen at fit time, yet now missing:
- answer



ValueError: could not convert string to float: 'I fail to see the appeal of this series (which is supposed to be sci-fi). It\'s really just "let\'s see what soap operatically happens this week" and oh, the Cylons are involved through flashbacks.<br /><br />The Cylon "babe" that keeps nailing the other guy is pretty lame, it\'s pretty obvious that T&A was added to the show. Every time she pops up I\'m bewildered as to WTF is supposed to be going on. And don\'t even try to bullsh*t me about "story arcs".<br /><br />It\'s a soap opera with some CGI thrown-in. This is not science fiction aside from the original premise.<br /><br />This series is not everything it\'s worked-up to be. If you like trendy, edgy, dodgy, jumpy, vague editor-on-crack camera work, this show might be for you. Since nerds seem to be raving about this show, it\'s a clear indication that vocal nerds\' opinions have been changed from Picard\'s TNG.'