На этом занятии мы попробуем задачу регрессии. Данные в этой же папке, будем тренироваться на датасете фильмов с IMDB

Перед обучением обучением модели, нужно подготовить данные:

- найти\собрать данные
- почистить и предобработать
- преобразовать в матрицы 


In [3]:
# импорты необходимых библиотек
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

# import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [9]:
data = pd.read_csv('IMDB-Movie-Data.csv')
print(data.shape)

data.head(3)

(1000, 12)


Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


## Что делать с NaN?
Есть 3 варианта

In [5]:
# 1. Убрать строки с NaN
print(data.isna().any())
data.shape

Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)     True
Metascore              True
dtype: bool


(1000, 12)

In [6]:
print(data.shape)
tmp = data.dropna()
tmp.shape

(1000, 12)


(838, 12)

In [7]:
# 2. Превратить NaN в 0
print(data.shape)
tmp = data.fillna(0)
print(tmp.shape)

(1000, 12)
(1000, 12)


In [10]:
# 3. Превратить NaN в средние значения по колонке

# вычисляем средние для колонок с пустыми значениями
meta_mean = data.Metascore.mean()
rev_mean = data['Revenue (Millions)'].mean()

#заменяем пустоты на средние значения
data.Metascore.fillna(meta_mean, inplace=True)
data['Revenue (Millions)'].fillna(rev_mean, inplace=True)

# проверяем присутствие NaN
data.isna().any()

Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)    False
Metascore             False
dtype: bool

## Подготовка данных

Попробуем предсказывать рейтинг фильма по данным его описания, года, длины в минутах и кассовых сборов

Колонка "Rating" станет **целевой переменной, или таргетом** (y)<br>
Остальных данные будут **обучающей выборкой** (X)

In [11]:
data.Description

0      A group of intergalactic criminals are forced ...
1      Following clues to the origin of mankind, a te...
2      Three girls are kidnapped by a man with a diag...
3      In a city of humanoid animals, a hustling thea...
4      A secret government agency recruits some of th...
                             ...                        
995    A tight-knit team of rising investigators, alo...
996    Three American college students studying abroa...
997    Romantic sparks occur between two dance studen...
998    A pair of friends embark on a mission to reuni...
999    A stuffy businessman finds himself trapped ins...
Name: Description, Length: 1000, dtype: object

In [23]:
# подготовим описания фильмов
data["text"] = data.Description.apply(lambda x: x.lower().split()) 

data["text"]

0      [a, group, of, intergalactic, criminals, are, ...
1      [following, clues, to, the, origin, of, mankin...
2      [three, girls, are, kidnapped, by, a, man, wit...
3      [in, a, city, of, humanoid, animals,, a, hustl...
4      [a, secret, government, agency, recruits, some...
                             ...                        
995    [a, tight-knit, team, of, rising, investigator...
996    [three, american, college, students, studying,...
997    [romantic, sparks, occur, between, two, dance,...
998    [a, pair, of, friends, embark, on, a, mission,...
999    [a, stuffy, businessman, finds, himself, trapp...
Name: text, Length: 1000, dtype: object

In [24]:
input_text = list(data.text.values)

In [25]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(input_text)]
documents[10:12]

[TaggedDocument(words=['the', 'adventures', 'of', 'writer', 'newt', 'scamander', 'in', 'new', "york's", 'secret', 'community', 'of', 'witches', 'and', 'wizards', 'seventy', 'years', 'before', 'harry', 'potter', 'reads', 'his', 'book', 'in', 'school.'], tags=[10]),
 TaggedDocument(words=['the', 'story', 'of', 'a', 'team', 'of', 'female', 'african-american', 'mathematicians', 'who', 'served', 'a', 'vital', 'role', 'in', 'nasa', 'during', 'the', 'early', 'years', 'of', 'the', 'u.s.', 'space', 'program.'], tags=[11])]

обучаем модель на текстах описаний фильмов (можно поизменять параметры)

In [26]:
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

In [27]:
model.save("D2V.model") # сохранение модели

In [28]:
# так можно посмотреть на векторы текстов, на которых училась модель
# индекс [] около documents -- это индекс текста из датасета

model.dv[documents[0].tags[0]]


array([ 0.04276705,  0.16065946, -0.01802454, -0.17609356,  0.12129277],
      dtype=float32)

Теперь нужно добавить векторы в датасет с остальными параметрами

In [29]:
# создадим список с векторами для каждого текста
vectors = []
for x in documents:
    vec = list(model.dv[x.tags][0])
    vectors.append(vec)

In [30]:
# так получим датафрейм, где все компоненты векторов в отдельных столбцах
split_df = pd.DataFrame(vectors,
                        columns=['v1', 'v2', 'v3','v4',"v5"])

split_df


Unnamed: 0,v1,v2,v3,v4,v5
0,0.042767,0.160659,-0.018025,-0.176094,0.121293
1,0.137881,0.051999,0.064691,-0.501718,0.097941
2,0.229582,0.405558,0.112366,-0.438240,-0.008260
3,-0.002442,0.389076,0.417112,-0.612643,-0.010235
4,0.108675,0.412626,0.116797,-0.393902,0.173733
...,...,...,...,...,...
995,0.288997,0.436289,0.156916,-0.369486,-0.076886
996,0.060347,0.475052,0.278315,-0.555186,0.181968
997,-0.060650,0.377157,0.080728,-0.071429,-0.108188
998,-0.022961,0.187676,0.175473,-0.141030,-0.019457


In [31]:
# теперь добавим его к основному датафрейму
result = data.join(split_df, how='left')
result.shape

(1000, 18)

In [32]:
result.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,text,v1,v2,v3,v4,v5
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"[a, group, of, intergalactic, criminals, are, ...",0.042767,0.160659,-0.018025,-0.176094,0.121293
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"[following, clues, to, the, origin, of, mankin...",0.137881,0.051999,0.064691,-0.501718,0.097941
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"[three, girls, are, kidnapped, by, a, man, wit...",0.229582,0.405558,0.112366,-0.43824,-0.00826
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"[in, a, city, of, humanoid, animals,, a, hustl...",-0.002442,0.389076,0.417112,-0.612643,-0.010235
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"[a, secret, government, agency, recruits, some...",0.108675,0.412626,0.116797,-0.393902,0.173733


In [33]:
# переопределим датасет, оставив только важное

data_sm = result[['Runtime (Minutes)',"Year",
                'Rating', 'Votes',
                'Revenue (Millions)','Metascore',"v1","v2","v3","v4","v5"]
              ]


data_sm.head(3)

Unnamed: 0,Runtime (Minutes),Year,Rating,Votes,Revenue (Millions),Metascore,v1,v2,v3,v4,v5
0,121,2014,8.1,757074,333.13,76.0,0.042767,0.160659,-0.018025,-0.176094,0.121293
1,124,2012,7.0,485820,126.46,65.0,0.137881,0.051999,0.064691,-0.501718,0.097941
2,117,2016,7.3,157606,138.12,62.0,0.229582,0.405558,0.112366,-0.43824,-0.00826


## Подготавливаем матрицы

In [34]:
# определяем X и y

X = data_sm.drop(["Rating"],axis=1).values 

display(X, X.shape)

array([[ 1.21000000e+02,  2.01400000e+03,  7.57074000e+05, ...,
        -1.80245433e-02, -1.76093563e-01,  1.21292770e-01],
       [ 1.24000000e+02,  2.01200000e+03,  4.85820000e+05, ...,
         6.46910220e-02, -5.01717925e-01,  9.79408622e-02],
       [ 1.17000000e+02,  2.01600000e+03,  1.57606000e+05, ...,
         1.12365678e-01, -4.38239902e-01, -8.26026779e-03],
       ...,
       [ 9.80000000e+01,  2.00800000e+03,  7.06990000e+04, ...,
         8.07275400e-02, -7.14287534e-02, -1.08188301e-01],
       [ 9.30000000e+01,  2.01400000e+03,  4.88100000e+03, ...,
         1.75472915e-01, -1.41030222e-01, -1.94573998e-02],
       [ 8.70000000e+01,  2.01600000e+03,  1.24350000e+04, ...,
         1.79863721e-01, -3.55260253e-01,  1.29649043e-01]])

(1000, 10)

In [35]:
data_sm.isna().any()

Runtime (Minutes)     False
Year                  False
Rating                False
Votes                 False
Revenue (Millions)    False
Metascore             False
v1                    False
v2                    False
v3                    False
v4                    False
v5                    False
dtype: bool

In [36]:
y = data_sm['Rating'].values # отдельно вынесли массив со значениями скорости ветра
y.shape

(1000,)

Иногда бывает полезно [нормализовать](https://en.wikipedia.org/wiki/Normalization_(statistics)) данные: это позволяет исправить ситуацию, когда признаки представлены в разных единацах измерения. 
Для этого используется StandardScaler. 

До нормализации:

In [37]:
list(X[0])

[121.0,
 2014.0,
 757074.0,
 333.13,
 76.0,
 0.04276705160737038,
 0.16065946221351624,
 -0.018024543300271034,
 -0.17609356343746185,
 0.12129276990890503]

In [38]:
# использзуем стандартизатор
sc = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(sc.fit_transform(X), y, random_state=42)

После:

In [39]:
list(sc.fit_transform(X)[0])

[0.4163497512303056,
 0.37979525138136244,
 3.1126899627963738,
 2.5961363010556906,
 1.0233613578368184,
 -1.1204194627427406,
 -1.1560519912944438,
 -1.7831672933678178,
 1.411335611342395,
 0.4689820475756174]

теперь с данными удобнее работать и обучать

In [40]:
# задаем модель регрессора
# силу регуляризации можно варьировать параметром alpha
regressor = Ridge() 


# обучаем
regressor.fit(X_train, y_train)

In [41]:
# давайте предскажем результат для тестовой выборки

y_preds = regressor.predict(X_test)

### оценка результатов алгоритма

В качестве метрики будем использовать [среднюю абсолютную ошибку](https://www.youtube.com/watch?v=ZejnwbcU8nw). Она показывает отклонение от правильного ответа в тех же единах измерения

*(а вообще есть [разные способы](https://towardsdatascience.com/what-are-the-best-metrics-to-evaluate-your-regression-model-418ca481755b))*

In [42]:
mean_absolute_error(y_test, y_preds) 

0.4996123820139456

Попробуйте разные значения для параметра регуляризации alpha при обучении модели. Как они влияют на величину ошибки?

In [48]:
from sklearn.model_selection import GridSearchCV
parameters = {'alpha':[0.01, 0.1, 1, 10, 100], 'solver':('auto', 'svd', 'cholesky', 'sparse_cg')}
ridge = Ridge()
clf = GridSearchCV(ridge, parameters)
clf.fit(X_train, y_train)
print("Ridge best estimator: ", clf.best_estimator_)
print("Ridge best score: ", clf.best_score_)

Ridge best estimator:  Ridge(alpha=10)
Ridge best score:  0.5066203509807463


In [49]:
ridge = Ridge(alpha=10, solver='svd').fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("rmse", mean_squared_error(y_test, y_pred, squared=False))

mae: 0.49932769811899047
mse 0.47561757679374567
rmse 0.6896503293653572


In [50]:
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("rmse", mean_squared_error(y_test, y_pred, squared=False))

mae: 0.4996454211450566
mse 0.4750695307144416
rmse 0.6892528786406638


In [51]:
parameters = {'alpha':[0.01, 0.1, 1, 10, 50, 100]}
lasso = Lasso()
clf = GridSearchCV(lasso, parameters)
clf.fit(X_train, y_train)
print("Lasso best estimator: ", clf.best_estimator_)
print("Lasso best score: ", clf.best_score_)

Lasso best estimator:  Lasso(alpha=0.01)
Lasso best score:  0.506922929800879


In [52]:
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("rmse", mean_squared_error(y_test, y_pred, squared=False))

mae: 0.4939302155641746
mse 0.4723431560300772
rmse 0.6872722575734286


# Text preprocessing

In [61]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('stopwords')
def clean_sentence(s):
    
    s = re.sub("[^A-Za-z]", " ", s)
    s = s.lower()
    s = word_tokenize(s)
    stemmer = PorterStemmer()
    s = [stemmer.stem(word) for word in s if word not in set(stopwords.words("english"))]
    return s


[nltk_data] Downloading package punkt to /Users/mzhelezin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mzhelezin/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [62]:
data

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,text
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.130000,76.0,"[a, group, of, intergalactic, criminals, are, ..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.460000,65.0,"[following, clues, to, the, origin, of, mankin..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.120000,62.0,"[three, girls, are, kidnapped, by, a, man, wit..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.320000,59.0,"[in, a, city, of, humanoid, animals,, a, hustl..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.020000,40.0,"[a, secret, government, agency, recruits, some..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,82.956376,45.0,"[a, tight-knit, team, of, rising, investigator..."
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.540000,46.0,"[three, american, college, students, studying,..."
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.010000,50.0,"[romantic, sparks, occur, between, two, dance,..."
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,82.956376,22.0,"[a, pair, of, friends, embark, on, a, mission,..."


In [63]:
data["text"] = data.Description.apply(clean_sentence) 


In [65]:
input_text = list(data.text.values)

documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(input_text)]

model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

model.save("D2V.model") # сохранение модели

# так можно посмотреть на векторы текстов, на которых училась модель
# индекс [] около documents -- это индекс текста из датасета
vectors = []
for x in documents:
    vec = list(model.dv[x.tags][0])
    vectors.append(vec)

split_df = pd.DataFrame(vectors,
                        columns=['v1', 'v2', 'v3','v4',"v5"])


result = data.join(split_df, how='left')

data_sm = result[['Runtime (Minutes)',"Year",
                'Rating', 'Votes',
                'Revenue (Millions)','Metascore',"v1","v2","v3","v4","v5"]
              ]




X = data_sm.drop(["Rating"],axis=1).values 


y = data_sm['Rating'].values # отдельно вынесли массив со значениями скорости ветра
sc = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(sc.fit_transform(X), y, random_state=42)

In [66]:
from sklearn.model_selection import GridSearchCV
parameters = {'alpha':[0.01, 0.1, 1, 10, 100], 'solver':('auto', 'svd', 'cholesky', 'sparse_cg')}
ridge = Ridge()
clf = GridSearchCV(ridge, parameters)
clf.fit(X_train, y_train)
print("Ridge best estimator: ", clf.best_estimator_)
print("Ridge best score: ", clf.best_score_)

Ridge best estimator:  Ridge(alpha=10, solver='sparse_cg')
Ridge best score:  0.5094305818057882


In [67]:
ridge = Ridge(alpha=10, solver='svd').fit(X_train, y_train)
y_pred = ridge.predict(X_test)
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("rmse", mean_squared_error(y_test, y_pred, squared=False))

mae: 0.49920634375790574
mse 0.4756989958755981
rmse 0.689709356088199


In [68]:
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_test)
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("rmse", mean_squared_error(y_test, y_pred, squared=False))

mae: 0.4993865973231894
mse 0.47503343916069596
rmse 0.6892266964944814


In [69]:
parameters = {'alpha':[0.01, 0.1, 1, 10, 50, 100]}
lasso = Lasso()
clf = GridSearchCV(lasso, parameters)
clf.fit(X_train, y_train)
print("Lasso best estimator: ", clf.best_estimator_)
print("Lasso best score: ", clf.best_score_)

Lasso best estimator:  Lasso(alpha=0.01)
Lasso best score:  0.5092259475116803


In [70]:
lasso = Lasso(alpha=0.01).fit(X_train, y_train)
y_pred = lasso.predict(X_test)
print("mae:", mean_absolute_error(y_test, y_pred))
print("mse", mean_squared_error(y_test, y_pred))
print("rmse", mean_squared_error(y_test, y_pred, squared=False))

mae: 0.4953952327980514
mse 0.47368504624752267
rmse 0.6882478087487984


Весь препроцессинг текста не помог( 