In [1]:
# импорты необходимых библиотек
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
# %matplotlib inline

import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from nltk.tokenize import word_tokenize, wordpunct_tokenize
import nltk
nltk.download('punkt')
from sklearn.model_selection import GridSearchCV
     

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
data = pd.read_csv('IMDB-Movie-Data.csv')
print(data.shape)

data.head(3)

(1000, 12)


Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


In [3]:
#Превращаем NaN в средние значения по колонке

# вычисляем средние для колонок с пустыми значениями
meta_mean = data.Metascore.mean()
rev_mean = data['Revenue (Millions)'].mean()

#заменяем пустоты на средние значения
data.Metascore.fillna(meta_mean, inplace=True)
data['Revenue (Millions)'].fillna(rev_mean, inplace=True)

# проверяем присутствие NaN
data.isna().any()

Rank                  False
Title                 False
Genre                 False
Description           False
Director              False
Actors                False
Year                  False
Runtime (Minutes)     False
Rating                False
Votes                 False
Revenue (Millions)    False
Metascore             False
dtype: bool

## Подготовка данных

Попробуем предсказывать рейтинг фильма по данным его описания, года, длины в минутах и кассовых сборов

Колонка "Rating" станет **целевой переменной, или таргетом** (y)<br>
Остальных данные будут **обучающей выборкой** (X)

In [4]:
data.Description

0      A group of intergalactic criminals are forced ...
1      Following clues to the origin of mankind, a te...
2      Three girls are kidnapped by a man with a diag...
3      In a city of humanoid animals, a hustling thea...
4      A secret government agency recruits some of th...
                             ...                        
995    A tight-knit team of rising investigators, alo...
996    Three American college students studying abroa...
997    Romantic sparks occur between two dance studen...
998    A pair of friends embark on a mission to reuni...
999    A stuffy businessman finds himself trapped ins...
Name: Description, Length: 1000, dtype: object

In [5]:
# подготовим описания фильмов
data["text"] = data.Description.apply(lambda x: word_tokenize(x)) 

data["text"]

0      [A, group, of, intergalactic, criminals, are, ...
1      [Following, clues, to, the, origin, of, mankin...
2      [Three, girls, are, kidnapped, by, a, man, wit...
3      [In, a, city, of, humanoid, animals, ,, a, hus...
4      [A, secret, government, agency, recruits, some...
                             ...                        
995    [A, tight-knit, team, of, rising, investigator...
996    [Three, American, college, students, studying,...
997    [Romantic, sparks, occur, between, two, dance,...
998    [A, pair, of, friends, embark, on, a, mission,...
999    [A, stuffy, businessman, finds, himself, trapp...
Name: text, Length: 1000, dtype: object

In [None]:
data.text.values

In [7]:
input_text = list(data.text.values)

In [8]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(input_text)]
documents[10:12]

[TaggedDocument(words=['The', 'adventures', 'of', 'writer', 'Newt', 'Scamander', 'in', 'New', 'York', "'s", 'secret', 'community', 'of', 'witches', 'and', 'wizards', 'seventy', 'years', 'before', 'Harry', 'Potter', 'reads', 'his', 'book', 'in', 'school', '.'], tags=[10]),
 TaggedDocument(words=['The', 'story', 'of', 'a', 'team', 'of', 'female', 'African-American', 'mathematicians', 'who', 'served', 'a', 'vital', 'role', 'in', 'NASA', 'during', 'the', 'early', 'years', 'of', 'the', 'U.S.', 'space', 'program', '.'], tags=[11])]

Обучаем модель на текстах описаний фильмов (можно поизменять параметры)

In [9]:
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)



In [10]:
model.save("D2V.model") # сохранение модели

In [11]:
# так можно посмотреть на векторы текстов, на которых училась модель
# индекс [] около documents -- это индекс текста из датасета

model[documents[0].tags[0]]


array([ 0.06857061,  0.0276906 , -0.0460314 , -0.00388521, -0.14361057],
      dtype=float32)

Теперь нужно добавить векторы в датасет с остальными параметрами

In [12]:
# создадим список с векторами для каждого текста
vectors = []
for x in documents:
    vec = list(model[x.tags][0])
    vectors.append(vec)

In [13]:
# так получим датафрейм, где все компоненты векторов в отдельных столбцах
split_df = pd.DataFrame(vectors,
                        columns=['v1', 'v2', 'v3','v4',"v5"])

split_df


Unnamed: 0,v1,v2,v3,v4,v5
0,0.068571,0.027691,-0.046031,-0.003885,-0.143611
1,-0.006890,-0.001286,-0.037979,-0.188008,-0.045070
2,0.167416,-0.056082,-0.037573,-0.042474,-0.176787
3,0.209401,-0.125601,-0.174734,-0.257231,-0.255892
4,0.116167,-0.066425,-0.085169,-0.273545,-0.054744
...,...,...,...,...,...
995,0.112158,-0.114053,0.003931,-0.068885,-0.087671
996,0.121931,-0.005672,-0.000910,-0.134893,-0.120307
997,0.092240,0.027331,0.036061,-0.072165,-0.019601
998,-0.030691,0.069846,-0.055414,-0.134857,0.024631


In [14]:
# теперь добавим его к основному датафрейму
result = data.join(split_df, how='left')
result.shape

(1000, 18)

In [15]:
result

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,text,v1,v2,v3,v4,v5
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.130000,76.0,"[A, group, of, intergalactic, criminals, are, ...",0.068571,0.027691,-0.046031,-0.003885,-0.143611
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.460000,65.0,"[Following, clues, to, the, origin, of, mankin...",-0.006890,-0.001286,-0.037979,-0.188008,-0.045070
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.120000,62.0,"[Three, girls, are, kidnapped, by, a, man, wit...",0.167416,-0.056082,-0.037573,-0.042474,-0.176787
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.320000,59.0,"[In, a, city, of, humanoid, animals, ,, a, hus...",0.209401,-0.125601,-0.174734,-0.257231,-0.255892
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.020000,40.0,"[A, secret, government, agency, recruits, some...",0.116167,-0.066425,-0.085169,-0.273545,-0.054744
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,Secret in Their Eyes,"Crime,Drama,Mystery","A tight-knit team of rising investigators, alo...",Billy Ray,"Chiwetel Ejiofor, Nicole Kidman, Julia Roberts...",2015,111,6.2,27585,82.956376,45.0,"[A, tight-knit, team, of, rising, investigator...",0.112158,-0.114053,0.003931,-0.068885,-0.087671
996,997,Hostel: Part II,Horror,Three American college students studying abroa...,Eli Roth,"Lauren German, Heather Matarazzo, Bijou Philli...",2007,94,5.5,73152,17.540000,46.0,"[Three, American, college, students, studying,...",0.121931,-0.005672,-0.000910,-0.134893,-0.120307
997,998,Step Up 2: The Streets,"Drama,Music,Romance",Romantic sparks occur between two dance studen...,Jon M. Chu,"Robert Hoffman, Briana Evigan, Cassie Ventura,...",2008,98,6.2,70699,58.010000,50.0,"[Romantic, sparks, occur, between, two, dance,...",0.092240,0.027331,0.036061,-0.072165,-0.019601
998,999,Search Party,"Adventure,Comedy",A pair of friends embark on a mission to reuni...,Scot Armstrong,"Adam Pally, T.J. Miller, Thomas Middleditch,Sh...",2014,93,5.6,4881,82.956376,22.0,"[A, pair, of, friends, embark, on, a, mission,...",-0.030691,0.069846,-0.055414,-0.134857,0.024631


In [16]:
# переопределим датасет, оставив только важное

data_sm = result[['Runtime (Minutes)',"Year",
                'Rating', 'Votes',
                'Revenue (Millions)','Metascore',"v1","v2","v3","v4","v5"]
              ]


data_sm.head(3)

Unnamed: 0,Runtime (Minutes),Year,Rating,Votes,Revenue (Millions),Metascore,v1,v2,v3,v4,v5
0,121,2014,8.1,757074,333.13,76.0,0.068571,0.027691,-0.046031,-0.003885,-0.143611
1,124,2012,7.0,485820,126.46,65.0,-0.00689,-0.001286,-0.037979,-0.188008,-0.04507
2,117,2016,7.3,157606,138.12,62.0,0.167416,-0.056082,-0.037573,-0.042474,-0.176787


## Подготавливаем матрицы

In [17]:
# определяем X и y

X = data_sm.drop(["Rating"],axis=1).values 

display(X, X.shape)

array([[ 1.21000000e+02,  2.01400000e+03,  7.57074000e+05, ...,
        -4.60313968e-02, -3.88521049e-03, -1.43610567e-01],
       [ 1.24000000e+02,  2.01200000e+03,  4.85820000e+05, ...,
        -3.79789770e-02, -1.88008219e-01, -4.50701714e-02],
       [ 1.17000000e+02,  2.01600000e+03,  1.57606000e+05, ...,
        -3.75726707e-02, -4.24740762e-02, -1.76786885e-01],
       ...,
       [ 9.80000000e+01,  2.00800000e+03,  7.06990000e+04, ...,
         3.60606015e-02, -7.21652433e-02, -1.96013190e-02],
       [ 9.30000000e+01,  2.01400000e+03,  4.88100000e+03, ...,
        -5.54143377e-02, -1.34856567e-01,  2.46305019e-02],
       [ 8.70000000e+01,  2.01600000e+03,  1.24350000e+04, ...,
         5.71736251e-04, -3.79508696e-02, -2.17426792e-02]])

(1000, 10)

In [18]:
data_sm.isna().any()

Runtime (Minutes)     False
Year                  False
Rating                False
Votes                 False
Revenue (Millions)    False
Metascore             False
v1                    False
v2                    False
v3                    False
v4                    False
v5                    False
dtype: bool

In [19]:
y = data_sm['Rating'].values # отдельно вынесли массив со значениями скорости ветра
y.shape

(1000,)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# **Модели линейной регрессии**

## **Linear**

In [21]:
regressor_linear = LinearRegression()
regressor_linear.fit(X_train, y_train)

y_preds_linear = regressor_linear.predict(X_test)

In [22]:
print(mean_absolute_error(y_test, y_preds_linear))
print(mean_squared_error(y_test, y_preds_linear))
print(mean_squared_error(y_test, y_preds_linear, squared=False))

0.4858798599509143
0.45343026500915634
0.6733723078722174


## **Ridge**

In [23]:
# задаем модель регрессора
# силу регуляризации можно варьировать параметром alpha
regressor_ridge = Ridge() 
regressor_ridge.fit(X_train, y_train)

y_preds_ridge = regressor_ridge.predict(X_test)

In [24]:
print(mean_absolute_error(y_test, y_preds_ridge)) 
print(mean_squared_error(y_test, y_preds_ridge))
print(mean_squared_error(y_test, y_preds_ridge, squared=False))

0.48479647607115917
0.45053468946627545
0.671218808933626


In [25]:
#Изменяем гиперпараметры
grid = {
    'alpha' : [0.001, 0.01, 0.1, 1, 3, 5, 7, 10, 20, 25, 27, 30, 33, 37, 40, 50, 100, 1000]
}

grid = GridSearchCV(
    Ridge(), grid
).fit(X_train, y_train)
print("Best params:", grid.best_params_)

y_preds_ridge = grid.predict(X_test)

Best params: {'alpha': 1}


In [26]:
print(mean_absolute_error(y_test, y_preds_ridge)) 
print(mean_squared_error(y_test, y_preds_ridge))
print(mean_squared_error(y_test, y_preds_ridge, squared=False))

0.48479647607115917
0.45053468946627545
0.671218808933626


## **Lasso**

In [27]:
regressor_lasso = Lasso()
regressor_lasso.fit(X_train, y_train)

y_preds_lasso = regressor_lasso.predict(X_test)

In [28]:
print(mean_absolute_error(y_test, y_preds_lasso)) 
print(mean_squared_error(y_test, y_preds_lasso))
print(mean_squared_error(y_test, y_preds_lasso, squared=False))

0.48710026839643117
0.4558790040030374
0.6751881248978223


In [29]:
#Изменяем гиперпараметры
grid = {
    'alpha' : [0.001, 0.009, 0.01, 0.013, 0.015, 0.017, 0.02, 0.05, 0.07, 0.1, 1, 10, 100, 1000]
}

grid = GridSearchCV(
    Lasso(), grid
).fit(X_train, y_train)
print("Best params:", grid.best_params_)

y_preds_lasso = grid.predict(X_test)

Best params: {'alpha': 0.001}


In [30]:
print(mean_absolute_error(y_test, y_preds_lasso)) 
print(mean_squared_error(y_test, y_preds_lasso))
print(mean_squared_error(y_test, y_preds_lasso, squared=False))

0.48475810404166936
0.4500357423040647
0.6708470334614774


До нормализации:

In [31]:
list(X[0])

[121.0,
 2014.0,
 757074.0,
 333.13,
 76.0,
 0.06857060641050339,
 0.027690596878528595,
 -0.046031396836042404,
 -0.0038852104917168617,
 -0.14361056685447693]

In [32]:
# использзуем стандартизатор
sc = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(sc.fit_transform(X), y, random_state=42)

После:

In [33]:
list(sc.fit_transform(X)[0])

[0.4163497512303056,
 0.37979525138136244,
 3.1126899627963738,
 2.5961363010556906,
 1.0233613578368184,
 -0.8079010705813297,
 0.9261981897369452,
 0.44304708897284145,
 1.769170023309538,
 -0.04031433689921781]

Linear Regression

In [34]:
regressor_linear = LinearRegression()
regressor_linear.fit(X_train, y_train)

y_preds_linear = regressor_linear.predict(X_test)

print(mean_absolute_error(y_test, y_preds_linear))
print(mean_squared_error(y_test, y_preds_linear))
print(mean_squared_error(y_test, y_preds_linear, squared=False))

0.48814671863781073
0.47106495921584574
0.686341721896495


Ridge

In [35]:
regressor_ridge = Ridge() 
regressor_ridge.fit(X_train, y_train)

y_preds_ridge = regressor_ridge.predict(X_test)

print(mean_absolute_error(y_test, y_preds_ridge)) 
print(mean_squared_error(y_test, y_preds_ridge))
print(mean_squared_error(y_test, y_preds_ridge, squared=False))

0.4881444104870322
0.4711160312655101
0.686378926880415


In [36]:
grid = {
    'alpha' : [0.001, 0.01, 0.1, 1, 3, 5, 7, 10, 20, 25, 27, 30, 33, 37, 40, 50, 100, 1000]
}

grid = GridSearchCV(
    Ridge(), grid
).fit(X_train, y_train)
print("Best params:", grid.best_params_)

y_preds_ridge = grid.predict(X_test)

print(mean_absolute_error(y_test, y_preds_ridge)) 
print(mean_squared_error(y_test, y_preds_ridge))
print(mean_squared_error(y_test, y_preds_ridge, squared=False))

Best params: {'alpha': 20}
0.4882202196004303
0.4723111507045687
0.6872489728654155


Lasso

In [37]:
regressor_lasso = Lasso()
regressor_lasso.fit(X_train, y_train)

y_preds_lasso = regressor_lasso.predict(X_test)

print(mean_absolute_error(y_test, y_preds_lasso)) 
print(mean_squared_error(y_test, y_preds_lasso))
print(mean_squared_error(y_test, y_preds_lasso, squared=False))

0.7736000000000001
0.9734313244444442
0.9866262334057636


In [38]:
grid = {
    'alpha' : [0.001, 0.009, 0.01, 0.013, 0.015, 0.017, 0.02, 0.05, 0.07, 0.1, 1, 10, 100, 1000]
}

grid = GridSearchCV(
    Lasso(), grid
).fit(X_train, y_train)
print("Best params:", grid.best_params_)

y_preds_lasso = grid.predict(X_test)

print(mean_absolute_error(y_test, y_preds_lasso)) 
print(mean_squared_error(y_test, y_preds_lasso))
print(mean_squared_error(y_test, y_preds_lasso, squared=False))

Best params: {'alpha': 0.009}
0.48627235050649836
0.46948357826614456
0.6851887172641888


**Изменение гиперпараметра (параметра регуляризации alpha) никак не повлиял на метрики (MSE/RMSE/MAE) предсказательной силы моделей. Также как и токинизация и нормализация. Только в случае Lasso метрики были выше с нормализацией, но после подбора гиперпараметра снизились.**

Lection metrics(Ridge):

MAE - 0.48
MSE - 0.46
RMSE - 0.68

My metrics(Токенизация, без нормализации):

Linear regression
MAE - 0.48
MSE - 0.45
RMSE - 0.67 

Ridge
MAE - 0.48
MSE - 0.45
RMSE - 0.67

(Change alpha)
MAE - 0.48
MSE - 0.44
RMSE - 0.67 

Lasso
MAE - 0.49
MSE - 0.46
RMSE - 0.68

(Change alpha)
MAE - 0.48
MSE - 0.44
RMSE - 0.66 

My metrics(Токенизация, c нормализацией):

Linear regression
MAE - 0.49
MSE - 0.47
RMSE - 0.68

Ridge
MAE - 0.49
MSE - 0.47
RMSE - 0.68

(Change alpha)
MAE - 0.49
MSE - 0.47
RMSE - 0.68 

Lasso
MAE - 0.77
MSE - 0.97
RMSE - 0.99

(Change alpha)
MAE - 0.49
MSE - 0.47
RMSE - 0.68 