<a href="https://colab.research.google.com/github/Andrey22154/movie_recommendation_bot/blob/main/bot_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [None]:
# Load the first input table from the Colab workspace directory (`/content`).
# A later cell renames its `titleId` column to `tconst` so it can be joined.
data = pd.read_parquet('/content/data4_filtered.parquet')

In [None]:
# Load the second input table; it is joined to the others on `tconst`.
data1 = pd.read_parquet('/content/data5_filtered.parquet')

In [None]:
# Load the third input table; it is joined to the others on `tconst`.
data3 = pd.read_parquet('/content/data.parquet')

In [None]:
# Give the key column the same name (`tconst`) that the merge cell joins on.
# Rebinding instead of mutating in place keeps the cell free of hidden side effects.
data = data.rename(columns={'titleId': 'tconst'})

In [None]:
# Join the three tables on the shared `tconst` key. `merge` defaults to an
# inner join, so only ids present in all three tables are kept.
merged_data = pd.merge(
    pd.merge(data, data1, on='tconst'),
    data3,
    on='tconst',
)

In [None]:
# Persist the merged table alongside the input files. The absolute `/content`
# path is the same one the notebook reads this file back from, so the
# save/reload round trip no longer depends on the kernel's working directory.
merged_data.to_parquet('/content/merged_data.parquet')

In [None]:
# Print the column dtypes, non-null counts and memory footprint of the merged table.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81362 entries, 0 to 81361
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         81362 non-null  object 
 1   title          81362 non-null  object 
 2   region         81362 non-null  object 
 3   isAdult        81362 non-null  float64
 4   startYear      81361 non-null  float64
 5   genres         81362 non-null  object 
 6   averageRating  81362 non-null  float64
 7   numVotes       81362 non-null  int64  
dtypes: float64(3), int64(1), object(4)
memory usage: 5.6+ MB


In [None]:
# Reload the merged table from the Colab workspace directory.
merged_data = pd.read_parquet('/content/merged_data.parquet')

In [None]:
# Display the reloaded table (the notebook renders a truncated head/tail view).
merged_data

Unnamed: 0,tconst,title,region,isAdult,startYear,genres,averageRating,numVotes
0,tt0000001,Карменсита,RU,0.0,1894,"Documentary,Short",5.7,2007
1,tt0000002,Клоун и его собаки,RU,0.0,1892,"Animation,Short",5.8,269
2,tt0000003,Бедный Пьеро,RU,0.0,1892,"Animation,Comedy,Romance",6.5,1912
3,tt0000004,Полная кружка пива,RU,0.0,1892,"Animation,Short",5.5,178
4,tt0000005,Сцена в кузне,RU,0.0,1893,"Comedy,Short",6.2,2692
...,...,...,...,...,...,...,...,...
81357,tt9909462,Nebyvalyi pohod,RU,0.0,1931,Documentary,6.7,19
81358,tt9910728,Жестокий Стамбул,RU,0.0,2019,Drama,5.8,1147
81359,tt9913936,Рай Диего,RU,0.0,2019,"Crime,Drama",7.4,62
81360,tt9914458,Пришелец Мессия,RU,0.0,2019,"Documentary,Sci-Fi",3.1,54


In [None]:
# Coerce `startYear` to a numeric dtype; values that cannot be parsed become NaN.
merged_data = merged_data.assign(
    startYear=pd.to_numeric(merged_data['startYear'], errors='coerce')
)

In [None]:
# Keep only rows whose `startYear` is 1990 or later. NaN years compare as
# False, so they are filtered out here as well.
merged_data = merged_data.loc[merged_data['startYear'].ge(1990)]

In [None]:
# Discard every row that still has a missing value in any column
# (the defaults of `dropna`, spelled out for the reader).
merged_data = merged_data.dropna(axis=0, how='any')

In [None]:
# Min-max normalisation: rescale `startYear` and `numVotes` to the [0, 1]
# range observed in the data. The fitted `scaler` stays available for later cells.
columns_to_scale = ['startYear', 'numVotes']
scaler = MinMaxScaler().fit(merged_data[columns_to_scale])
merged_data[columns_to_scale] = scaler.transform(merged_data[columns_to_scale])

In [None]:
# Standardisation: centre `averageRating` on zero mean and scale it to unit variance.
# `fit_transform` returns an (n, 1) array, so it is flattened to match the single column.
standardizer = StandardScaler()
merged_data['averageRating'] = standardizer.fit_transform(
    merged_data[['averageRating']]
).ravel()

In [None]:
# Remove the `region` and `tconst` columns from the working table.
merged_data = merged_data.drop(columns=['region', 'tconst'])

In [None]:
# Display the table after filtering, scaling and dropping `region`/`tconst`.
merged_data

Unnamed: 0,title,isAdult,startYear,genres,averageRating,numVotes
734,История гражданской войны,0.0,0.939394,Documentary,0.430935,0.000019
1166,Кейт и Лео,0.0,0.333333,"Comedy,Fantasy,Romance",0.138762,0.031204
1314,Перекресток Ларедо,0.0,0.151515,"Short,Western",-2.052532,0.000066
2289,Танго вдовца и его кривое зеркало,0.0,0.909091,Drama,0.138762,0.000063
2618,Другая сторона ветра,0.0,0.848485,Drama,0.357892,0.002792
...,...,...,...,...,...,...
81356,Let me kiss you... sir. Father of the bride,0.0,0.727273,Comedy,-1.102971,0.000002
81358,Жестокий Стамбул,0.0,0.878788,Drama,-0.299497,0.000405
81359,Рай Диего,0.0,0.878788,"Crime,Drama",0.869193,0.000020
81360,Пришелец Мессия,0.0,0.878788,"Documentary,Sci-Fi",-2.271661,0.000017


In [None]:
# Collect every distinct genre label. The `genres` column stores labels
# separated by a bare comma (e.g. "Documentary,Short"), so split on ',' --
# splitting on ', ' never matches and leaves whole combinations intact.
# Stray whitespace around a label is stripped defensively.
genres_list = merged_data['genres'].str.split(',').explode().str.strip()
unique_genres = genres_list.unique()

In [None]:
# Build the TF-IDF vectoriser for the genre strings.
# NOTE(review): the default tokeniser splits hyphenated genres such as
# "Sci-Fi" into separate tokens ("sci", "fi") -- confirm this is intended.
tfidf = TfidfVectorizer()

# Fit on the `genres` column and get one sparse row of weights per title.
tfidf_genres = tfidf.fit_transform(merged_data['genres'])

# Convert to a DataFrame that reuses `merged_data`'s index. After the earlier
# filtering the index is no longer 0..n-1, and a default RangeIndex would make
# `pd.concat(axis=1)` pair genre weights with the wrong titles and pad the
# non-matching labels with NaN.
tfidf_genres_df = pd.DataFrame(
    tfidf_genres.toarray(),
    columns=tfidf.get_feature_names_out(),
    index=merged_data.index,
)

# Replace the raw `genres` column with the TF-IDF feature columns.
merged_data = pd.concat([merged_data.drop(columns=['genres']), tfidf_genres_df], axis=1)

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): if the preceding column-wise concat ever joins frames whose
# indexes differ, this step silently removes the resulting NaN rows -- verify
# the row count before and after.
merged_data = merged_data.dropna(axis=0, how='any')

In [None]:
# Display the final feature table (title, numeric features and TF-IDF genre columns).
merged_data

Unnamed: 0,title,isAdult,startYear,averageRating,numVotes,action,adult,adventure,animation,biography,...,romance,sci,short,show,sport,talk,thriller,tv,war,western
734,История гражданской войны,0.0,0.939394,0.430935,0.000019,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.590337,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1166,Кейт и Лео,0.0,0.333333,0.138762,0.031204,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1314,Перекресток Ларедо,0.0,0.151515,-2.052532,0.000066,0.000000,0.0,0.0,0.0,0.0,...,0.51922,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2289,Танго вдовца и его кривое зеркало,0.0,0.909091,0.138762,0.000063,0.527203,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2618,Другая сторона ветра,0.0,0.848485,0.357892,0.002792,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72149,Любовь с первого взгляда,0.0,0.818182,-0.153410,0.000678,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72150,Потерявшийся в Лондоне,0.0,0.818182,-0.080367,0.001618,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72151,Виво,0.0,0.939394,0.357892,0.007468,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72152,Гедонист,0.0,0.818182,0.138762,0.000712,0.000000,0.0,0.0,0.0,0.0,...,0.00000,0.606153,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
