In [1]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import re


In [2]:
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk


In [3]:
np.random.seed = 0
nltk.download('punkt')
InteractiveShell.ast_node_interactivity = "all"
nltk.download('stopwords')
np.set_printoptions(precision= 3)
pd.set_option("display.max_columns", None)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fourz\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fourz\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
data = "C:/Users/fourz/Downloads/TV_Series.csv"
df = pd.read_csv(data)
df.sample(5)

Unnamed: 0,Series Title,Release Year,Runtime,Genre,Rating,Cast,Synopsis
4728,Rise,(2018),43 min,Drama,6.8,"Josh Radnor, Marley Shelton, Auli'i Cravalho, ...",A working class high school drama department a...
33533,The Glory,(2022– ),50 min,Drama,8.1,"Song Hye-Kyo, Lee Do-Hyun, Ji-Yeon Lim, Aria Song",A woman lives for absolute revenge against her...
47361,The Walking Dead,(2010–2022),44 min,"Drama, Horror, Thriller",8.1,"Andrew Lincoln, Norman Reedus, Melissa McBride...",Sheriff Deputy Rick Grimes wakes up from a com...
6739,Sigmund and the Sea Monsters,(1973–1975),30 min,"Comedy, Family",6.9,"Johnny Whitaker, Scott C. Kolden, Billy Barty,...",Two boys try to maintain normal lives while se...
3988,Where in the World Is Carmen Sandiego?,(1991–1996),30 min,"Comedy, Family, Game-Show",7.6,"Greg Lee, Lynne Thigpen, Rockapella, Sean Altman",Contestants pursue the international thief as ...


## Пробработка данных

In [12]:
class Preprocessor:
    
    """
    Класс для предварительной обработки набора данных о телесериалах.

    Аргументы:
        df (pandas.DataFrame): Набор данных, которые требуется предварительно обработать.

    Атрибуты:
        pattern_remove (str): Регулярное выражение.
        df (pandas.DataFrame): Предварительно обработанный набор данных.
        pattern_text (str): Регулярное выражение смайлов.
        stop_words (set): Множество стоп-слов.

    Методы:
        clean_df(self): Очищает набор данных.
        normalization(self): Нормализует набор данных.
        one_hot_df(self): Преобразует столбец с жанрами в формат ONE_hot_encoding.
    """
    
    def __init__(self, df: pd.DataFrame) -> None:
        self.pattern_remove = r"\((\d+)\)"
        self.df = df
        self.pattern_text = '(?::|;|=)(?:-)?(?:\)|\(|D|P)'
        self.stop_words = set(stopwords.words('english'))
    
    def clean_df(self, df: pd.DataFrame) -> pd.DataFrame:   
        """
        Очищает набор данных.

        Аргументы:
            df (pandas.DataFrame): Набор данных, который требуется очистить.

        Возвращает:
            pandas.DataFrame: Очищенный набор данных.
        """
        df.rename(columns=lambda col_: col_.lower().replace(' ', '_'), inplace=True)
        df["rating"] = pd.to_numeric(df["rating"].replace("****", np.nan))
    
        df["runtime"] = df["runtime"].replace("****", '').str.extract(r"(\d+) min")
    
        df["end_year"] = df["release_year"].str.extract(r"-?(\d+)\)")
        
        df["release_year"] = df["release_year"].str.extract(r"\((\d+)–?")
    
        df["synopsis"] = df["synopsis"].astype('str')
              
        return df
    
    
    def normalization(self, df: pd.DataFrame) -> pd.DataFrame:
        
        """
        Нормализует набор данных.

        Аргументы:
            df (pandas.DataFrame): Набор данных, который требуется нормализовать.

        Возвращает:
            pandas.DataFrame: Нормализованный набор данных.
        """
        
        df["synopsis1"] = df["synopsis"].map(lambda x: re.sub('<[^>]*>', '', x))
        df["synopsis1"] = df["synopsis1"].map(lambda x: re.sub(r"[^\w\s_]", '', x.lower()))
        df["synopsis1"] = df["synopsis"].map(lambda x: ' '.join([word for word in x.split() if word not in self.stop_words]))
        df["synopsis1"] = df["synopsis1"].map(lambda x: re.sub(self.pattern_remove, '', x))
    
        return df
    
    def one_hot_df(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Преобразует столбец с жанрами в формат с одним горячим кодированием.

        Аргументы:
            df (pandas.DataFrame): Набор данных, который требуется преобразовать.

        Возвращает:
            pandas.DataFrame: Набор данных с преобразованным столбцом жанров в формат с одним горячим кодированием.
        """  
        one_hot = pd.get_dummies(df['genre'].str.split(', ', expand=True).stack()).groupby(level=0).sum()
        one_hot = one_hot.add_prefix('genre_')
    
        final_df = df.join(one_hot)
    
        final_df.drop(columns=['genre_****'], inplace=True)
        final_df["synopsis_tokens"] = final_df["synopsis"].apply(word_tokenize)
    
        return  final_df
    
preprocessor = Preprocessor(df)


final_df = (
        preprocessor.df\
        .pipe(preprocessor.clean_df)\
        .pipe(preprocessor.normalization)
)


# final_df.to_csv("C:/Users/fourz/OneDrive/Рабочий стол/Word2Vec/filter_data.csv")
final_df.sample(5)

Unnamed: 0,series_title,release_year,runtime,genre,rating,cast,synopsis,end_year,synopsis1
9131,Deathstroke: Knights & Dragons,2020,92.0,"Animation, Action, Adventure",6.6,"Michael Chiklis, Sasha Alexander, Asher Bishop...","Ten years ago, Slade Wilson-aka the super-assa...",2020.0,"Ten years ago, Slade Wilson-aka super-assassin..."
44082,National Treasure: Edge of History,2022,50.0,"Action, Adventure, Mystery",5.0,"Lisette Olivera, Zuri Reed, Catherine Zeta-Jon...","Jess Valenzuela, a 20-year-old Dreamer, sets o...",,"Jess Valenzuela, 20-year-old Dreamer, sets exp..."
8689,Secret Agent Man,2000,86.0,"Action, Thriller",6.0,"Costas Mandylor, Dina Meyer, Dondré T. Whitfie...","Suave spy Monk and his gorgeous partner, Holid...",2000.0,"Suave spy Monk gorgeous partner, Holiday, work..."
2819,Yahari ore no seishun rabukome wa machigatteiru.,2013,24.0,"Animation, Comedy, Drama",8.0,"Cat Thomas, Melissa Molano, Takuya Eguchi, Sao...",About an antisocial high school student named ...,2020.0,About antisocial high school student named Hik...
35158,Willow,2022,,"Action, Adventure, Drama",5.3,"Ruby Cruz, Ellie Bamber, Erin Kellyman, Tony R...",20 years after vanquishing the wicked queen Ba...,,"20 years vanquishing wicked queen Bavmorda, so..."


In [13]:
# year_counts = final_df['release_year'].value_counts().sort_index(ascending=True)

# plt.figure(figsize=(15, 17))
# sns.barplot(y=year_counts.index, x=year_counts.values, orient='h')


# plt.xticks(rotation=0)
# plt.yticks(rotation=30)
# plt.ylabel('Год выпуска')
# plt.xlabel('Количество сериалов')
# plt.title('Количество сериалов по годам выпуска')

# plt.show();
