In [None]:
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from wordcloud import WordCloud

class AnimeDataProcessor:
    """Load an anime CSV, engineer features, plot them and fit a rating model.

    Typical use is ``AnimeDataProcessor(path)``, optionally
    ``visualize_data()``, then ``process_data()`` followed by
    ``train_model()``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, file_path):
        """Read the CSV and run cleaning and feature engineering.

        Args:
            file_path: Passed straight to ``pandas.read_csv``.
        """
        # Working table; most methods mutate it in place.
        self.df = pd.read_csv(file_path)
        # One-hot indicator frames, filled in by process_data().
        self.tags_df = pd.DataFrame()
        self.warning_df = pd.DataFrame()
        self._clean_data()
        self._feature_engineering()

    def _clean_data(self):
        """Strip surrounding whitespace from string values of ``Type``."""
        self.df['Type'] = self.df['Type'].apply(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    def _feature_engineering(self):
        """Add item-count columns, split list-like text columns, drop raw ones.

        Non-string cells (NaN in particular) count as zero items and stay
        NaN in the split ``Tags`` / ``Content_Warning`` columns.
        """
        def count_items(value, sep):
            return len(value.split(sep)) if isinstance(value, str) else 0

        def split_items(value, sep):
            return value.split(sep) if isinstance(value, str) else np.nan

        df = self.df
        # (new column, source column, separator); warnings are split on ',,'.
        count_specs = [
            ('warnings_count', 'Content_Warning', ',,'),
            ('rel_anim_count', 'Related_anime', ','),
            ('rel_mang_count', 'Related_Mange', ','),
            ('voice_act_count', 'Voice_actors', ','),
            ('staff_count', 'staff', ','),
            ('tags_count', 'Tags', ','),
        ]
        for target, source, sep in count_specs:
            df[target] = df[source].apply(lambda value, sep=sep: count_items(value, sep))
        df['rel_media_count'] = df['rel_anim_count'] + df['rel_mang_count']

        df['Content_Warning'] = df['Content_Warning'].apply(lambda value: split_items(value, ',,'))
        df['Tags'] = df['Tags'].apply(lambda value: split_items(value, ','))

        df.drop(['Related_Mange', 'Related_anime', 'Voice_actors', 'staff', 'End_year'],
                axis=1, inplace=True)

    @staticmethod
    def _join_texts(texts):
        """Join the string entries of *texts* with spaces, skipping non-strings."""
        # A space separator keeps the last word of one text from fusing
        # with the first word of the next.
        return ' '.join(text for text in texts if isinstance(text, str))

    def _one_hot_lists(self, column, allowed=None, prefix=''):
        """Return 0/1 indicator columns for a column whose cells are lists.

        Args:
            column: Name of a ``self.df`` column holding lists (or NaN).
            allowed: Optional collection of items to keep; ``None`` keeps all.
            prefix: String prepended to every generated column name.

        Returns:
            An integer DataFrame indexed like ``self.df``; rows whose cell is
            not a list are all zeros.
        """
        records = [
            {prefix + item: 1 for item in items if allowed is None or item in allowed}
            if isinstance(items, list) else {}
            for items in self.df[column]
        ]
        encoded = pd.DataFrame(records, index=self.df.index)
        return encoded.fillna(0).astype(int)

    def visualize_data(self):
        """Show exploratory plots: counts, distributions, studios, correlations, word clouds.

        Leaves ``self.df`` untouched, so it can be called more than once.
        """
        for col in ('Release_season', 'Type'):
            plt.figure(figsize=(15, 15))
            # Bar length is the number of rows in each category.
            sns.countplot(y=self.df[col])
            plt.title('Распределение {}'.format(col), size=20)
            plt.show()

        # displot is figure-level and creates its own figure, so the size is
        # set through height/aspect instead of a preceding plt.figure().
        sns.displot(self.df.Release_year, height=7, aspect=15 / 7)
        plt.title('Распределение года выпуска', size=18)
        plt.show()

        sns.displot(self.df.Rating, height=7, aspect=15 / 7, kde=True)
        plt.title("Распределение рейтинга", size=18)
        plt.show()

        studio_counts = self.df.Studio.value_counts()
        top_studios = studio_counts[:15]
        plt.figure(figsize=(15, 7))
        sns.barplot(y=top_studios.index, x=top_studios, ci=None)
        plt.title('Количество произведенных студиями аниме', size=20)
        plt.show()

        # Only studios with more than 20 titles compete for "best by rating".
        min_titles = 20
        established = studio_counts[studio_counts > min_titles].index
        top_studios_rtg = (
            self.df[self.df.Studio.isin(established)]
            .groupby('Studio').Rating.mean()
            .sort_values(ascending=False)[:15]
        )
        plt.figure(figsize=(15, 7))
        sns.barplot(y=top_studios_rtg.index, x=top_studios_rtg.values).set_title(
            'Лучшие студии по рейтингу с более чем 20 аниме', size=20)
        plt.show()

        plt.figure(figsize=(15, 7))
        sns.boxplot(x=self.df.Type, y=self.df.Rating, palette='mako')
        plt.title('Влияние типа аниме на рейтинг', size=20)
        plt.show()

        plt.figure(figsize=(15, 7))
        data_for_box = self.df[self.df.Type.isin(['TV', 'Web'])]
        sns.boxplot(x='Type', y='Rating', hue='Release_season', data=data_for_box, palette='mako')
        plt.title("Влияние сезона выпуска на рейтинг TV-шоу и веб-сериалов", size=20)
        plt.show()

        num_cols = ['Rating', 'Episodes', 'Release_year', 'warnings_count', 'rel_anim_count', 'rel_mang_count',
                    'voice_act_count', 'staff_count', 'tags_count', 'rel_media_count']
        corr = self.df[num_cols].corr()
        plt.figure(figsize=(15, 15))
        sns.heatmap(data=corr, annot=True, fmt='0.3f', cmap='crest')
        plt.show()

        # Single-row strip: correlation of every numeric column with Rating.
        plt.figure(figsize=(15, 1))
        sns.heatmap(corr.loc[['Rating']], annot=True, fmt='0.3f', cmap='crest')
        plt.xticks(rotation=35)
        plt.show()

        wordcloud = WordCloud(
            background_color='white',
            collocations=False,
            colormap='winter_r'
        )
        wordcloud.generate(self._join_texts(self.df.Description))

        plt.figure(figsize=(15, 7))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.show()

        web_text = self._join_texts(self.df[self.df.Type == 'Web'].Description)
        TV_text = self._join_texts(self.df[self.df.Type == 'TV'].Description)

        fig, axes = plt.subplots(1, 2, figsize=(20, 10))
        for ax, text, name in zip(axes, [web_text, TV_text], ['Web', 'TV']):
            wc = WordCloud(background_color='white').generate(text)
            ax.imshow(wc)
            ax.axis('off')
            ax.set_title(name, size=30)
        plt.show()

    def process_data(self, first_n_tags=200):
        """One-hot encode tags and content warnings, then drop unrated rows.

        The indicator columns are appended to ``self.df`` and also kept on
        ``self.tags_df`` / ``self.warning_df`` so that ``train_model`` can
        pick their names up as features.

        Args:
            first_n_tags: Only this many most frequent tags get a column.
        """
        from collections import Counter

        tag_counts = Counter(
            tag for tags in self.df['Tags'] if isinstance(tags, list) for tag in tags
        )
        print('Количество уникальных тегов: {}'.format(len(tag_counts)))
        viable_tags = {tag for tag, _ in tag_counts.most_common(first_n_tags)}

        tags_df = self._one_hot_lists('Tags', allowed=viable_tags)
        # Every distinct warning gets a column, prefixed to keep it apart from tags.
        warning_df = self._one_hot_lists('Content_Warning', prefix='cw_')

        self.df = pd.concat([self.df, tags_df, warning_df], axis=1)
        self.df.dropna(subset=['Rating'], inplace=True)

        # Keep the stored indicator frames aligned with the surviving rows.
        self.tags_df = tags_df.loc[self.df.index]
        self.warning_df = warning_df.loc[self.df.index]

    def train_model(self):
        """Fit an XGBoost regressor on an 80/20 split and report validation metrics.

        Uses the base features plus whatever indicator columns
        ``process_data`` stored on ``self.tags_df`` / ``self.warning_df``.
        Prints MAE, up to 20 sample predictions, RMSE and R2, and shows a
        predicted-vs-actual scatter plot.
        """
        encoded_cols = list(self.tags_df.columns) + list(self.warning_df.columns)
        categorical_cols = ['Type', 'Release_season', 'Studio']
        numerical_cols = ['Episodes', 'Release_year', 'warnings_count', 'rel_media_count', 'staff_count',
                          'tags_count', 'rel_anim_count',
                          'rel_mang_count', 'voice_act_count'] + encoded_cols

        # The target must be present even if process_data() was skipped.
        rated = self.df.dropna(subset=['Rating'])
        X = rated[categorical_cols + numerical_cols]
        y = rated['Rating']

        X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=1)

        numerical_transformer = SimpleImputer(strategy='most_frequent')

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformer(transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

        model = XGBRegressor(
            learning_rate=0.05,
            n_estimators=300,
            max_depth=6,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            objective='reg:squarederror',
            n_jobs=-1,
            random_state=1
        )

        my_pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        my_pipeline.fit(X_train, y_train)
        preds = my_pipeline.predict(X_valid)
        mae = mean_absolute_error(y_valid, preds)
        print('Средняя абсолютная ошибка: {}'.format(mae))

        # Slicing copes with validation sets smaller than 20 rows.
        for pred, actual in zip(preds[:20], y_valid.iloc[:20]):
            print('Предсказано: {:.2f};\n     Реальное: {};\n'.format(pred, actual))

        plt.figure(figsize=(10, 7))
        plt.scatter(y_valid, preds)
        # Diagonal where prediction equals the actual value.
        bounds = [y_valid.min(), y_valid.max()]
        plt.plot(bounds, bounds, linestyle='--', color='red')
        plt.xlabel('Фактические', weight="bold")
        plt.ylabel('Прогнозируемые', weight="bold")
        plt.grid(color='black', linestyle='--', linewidth=0.2)
        plt.title('Фактические значения vs прогнозируемые значения')
        plt.show()

        mse = mean_squared_error(y_valid, preds)
        print("RMSE: {:.4f}".format(np.sqrt(mse)))
        print("R2: {:.2f} %".format(r2_score(y_valid, preds) * 100))

    def isNaN(self, num):
        """Return True when *num* is NaN (the only value that is unequal to itself)."""
        return num != num



In [None]:
# Published Google Sheets document, requested in CSV form (output=csv).
url = "https://docs.google.com/spreadsheets/d/e/2PACX-1vTYZ6SWtz_1sqMZm-ClUdWHgRON7x1atN5DXdFj1xFuBKdSzls7ZzgRxWL554ohQD4fXGjQg_cCrL-T/pub?output=csv"

In [None]:
# Reads the CSV from `url`; cleaning and feature engineering run inside __init__.
anime_processor = AnimeDataProcessor(url)

In [None]:
# Render the exploratory plots and word clouds.
anime_processor.visualize_data()

In [None]:
# One-hot encode tags/content warnings and drop rows without a rating,
# then fit the regressor and print/plot its validation results.
anime_processor.process_data()
anime_processor.train_model()