In [1]:
import os

In [2]:
%pwd

'/Users/anjalijha/Python/Project/YouTubeChannel-Analyzer/research'

In [3]:
# Step up one directory so relative paths (e.g. config/config.yaml) resolve from
# the parent of the notebook's folder.
# NOTE: not idempotent - re-running this cell climbs one more level each time.
os.chdir(os.pardir)

In [68]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class DataAnalysisConfig:
    """Immutable set of filesystem locations used by the data-analysis stage.

    Instances are frozen: assigning to a field after construction raises
    ``dataclasses.FrozenInstanceError``.
    """

    # Working directory of the stage (created by ConfigurationManager).
    root_dir: Path
    # Location the raw input file "Raw_Youtube_API_DATA.csv" is read from.
    data_dir: Path
    # Location the processed file "Youtube_channel_data.csv" is written to.
    csv_dir: Path

In [69]:
from YouTubeChannelAnalyzer.constants import *
from YouTubeChannelAnalyzer.utils.common import create_directories, read_yaml
import pandas as pd
from datetime import datetime, timezone  # Ensure timezone is imported
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.ensemble import RandomForestRegressor
# warnings.filterwarnings("ignore")

In [70]:
class ConfigurationManager:
    """Load the project's YAML settings and hand out per-stage config objects."""

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Read both YAML files and make sure the artifacts root exists.

        Args:
            config_filepath: Path of the main configuration YAML.
            params_filepath: Path of the parameters YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        # The artifacts root is shared by every stage, so create it up front.
        create_directories([self.config.artifacts_root])

    def get_dataanalysis_config(self) -> DataAnalysisConfig:
        """Build the config for the data-analysis stage.

        Reads the ``data_analysis`` section of the loaded configuration,
        creates that section's ``root_dir`` and returns its three directory
        settings wrapped in a ``DataAnalysisConfig``.
        """
        section = self.config.data_analysis
        create_directories([section.root_dir])
        return DataAnalysisConfig(
            root_dir=section.root_dir,
            data_dir=section.data_dir,
            csv_dir=section.csv_dir,
        )

In [73]:
class ExploratoryDataAnalysis:
    """Clean the raw YouTube channel export and derive channel-age features.

    The stage reads ``Raw_Youtube_API_DATA.csv`` from ``config.data_dir``,
    coerces the count columns to numbers and the date columns to UTC
    timestamps, adds ``days_since_start`` / ``days_since_inception`` and
    writes the result (without identifier and raw date columns) to
    ``Youtube_channel_data.csv`` in ``config.csv_dir``.
    """

    # Count columns; values that cannot be parsed as numbers become NaN.
    _NUMERIC_COLUMNS = (
        'total_views',
        'total_likes',
        'total_comments',
        'total_subscribers',
        'total_no_of_videos',
        'total_no_short_videos',
        'total_no_long_videos',
    )
    # Date columns; values that cannot be parsed become NaT.
    _DATE_COLUMNS = ('channel_start_date', 'inception_date')
    # Identifiers and raw dates are removed from the exported frame.
    _DROPPED_COLUMNS = ('channel_id', 'channel_name', 'channel_start_date', 'inception_date')

    def __init__(self, config: "DataAnalysisConfig"):
        """Store the stage configuration.

        Args:
            config: Object exposing ``data_dir`` and ``csv_dir`` (each a
                ``str`` or ``Path``), e.g. a ``DataAnalysisConfig``.
        """
        self.config = config

    def data_analysis(self):
        """Run the cleaning / feature step and write the processed CSV.

        Returns:
            None. Any exception raised while processing is caught and
            reported on stdout instead of propagating (best-effort stage).
        """
        try:
            # Path joining works with or without a trailing separator and
            # for both str and Path config values.
            source_csv = Path(self.config.data_dir) / "Raw_Youtube_API_DATA.csv"
            target_csv = Path(self.config.csv_dir) / "Youtube_channel_data.csv"

            df_data = pd.read_csv(source_csv)

            for column in self._NUMERIC_COLUMNS:
                df_data[column] = pd.to_numeric(df_data[column], errors="coerce")

            # utc=True treats naive values as UTC and converts tz-aware values
            # (e.g. ISO strings ending in "Z") to UTC, so both inputs work.
            for column in self._DATE_COLUMNS:
                df_data[column] = pd.to_datetime(df_data[column], errors="coerce", utc=True)

            # A genuinely UTC "now"; localizing a naive local timestamp would
            # shift the reference instant by the machine's UTC offset.
            reference_date = pd.Timestamp.now(tz="UTC")

            df_data['days_since_start'] = (reference_date - df_data['channel_start_date']).dt.days
            df_data['days_since_inception'] = (reference_date - df_data['inception_date']).dt.days

            final_df = df_data.drop(columns=list(self._DROPPED_COLUMNS))

            # The output directory is not guaranteed to exist yet.
            target_csv.parent.mkdir(parents=True, exist_ok=True)
            final_df.to_csv(target_csv, index=False)

        except Exception as e:
            print(f"An error occurred during data analysis: {e}")

In [74]:
# Drive the data-analysis stage: load the YAML configuration, build the
# stage-specific config object, then run the analysis with it.
try:
    config_manager = ConfigurationManager()
    eda_config = config_manager.get_dataanalysis_config()
    eda = ExploratoryDataAnalysis(config = eda_config)
    eda.data_analysis()
except Exception as e:
    # data_analysis() catches and prints its own exceptions, so this handler
    # mainly reports failures from configuration loading / object construction.
    # The error is printed rather than re-raised, so the cell never fails.
    print(f"An error occurred: {e}")

[2025-01-19 22:46:40,989: INFO: common: yaml file: config/config.yaml loaded successfully]
[2025-01-19 22:46:40,991: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-19 22:46:40,992: INFO: common: created directory at: artifacts]
[2025-01-19 22:46:40,993: INFO: common: created directory at: artifacts/data_analysis]
