In [3]:
import os

# Move up one level so that project-relative paths used below ("params.yaml",
# "config/config.yaml", "artifacts/...") resolve from the working directory.
# The guard keeps this cell idempotent: an unconditional chdir("../") climbs
# one directory further on every re-run and silently breaks later cells.
# NOTE(review): assumes "params.yaml" exists only in the project root — confirm.
if not os.path.exists("params.yaml"):
    os.chdir("../")

In [4]:
# Sanity check: display the working directory after the chdir above.
# NOTE(review): the saved output of this cell is an absolute local path;
# consider clearing it before sharing the notebook.
%pwd

'/Users/anjalijha/Python/Project/YouTubeChannel-Analyzer'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerconfig:
    """Immutable bundle of settings consumed by the model-training stage.

    Attributes:
        root_dir: Directory the trained model file is written into.
        data_dir: Directory the training CSV is read from.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state_size: Passed to ``train_test_split`` as ``random_state``.
        n_estimators: Passed to ``RandomForestRegressor`` as ``n_estimators``.
        random_state: Passed to ``RandomForestRegressor`` as ``random_state``.

    NOTE(review): the two directory fields are annotated ``Path`` but are
    populated straight from the parsed YAML, so they are presumably plain
    strings at runtime — confirm before relying on ``Path`` methods.
    """

    root_dir: Path
    data_dir: Path
    test_size: float
    random_state_size: int
    n_estimators: int
    random_state: int

In [6]:
from YouTubeChannelAnalyzer.constants import *
from YouTubeChannelAnalyzer.utils.common import create_directories, read_yaml
from YouTubeChannelAnalyzer.logging import logger
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

In [7]:

class ConfigurationManager:
    """Loads the project's YAML settings and builds per-stage config objects.

    On construction both YAML files are parsed with ``read_yaml`` and the
    top-level artifacts directory named in the config file is created.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH, params_filepath=PARAMS_FILE_PATH):
        """Parse the config and params files and create the artifacts root.

        Args:
            config_filepath: Location of the main configuration YAML.
            params_filepath: Location of the hyper-parameter YAML.

        Raises:
            Exception: Whatever reading the files or creating the directory
                raises; it is logged first and then re-raised unchanged.
        """
        try:
            # Parsed objects must support both attribute and key access,
            # since both styles are used on them below.
            self.config = read_yaml(config_filepath)
            self.params = read_yaml(params_filepath)

            artifacts_root = self.config.artifacts_root
            create_directories([artifacts_root])
            logger.info(f"Configuration loaded from {config_filepath} and {params_filepath}.")
        except Exception as exc:
            logger.error(f"Error loading configuration: {exc}")
            raise

    def get_model_training_config(self) -> ModelTrainerconfig:
        """Build the ``ModelTrainerconfig`` for the training stage.

        Directories come from the ``model_trainer`` section of the config
        file, hyper-parameters from the ``TrainingArguments`` section of the
        params file. The stage's ``root_dir`` is created as a side effect.

        Returns:
            ModelTrainerconfig: Settings for the model-training stage.

        Raises:
            KeyError: A required section or key is missing (logged, re-raised).
            Exception: Any other failure (logged, re-raised).
        """
        try:
            trainer_section = self.config['model_trainer']
            training_args = self.params['TrainingArguments']
            create_directories([trainer_section['root_dir']])

            # Collect the constructor arguments in field order so a missing
            # key is reported for the first absent field.
            settings = {name: trainer_section[name] for name in ('root_dir', 'data_dir')}
            settings.update(
                (name, training_args[name])
                for name in ('test_size', 'random_state_size', 'n_estimators', 'random_state')
            )
            trainer_config = ModelTrainerconfig(**settings)

            logger.info("Model training configuration loaded successfully.")
            return trainer_config
        except KeyError as exc:
            logger.error(f"Missing key in configuration: {exc}")
            raise
        except Exception as exc:
            logger.error(f"Error fetching model training config: {exc}")
            raise



In [None]:

class ModelTraining:
    """Train, evaluate and persist a random-forest regressor.

    The training table is read from ``<data_dir>/Youtube_channel_data.csv``;
    the ``total_subscribers`` column is the regression target and every other
    column is used as a feature. The fitted model is written to
    ``<root_dir>/model.pkl``.
    """

    # File and column names are fixed here; only the directories are configurable.
    DATA_FILENAME = "Youtube_channel_data.csv"
    MODEL_FILENAME = "model.pkl"
    TARGET_COLUMN = "total_subscribers"

    def __init__(self, config: "ModelTrainerconfig"):
        """Store the stage configuration.

        Args:
            config: Settings object exposing ``root_dir``, ``data_dir``,
                ``test_size``, ``random_state_size``, ``n_estimators`` and
                ``random_state``.
        """
        self.config = config
        # Alias of the same object, kept so existing ``self.params`` readers still work.
        self.params = config

    def _data_path(self) -> Path:
        """Return the CSV location, whether ``data_dir`` is a str or a Path."""
        return Path(self.config.data_dir) / self.DATA_FILENAME

    def _model_path(self) -> Path:
        """Return the model file location, whether ``root_dir`` is a str or a Path."""
        return Path(self.config.root_dir) / self.MODEL_FILENAME

    def model_training(self):
        """Run the full training stage.

        Steps: load the CSV, split into train/test sets, fit a
        ``RandomForestRegressor``, log MAE / MSE / R^2 on the test split and
        dump the fitted model with ``joblib``.

        Raises:
            Exception: Any failure is logged and re-raised unchanged.
        """
        try:
            # Join with pathlib rather than ``dir + "file"``: string
            # concatenation needs a trailing slash on the directory and fails
            # outright when the directory is a Path.
            df_data = pd.read_csv(self._data_path())
            logger.info(f"Data loaded from {self.config.data_dir}.")

            # Features are all columns except the target.
            x = df_data.drop(columns=self.TARGET_COLUMN)
            y = df_data[self.TARGET_COLUMN]

            X_train, X_test, y_train, y_test = train_test_split(
                x,
                y,
                test_size=self.params.test_size,
                random_state=self.params.random_state_size,
            )
            logger.info(f"Data split into training and testing sets with test size: {self.params.test_size}.")

            model = RandomForestRegressor(
                n_estimators=self.params.n_estimators,
                random_state=self.params.random_state,
            )
            logger.info("Random Forest model initialized.")

            model.fit(X_train, y_train)
            logger.info("Model training completed.")

            # Predict once and derive every metric from the same predictions.
            y_pred = model.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)

            # A regressor's ``score`` is R^2, not classification accuracy, so
            # report it under its real name.
            logger.info(f"Model R^2 score on test set: {r2:.2f}")
            logger.info(f"Model evaluation completed. MAE: {mae:.2f}, MSE: {mse:.2f}, R^2: {r2:.2f}")

            # Make sure the output directory exists before writing the model.
            model_path = self._model_path()
            model_path.parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(model, model_path)
            logger.info(f"Model saved to {self.config.root_dir}.")

        except Exception as e:
            logger.error(f"An error occurred during model training: {e}")
            raise



In [9]:
# Run the model-training stage end to end: load config, build the trainer, train.
try:
    config_manager = ConfigurationManager()
    modeltraining_config = config_manager.get_model_training_config()
    model_training = ModelTraining(config=modeltraining_config)
    model_training.model_training()
except Exception:
    # Log with traceback and re-raise: printing the message and carrying on
    # would make a failed run look like a successful cell execution.
    logger.exception("Model training stage failed.")
    raise

[2025-01-20 16:53:54,444: INFO: common: yaml file: config/config.yaml loaded successfully]


[2025-01-20 16:53:54,458: INFO: common: yaml file: params.yaml loaded successfully]
[2025-01-20 16:53:54,463: INFO: common: created directory at: artifacts]
[2025-01-20 16:53:54,470: INFO: 4062040296: Configuration loaded from config/config.yaml and params.yaml.]
[2025-01-20 16:53:54,473: INFO: common: created directory at: artifacts/model_trainer/]
[2025-01-20 16:53:54,478: INFO: 4062040296: Model training configuration loaded successfully.]
[2025-01-20 16:53:54,591: INFO: 1912118790: Data loaded from artifacts/data_analysis/.]
Index(['total_no_of_videos', 'total_no_short_videos', 'total_no_long_videos',
       'total_views', 'total_likes', 'total_comments', 'days_since_start',
       'days_since_inception'],
      dtype='object')
[2025-01-20 16:53:54,613: INFO: 1912118790: Data split into training and testing sets with test size: 0.2.]
[2025-01-20 16:53:54,623: INFO: 1912118790: Random Forest model initialized.]
[2025-01-20 16:53:54,642: INFO: 1912118790: Model training completed.]
[