In [None]:
# Importing the libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pickle
import requests
import json
#For Evaluating the model
from sklearn.metrics import mean_absolute_error, mean_squared_error
#preprocessing pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
#Combine preprocessing
from sklearn.compose import ColumnTransformer
import re

import mlflow

# MLflow configuration: runs are tracked in a SQLite database file and grouped
# under a single named experiment.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
MLFLOW_EXPERIMENT_NAME = "mlops_zoomcamp_youtube_pred_experiment"

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

from joblib import load, dump
from tqdm import tqdm
from evidently import ColumnMapping
from evidently.report import Report
from evidently.metrics import ColumnDriftMetric, DatasetDriftMetric, DatasetMissingValuesMetric

In [None]:
# Load the YouTube dataset CSV into a DataFrame.
# NOTE(review): this is an absolute workspace path, so the notebook only runs
# where the repository is checked out at this exact location.
DATASET_PATH = '/workspaces/MLOps_Zoomcamp_Project_YoutubePrediction/dataset/youtube.csv'
data = pd.read_csv(DATASET_PATH)

# Clean numeric columns
def clean_numeric_column(column):
    """Convert a column of abbreviated count strings into plain numbers.

    String entries have every character other than digits and ``.`` removed
    and are parsed as floats. An upper-case ``K``, ``M`` or ``B`` anywhere in
    the string scales the parsed number by 1e3, 1e6 or 1e9 respectively; when
    several of those letters are present, the first match in that order wins.
    Non-string entries are passed through unchanged.

    Strings that leave nothing parseable (for example ``''``, ``'N/A'`` or
    ``'1.2.3'``) become ``float('nan')`` instead of raising ``ValueError``, so
    a single malformed cell does not abort cleaning of the whole column.

    Args:
        column: Iterable of values, e.g. a pandas Series.

    Returns:
        list: One cleaned entry per input value, in the original order.
    """
    # Dicts iterate in insertion order, which preserves the K > M > B precedence.
    suffix_multipliers = {'K': 1000, 'M': 1000000, 'B': 1000000000}

    cleaned_column = []
    for value in column:
        if not isinstance(value, str):
            # Already numeric (or missing): leave untouched.
            cleaned_column.append(value)
            continue

        digits = re.sub(r'[^0-9.]', '', value)
        try:
            number = float(digits)
        except ValueError:
            # Nothing numeric survived the stripping, or the remainder is malformed.
            cleaned_column.append(float('nan'))
            continue

        multiplier = next(
            (factor for suffix, factor in suffix_multipliers.items() if suffix in value),
            None,
        )
        cleaned_column.append(number if multiplier is None else number * multiplier)
    return cleaned_column

# Convert the abbreviated count columns to plain numbers, one column at a time
for count_column in ('VIEWS', 'TOTAL_NUMBER_OF_VIDEOS', 'SUBSCRIBERS'):
    data[count_column] = clean_numeric_column(data[count_column])

# Model inputs (X) and the value to predict (y)
feature_columns = ['VIEWS', 'TOTAL_NUMBER_OF_VIDEOS', 'CATEGORY']
X = data[feature_columns]
y = data['SUBSCRIBERS']

# Column groups, split by the kind of preprocessing each one receives
numeric_features = ['VIEWS', 'TOTAL_NUMBER_OF_VIDEOS']
categorical_features = ['CATEGORY']

# Numerical features are standardised
numeric_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical features are one-hot encoded; handle_unknown='ignore' keeps
# categories unseen during fit from raising at transform time
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its own transformer
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Hold out one third of the rows as the test set; random_state is fixed so the
# same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1 / 3,
    random_state=0,
)

# Quick look at the training split
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print(X_train.head(1))

# End-to-end model: the preprocessing step followed by a linear regression.
# Swap the final step to try a different regressor.
model_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
regressor = Pipeline(model_steps)
regressor.fit(X_train, y_train)

# Predictions for the held-out test rows
y_pred = regressor.predict(X_test)

In [None]:
# Column roles for the Evidently report: model output is read from the
# 'prediction' column and no target column is mapped. Feature column types are
# not declared here.
column_mapping = ColumnMapping(target=None, prediction='prediction')

In [None]:
# Evidently compares a reference dataset against a current one. The training
# split is used as the reference and the test split as the current data; each
# gets the model output in a 'prediction' column, which is the column the
# mapping and ColumnDriftMetric below refer to.
reference_data = X_train.assign(prediction=regressor.predict(X_train))
current_data = X_test.assign(prediction=y_pred)

report = Report(metrics=[
    ColumnDriftMetric(column_name='prediction'),
    DatasetDriftMetric(),
    DatasetMissingValuesMetric(),
])

# The report must be run before as_dict() or show() have anything to return.
report.run(
    reference_data=reference_data,
    current_data=current_data,
    column_mapping=column_mapping,
)

In [None]:
# Export the report contents as a dictionary and display it
result = report.as_dict()
result

In [None]:
# Render the report inline in the notebook output
report.show(mode='inline')


In [None]:
# Display only the 'metrics' entry of the exported dictionary
result['metrics']