In [5]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

df = pd.read_csv('Youtube_videos_US_original.csv')

category_ids = df['category_id'].unique()

df_cleaned = pd.DataFrame()

In [6]:
for category_id in category_ids:
    df_category = df[df['category_id'] == category_id]

    index = int(len(df_category) * 0.8)
    df_train = df_category[:index]
    df_test = df_category[index:]

    X_train = df_train[["likes"]].to_numpy()
    Y_train = df_train["views"].to_numpy()
    X_test = df_test[["likes"]].to_numpy()
    Y_test = df_test["views"].to_numpy()

    model = LinearRegression()
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)

    mse = mean_squared_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)
    print(f"Category {category_id} - MSE: {mse:.2f}, R2 Score: {r2:.2f}")

    if r2 >= 0.5:
        Y_train_pred = model.predict(X_train)

        residuals = Y_train - Y_train_pred
        studentized_residuals = residuals / np.sqrt(mean_squared_error(Y_train, Y_train_pred))
        outliers = np.abs(studentized_residuals) > 2

        cook_distance = (residuals ** 2) / (model.coef_ ** 2 * X_train.var())
        outliers_cooks = cook_distance > 4 / len(X_train)

        outliers_indices = np.where(outliers)[0]
        outliers_indices_Cook = np.where(outliers_cooks)[0]

        all_outliers_indices = np.union1d(outliers_indices, outliers_indices_Cook)

        df_category_cleaned = df_category.drop(df_category.index[all_outliers_indices])

        cleaned_X = np.delete(X_train, all_outliers_indices, axis=0)
        cleaned_y = np.delete(Y_train, all_outliers_indices, axis=0)

        model.fit(cleaned_X, cleaned_y)
        Y_pred_cleaned = model.predict(X_test)

        mse_cleaned = mean_squared_error(Y_test, Y_pred_cleaned)
        r2_cleaned = r2_score(Y_test, Y_pred_cleaned)
        print(f"Category {category_id} - Cleaned MSE: {mse_cleaned:.2f}, Cleaned R2 Score: {r2_cleaned:.2f}")

        df_cleaned = pd.concat([df_cleaned, df_category_cleaned], ignore_index=True)
    else:
        df_cleaned = pd.concat([df_cleaned, df_category], ignore_index=True)

Category 22 - MSE: 2780084908615.23, R2 Score: 0.81
Category 22 - Cleaned MSE: 2791955034708.68, Cleaned R2 Score: 0.81
Category 24 - MSE: 8993723439417.94, R2 Score: 0.74
Category 24 - Cleaned MSE: 8983759982902.43, Cleaned R2 Score: 0.74
Category 23 - MSE: 1948005875765.37, R2 Score: 0.51
Category 23 - Cleaned MSE: 1947353186659.37, Cleaned R2 Score: 0.51
Category 28 - MSE: 12764605630482.05, R2 Score: 0.69
Category 28 - Cleaned MSE: 12759036430454.28, Cleaned R2 Score: 0.69
Category 1 - MSE: 18220661463127.03, R2 Score: 0.74
Category 1 - Cleaned MSE: 18155196716053.67, Cleaned R2 Score: 0.75
Category 25 - MSE: 3602114200387.24, R2 Score: 0.00
Category 17 - MSE: 5736174862908.23, R2 Score: 0.75
Category 17 - Cleaned MSE: 5704635012742.78, Cleaned R2 Score: 0.75
Category 10 - MSE: 160509780532214.59, R2 Score: 0.75
Category 10 - Cleaned MSE: 158699927485538.75, Cleaned R2 Score: 0.75
Category 15 - MSE: 1989007673160.81, R2 Score: 0.14
Category 27 - MSE: 407133283264.37, R2 Score: 0.78

In [7]:
df_cleaned.to_csv('USvideos_cleaned_all.csv', index=False)