In [14]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Read the trending-videos CSV. quoting=3 is csv.QUOTE_NONE (no special handling
# of quote characters); rows the parser rejects are skipped.
df = pd.read_csv("../Data/dataset2/CAvideos.csv", quoting=3, on_bad_lines='skip')

# Discard rows lacking any column that feeds the features or the target
required_cols = ['category_id', 'publish_time', 'tags', 'likes', 'dislikes',
                 'comment_count', 'description', 'views']
df = df.dropna(subset=required_cols)

# Parse the publish timestamp; values that fail to parse become NaT and are removed
df = df.assign(publish_time=pd.to_datetime(df['publish_time'], errors='coerce'))
df = df.dropna(subset=['publish_time'])

# Derived features: publication hour/weekday, number of '|'-separated tags,
# and description length in characters
publish_ts = df['publish_time']
df = df.assign(
    publish_hour=publish_ts.dt.hour,
    publish_weekday=publish_ts.dt.weekday,
    # n separators delimit n + 1 pieces, same as len(split('|'))
    num_tags=df['tags'].map(lambda tag_text: str(tag_text).count('|') + 1),
    description_len=df['description'].map(lambda text: len(str(text))),
)

# Coerce the count columns to numbers; entries that cannot be parsed become NaN
count_cols = ['views', 'likes', 'dislikes', 'comment_count']
for count_col in count_cols:
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')

# Remove the rows where that coercion produced NaN
df = df.dropna(subset=count_cols)

# Model inputs (engineered + engagement columns) and the regression target
features = [
    'category_id', 'publish_hour', 'publish_weekday', 'num_tags',
    'likes', 'dislikes', 'comment_count', 'description_len',
]
X, y = df[features], df['views']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a linear regression on the training portion (fit() returns the estimator itself)
reg = LinearRegression().fit(X_train, y_train)

# Predicted view counts for the held-out rows
y_pred = reg.predict(X_test)

# Results table: held-out feature rows plus actual and predicted view counts.
# `y_test` is already numeric (views were coerced and NaN rows dropped during
# cleaning), so it is attached as-is; `assign` returns a copy of X_test.
result = X_test.assign(actual_views=y_test, predicted_views=y_pred)

# Add trending column: True if actual views > predicted views
result['trending'] = result['actual_views'] > result['predicted_views']

# --- Linear Regression metrics ---
# Score the predictions made above for X_test. They are stored in `y_pred`;
# no `y_pred_lin` variable exists in this cell, so using it raises NameError
# on a fresh kernel.
mae_lin = mean_absolute_error(y_test, y_pred)
r2_lin = r2_score(y_test, y_pred)

# Show the first 30 held-out rows with a clean 0..29 index
top_results = result[['actual_views', 'predicted_views', 'trending']].head(30).reset_index(drop=True)
print(top_results)

# Print metrics
print("Linear Regression:")
print(f"  MAE  = {mae_lin:,.0f}")
print(f"  R²   = {r2_lin:.4f}")


Linear Regression:
  MAE  = 547,784
  RMSE = 1,799,704
  R²   = 0.6293
    actual_views  predicted_views  trending
0          17340     1.691190e+05     False
1         299283     3.279176e+05     False
2         253117     5.174189e+05     False
3         426653     3.005684e+05      True
4         173360     2.267277e+05     False
5          74080     1.549839e+05     False
6          88251     2.744311e+05     False
7        9508610     3.707432e+06      True
8         326798     6.771954e+05     False
9          56927     2.097612e+05     False
10        478905     3.577999e+05      True
11       1452518     3.973976e+05      True
12         25757     2.771546e+05     False
13       1810163     6.605455e+05      True
14       3675189     6.119957e+06     False
15         57297     2.946583e+05     False
16         83274     3.051040e+05     False
17        534122    -7.553046e+04      True
18       9583159     6.265465e+06      True
19        818251     5.291155e+05      True
20   

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
# Read the trending-videos CSV. quoting=3 is csv.QUOTE_NONE (no special handling
# of quote characters); rows the parser rejects are skipped.
df = pd.read_csv("../Data/dataset2/CAvideos.csv", quoting=3, on_bad_lines='skip')

# Discard rows lacking any column that feeds the features or the target
required_cols = ['category_id', 'publish_time', 'tags', 'likes', 'dislikes',
                 'comment_count', 'description', 'views']
df = df.dropna(subset=required_cols)

# Parse the publish timestamp; values that fail to parse become NaT and are removed
df = df.assign(publish_time=pd.to_datetime(df['publish_time'], errors='coerce'))
df = df.dropna(subset=['publish_time'])

# Derived features: publication hour/weekday, number of '|'-separated tags,
# and description length in characters
publish_ts = df['publish_time']
df = df.assign(
    publish_hour=publish_ts.dt.hour,
    publish_weekday=publish_ts.dt.weekday,
    # n separators delimit n + 1 pieces, same as len(split('|'))
    num_tags=df['tags'].map(lambda tag_text: str(tag_text).count('|') + 1),
    description_len=df['description'].map(lambda text: len(str(text))),
)

# Coerce the count columns to numbers, then drop rows where coercion produced NaN
count_cols = ['views', 'likes', 'dislikes', 'comment_count']
for count_col in count_cols:
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce')
df = df.dropna(subset=count_cols)

# Model inputs (engineered + engagement columns) and the regression target
features = [
    'category_id', 'publish_hour', 'publish_weekday', 'num_tags',
    'likes', 'dislikes', 'comment_count', 'description_len',
]
X, y = df[features], df['views']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a 100-tree random forest with a fixed seed (fit() returns the estimator itself)
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted view counts for the held-out rows
y_pred = rf.predict(X_test)

# Results table: held-out feature rows plus actual and predicted view counts.
# `y_test` is already numeric (views were coerced and NaN rows dropped during
# cleaning), so it is attached as-is; `assign` returns a copy of X_test.
result = X_test.assign(actual_views=y_test, predicted_views=y_pred)

# Trending flag: True if actual views > predicted views
result['trending'] = result['actual_views'] > result['predicted_views']

# --- Random Forest metrics ---
# Score the predictions made above for X_test. They are stored in `y_pred`;
# no `y_pred_rf` variable exists in this cell, so using it raises NameError
# on a fresh kernel.
mae_rf = mean_absolute_error(y_test, y_pred)
r2_rf = r2_score(y_test, y_pred)

# Show top 30 results with a clean 0..29 index
top_results = result[['actual_views', 'predicted_views', 'trending']].head(30).reset_index(drop=True)
print(top_results)

# Print metrics
print("\nRandom Forest Regression:")
print(f"  MAE  = {mae_rf:,.0f}")
print(f"  R²   = {r2_rf:.4f}")


    actual_views  predicted_views  trending
0          17340     5.096146e+04     False
1         299283     3.241330e+05     False
2         253117     3.115188e+05     False
3         426653     2.880718e+05      True
4         173360     2.706227e+05     False
5          74080     6.106000e+04      True
6          88251     1.446200e+05     False
7        9508610     7.067027e+06      True
8         326798     6.345000e+05     False
9          56927     7.006602e+04     False
10        478905     4.453045e+05      True
11       1452518     1.089052e+06      True
12         25757     3.575892e+04     False
13       1810163     2.089668e+06     False
14       3675189     7.774356e+06     False
15         57297     7.116014e+04     False
16         83274     2.068534e+05     False
17        534122     4.905079e+05      True
18       9583159     8.958057e+06      True
19        818251     7.423886e+05      True
20        292525     5.802868e+05     False
21        609469     4.563271e+0