In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Load CSV
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text
# rather than as field delimiters. on_bad_lines='skip' drops rows with too
# many fields without raising or warning.
# NOTE(review): if the file contains quoted fields with embedded commas, those
# rows are silently discarded here - confirm this data loss is acceptable.
df = pd.read_csv("../Data/dataset2/CAvideos.csv", quoting=3, on_bad_lines='skip')

# Drop rows with missing values in any column used by the feature engineering
# or by the model further down
df = df.dropna(
    subset=[
        'category_id', 'publish_time', 'tags', 'likes',
        'dislikes', 'comment_count', 'description', 'views',
    ]
)

# Feature engineering
# Unparseable timestamps become NaT and are removed before the .dt accessors run
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
df = df.dropna(subset=['publish_time'])  # drop rows where conversion failed
df['publish_hour'] = df['publish_time'].dt.hour
df['publish_weekday'] = df['publish_time'].dt.weekday  # Monday=0 ... Sunday=6

# Vectorized string operations instead of a Python-level lambda per row.
# Tags are '|'-separated; a value without any '|' counts as one tag.
df['num_tags'] = df['tags'].astype(str).str.split('|').str.len()
df['description_len'] = df['description'].astype(str).str.len()

# Ensure the target (views) and every raw numeric model input are numeric.
# category_id is coerced as well because it is fed to the regression as a
# feature; without this, a stray non-numeric value would only surface as an
# error when the model is fitted.
numeric_columns = ['category_id', 'views', 'likes', 'dislikes', 'comment_count']
for column in numeric_columns:
    # errors='coerce' turns values that cannot be parsed into NaN
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Drop any rows where conversion to numeric failed
df = df.dropna(subset=numeric_columns)

# Model inputs (features) and regression target (views)
features = [
    'category_id',
    'publish_hour',
    'publish_weekday',
    'num_tags',
    'likes',
    'dislikes',
    'comment_count',
    'description_len',
]
X, y = df[features], df['views']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a linear regression on the training portion (fit returns the estimator)
reg = LinearRegression().fit(X_train, y_train)

# Predicted view counts for the held-out rows
y_pred = reg.predict(X_test)

# Results table: the held-out features plus actual and predicted view counts.
# assign() returns a new DataFrame, so X_test itself is left unmodified.
result = X_test.assign(
    actual_views=pd.to_numeric(y_test, errors='coerce'),  # ensure numeric
    predicted_views=y_pred,
)

# Flag a video as trending when it got more views than the model predicted
outperformed = result['actual_views'] > result['predicted_views']
result['trending'] = outperformed

# Show the first 300 rows of the comparison with a fresh 0-based index
summary_columns = ['actual_views', 'predicted_views', 'trending']
top_results = result[summary_columns].head(300).reset_index(drop=True)
print(top_results)



     actual_views  predicted_views  trending
0           17340     1.691190e+05     False
1          299283     3.279176e+05     False
2          253117     5.174189e+05     False
3          426653     3.005684e+05      True
4          173360     2.267277e+05     False
..            ...              ...       ...
295         55016     1.455757e+05     False
296         78721     2.098104e+05     False
297         68714     1.632353e+05     False
298       1414923     1.122465e+06      True
299        208573     2.581388e+05     False

[300 rows x 3 columns]
