# 1. Import the datasets and libraries, check shape, and datatype.

In [1]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
# Load the dataset
df = pd.read_csv('train.csv')
print(df.head())





       vidid  adview    views likes dislikes comment   published duration  \
0  VID_18655      40  1031602  8523      363    1095  2016-09-14  PT7M37S   
1  VID_14135       2     1707    56        2       6  2016-10-01  PT9M30S   
2   VID_2187       1     2023    25        0       2  2016-07-02  PT2M16S   
3  VID_23096       6   620860   777      161     153  2016-07-27  PT4M22S   
4  VID_10175       1      666     1        0       0  2016-06-29    PT31S   

  category  
0        F  
1        D  
2        C  
3        H  
4        D  


In [2]:
# Check shape and datatype
print(df.shape)
print(df.dtypes)

(14999, 9)
vidid        object
adview        int64
views        object
likes        object
dislikes     object
comment      object
published    object
duration     object
category     object
dtype: object


# 2. Visualise the dataset using plotting using heatmaps and plots.

In [None]:
# Visualize correlation heatmap
plt.figure(figsize=(10, 8))
print(df.corr())
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.show()

# Visualize data distributions
df.hist(bins=20, figsize=(15, 10))
plt.show()


# 3. Clean the dataset by removing missing values and other things.

In [3]:
# Remove missing values
df.dropna(inplace=True)
# Convert 'views', 'likes', 'dislikes', 'comment' to numeric
df['views'] = pd.to_numeric(df['views'], errors='coerce')
df['likes'] = pd.to_numeric(df['likes'], errors='coerce')
df['dislikes'] = pd.to_numeric(df['dislikes'], errors='coerce')
df['comment'] = pd.to_numeric(df['comment'], errors='coerce')
# Convert 'published' to datetime
df['published'] = pd.to_datetime(df['published'])

# Transform 'category' using one-hot encoding
df = pd.get_dummies(df, columns=['category'], drop_first=True)
# Extract 'duration' and convert to seconds
# Convert 'duration' to seconds
def duration_to_seconds(duration_str):
    duration_match = re.match(r'PT(\d+H)?(\d+M)?(\d+S)?', duration_str)

    if duration_match:
        hours = int(duration_match.group(1)[:-1]) if duration_match.group(1) else 0
        minutes = int(duration_match.group(2)[:-1]) if duration_match.group(2) else 0
        seconds = int(duration_match.group(3)[:-1]) if duration_match.group(3) else 0
        return hours * 3600 + minutes * 60 + seconds

    return 0

# Apply the function to convert 'duration' to seconds
df['duration_sec'] = df['duration'].apply(duration_to_seconds)

# Drop the original 'duration' column
df.drop(['duration'], axis=1, inplace=True)



In [4]:
print(df.head())

       vidid  adview      views   likes  dislikes  comment  published  \
0  VID_18655      40  1031602.0  8523.0     363.0   1095.0 2016-09-14   
1  VID_14135       2     1707.0    56.0       2.0      6.0 2016-10-01   
2   VID_2187       1     2023.0    25.0       0.0      2.0 2016-07-02   
3  VID_23096       6   620860.0   777.0     161.0    153.0 2016-07-27   
4  VID_10175       1      666.0     1.0       0.0      0.0 2016-06-29   

   category_B  category_C  category_D  category_E  category_F  category_G  \
0           0           0           0           0           1           0   
1           0           0           1           0           0           0   
2           0           1           0           0           0           0   
3           0           0           0           0           0           0   
4           0           0           1           0           0           0   

   category_H  duration_sec  
0           0           457  
1           0           570  
2       

# 4. Transform attributes into numerical values and other necessary transformations.

# 5. Normalize your data and split the data into training, validation, and test set in the appropriate ratio.

In [6]:
# Replace NaN with the mean of the corresponding column
df.fillna(df.mean(), inplace=True)

# Replace infinity values with a large finite value
df.replace([np.inf, -np.inf], np.finfo('float64').max, inplace=True)

# Separate features and target variable
X = df.drop(['vidid', 'adview', 'published'], axis=1)
y = df['adview']

# Normalize data excluding 'published'
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Combine normalized features and 'published' column
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)
X_combined = pd.concat([X_scaled_df, df[['published']]], axis=1)

# Convert 'published' to numeric features
X_combined['published_year'] = X_combined['published'].dt.year
X_combined['published_month'] = X_combined['published'].dt.month
X_combined['published_day'] = X_combined['published'].dt.day
X_combined.drop(['published'], axis=1, inplace=True)

# Split data into training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(X_combined, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)






  df.fillna(df.mean(), inplace=True)
  df.fillna(df.mean(), inplace=True)


# 6. Use linear regression, Support Vector Regressor for training and get errors.

In [7]:
# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_val)
lr_error = mean_squared_error(y_val, lr_preds)
print("Linear Regression Mean Squared Error:", lr_error)

Linear Regression Mean Squared Error: 13314200621.970387


In [8]:
# Support Vector Regressor
svr_model = SVR()
svr_model.fit(X_train, y_train)
svr_preds = svr_model.predict(X_val)
svr_error = mean_squared_error(y_val, svr_preds)
print("SVR Mean Squared Error:", svr_error)

SVR Mean Squared Error: 13355366737.251987


# 7. Use Decision Tree Regressor and Random Forest Regressors.

In [9]:
# Decision Tree Regressor
dt_model = DecisionTreeRegressor()
dt_model.fit(X_train, y_train)
dt_preds = dt_model.predict(X_val)
dt_error = mean_squared_error(y_val, dt_preds)
print("Decision Tree Mean Squared Error:", dt_error)

Decision Tree Mean Squared Error: 14670885157.435556


In [10]:
# Random Forest Regressor
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_val)
rf_error = mean_squared_error(y_val, rf_preds)
print("Random Forest Mean Squared Error:", rf_error)

Random Forest Mean Squared Error: 12871287119.847685


# 8. Build an artificial neural network and train it with different layers and hyperparameters. Experiment a little. Use Keras.

In [11]:
# Build Neural Network
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))




In [12]:
# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error')


In [13]:
# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_val, y_val))

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x251ce59cdc0>

# 9. Pick the best model based on error as well as generalization.
Compare the errors obtained from different models and choose the one with the lowest validation error and good generalization.

10. Save your model and predict on the test set.

In [14]:
# Save the best model
best_model = rf_model  
# Predict on the test set
test_preds = best_model.predict(X_test)
test_error = mean_squared_error(y_test, test_preds)
print("Test Set Mean Squared Error:", test_error)

Test Set Mean Squared Error: 1141370214.6938925
