In [None]:
import pandas as pd

# Load the YouTube video dataset into a DataFrame; the modelling cell below reads from `data`.
data = pd.read_csv("youtube_data.csv")
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# As the last expression in the cell, the result is rendered as the cell's output.
# The `count` row of describe() reports the non-null values per column, which helps spot missing data.
data.describe()

# The line below is only needed if you find empty cells in the table
# data = data.dropna(axis=0)
# There are also other ways of dealing with missing data: imputing (filling in averages) or removing the entire column

Unnamed: 0,duration,bitrate,bitrate(video),height,width,frame rate,frame rate(est.),views,likes,comments
count,17589.0,17589.0,17589.0,17589.0,17589.0,17589.0,17589.0,17589.0,17589.0,17589.0
mean,241.551936,1271.354369,1150.418443,766.78117,504.591961,26.467639,9.471172,68941.58,208.862641,12.899312
std,493.026994,1375.359875,1351.800202,467.289304,262.727746,6.039748,13.253197,3537491.0,8477.504735,225.839378
min,1.0,0.0,0.0,108.0,88.0,3.75,0.0,0.0,0.0,0.0
25%,51.0,437.0,326.0,426.0,320.0,25.0,0.0,38.0,0.0,0.0
50%,135.0,743.0,632.0,640.0,480.0,29.97,0.0,152.0,1.0,0.0
75%,268.0,1293.0,1184.0,960.0,720.0,29.97,25.0,800.0,4.0,1.0
max,25845.0,22421.0,22229.0,2592.0,1944.0,59.08,30.0,410384900.0,836981.0,16634.0


In [None]:
from sklearn.model_selection import train_test_split

# Target: the view count of each video.
y = data["views"]

# Predictor columns used by the models fitted below.
features = [
    'duration',
    'bitrate',
    'frame rate',
    'likes',
    'comments',
]
X = data[features]

# Split the data into a training part and a validation part.
# random_state only pins the shuffle so the results are reproducible; the value itself is
# arbitrary and does not have to match the random_state passed to any other function.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Compare the performance of different ML models



# First, the Decision Tree without leaf optimization
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# No max_leaf_nodes limit is set here; random_state is fixed so the fit is reproducible.
model1 = DecisionTreeRegressor(random_state = 2)
model1.fit(train_X, train_y)

# mean_absolute_error is documented as (y_true, y_pred), so the held-out targets go first.
print("The Mean Absolute Error for the Decision Tree ML model without leaf optimization is",mean_absolute_error(val_y, model1.predict(val_X)))

#Second, the Decision Tree with leaf optimization
def get_mae(leaves, train_X, val_X, train_y, val_y):
  """Fit a decision tree capped at `leaves` leaf nodes and return its validation MAE.

  Args:
    leaves: value passed to DecisionTreeRegressor as max_leaf_nodes.
    train_X, train_y: features and targets used to fit the tree.
    val_X, val_y: features and targets used to score the fitted tree.

  Returns:
    The mean absolute error between val_y and the tree's predictions for val_X.
  """
  # random_state is fixed, so repeated calls with the same arguments fit the same tree.
  tree = DecisionTreeRegressor(max_leaf_nodes=leaves, random_state=55)
  tree.fit(train_X, train_y)
  # mean_absolute_error is documented as (y_true, y_pred), so the true targets go first.
  return mean_absolute_error(val_y, tree.predict(val_X))

leaves = [50,500,5000]
# Start from a 5-leaf tree as the baseline, then keep the lowest MAE among the candidates.
lowest = get_mae(5,train_X, val_X, train_y, val_y)
for i in leaves:
  # Every get_mae call fits a new tree, so score each candidate once and reuse the value
  # rather than refitting the same model for the comparison and again for the assignment.
  candidate_mae = get_mae(i,train_X, val_X, train_y, val_y)
  if candidate_mae < lowest:
    lowest = candidate_mae
print("The Mean Absolute Error for the Decision Tree ML model with leaf optimization is",lowest)

#Third, the Random Forest
from sklearn.ensemble import RandomForestRegressor
# A forest of 50 trees; random_state is fixed so the fit is reproducible.
model3 = RandomForestRegressor(n_estimators=50, random_state = 5)
model3.fit(train_X, train_y)
# Report the validation MAE; mean_absolute_error is documented as (y_true, y_pred).
print("The Mean Absolute Error for the Random Forest ML model is",mean_absolute_error(val_y, model3.predict(val_X)))

#Fourth XGBoost
from xgboost import XGBRegressor
# Up to 1000 boosting rounds at a learning rate of 0.05, with early stopping after 10 rounds.
model4 = XGBRegressor(n_estimators = 1000, learning_rate = 0.05, verbosity = 1, early_stopping_rounds=10)
# eval_set supplies the data that early stopping monitors; fit() logs the metric for each round.
# NOTE(review): the same validation split is used for early stopping here and for the MAE
# reported below, so that MAE is not measured on fully unseen data. Consider a separate hold-out set.
model4.fit(train_X, train_y,eval_set=[(val_X, val_y)])
# mean_absolute_error is documented as (y_true, y_pred), so the held-out targets go first.
print("The Mean Absolute Error for the XGBoost model is", mean_absolute_error(val_y, model4.predict(val_X)))




The Mean Absolute Error for the Decision Tree ML model without leaf optimization is 85031.5957253297
The Mean Absolute Error for the Decision Tree ML model with leaf optimization is 83774.18508155244
The Mean Absolute Error for the Random Tree ML model is 53807.63474279435
[0]	validation_0-rmse:2063600.03748
[1]	validation_0-rmse:2071173.08195
[2]	validation_0-rmse:2087323.13217
[3]	validation_0-rmse:2111020.71740
[4]	validation_0-rmse:2141212.93666
[5]	validation_0-rmse:2176943.92359
[6]	validation_0-rmse:2217285.59263
[7]	validation_0-rmse:2261438.91323
[8]	validation_0-rmse:2309296.99148
[9]	validation_0-rmse:2322661.00655
[10]	validation_0-rmse:2335421.06136
The Mean Absolute Error for the XGBoost model is 118114.9921875
