In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
from math import exp,log,log2
import graphviz
import pydot
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

In [2]:
class Node:
    def __init__(self, data, target):
        self.left = None
        self.right = None
        self.data = data
        self.target = target
        self.split_feature = None
        self.split_value = None
        self.prediction = None

class DecisionTreeRegressor:
    def __init__(self, max_depth=3, min_samples_split=2):
        self.root = None
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        
    def _mse(self, target):
        return np.mean((target - np.mean(target)) ** 2)
    
    def _split(self, data, target):
        m, n = data.shape
        best_mse = np.inf
        best_feature = None
        best_value = None
        
        for feature in range(n):
            for value in np.unique(data[:, feature]):
                left_target = target[data[:, feature] <= value]
                right_target = target[data[:, feature] > value]
                
                if len(left_target) < self.min_samples_split or len(right_target) < self.min_samples_split:
                    continue
                
                mse = len(left_target) / m * self._mse(left_target) + len(right_target) / m * self._mse(right_target)
                
                if mse < best_mse:
                    best_mse = mse
                    best_feature = feature
                    best_value = value
        
        return best_feature, best_value
        
    def _build_tree(self, data, target, depth):
        node = Node(data, target)
        
        if depth == self.max_depth or len(data) < self.min_samples_split:
            node.prediction = np.mean(target)
            return node
        
        feature, value = self._split(data, target)
        left_data, left_target = data[data[:, feature] <= value], target[data[:, feature] <= value]
        right_data, right_target = data[data[:, feature] > value], target[data[:, feature] > value]
        
        if len(left_target) == 0 or len(right_target) == 0:
            node.prediction = np.mean(target)
            return node
        
        node.split_feature = feature
        node.split_value = value
        node.left = self._build_tree(left_data, left_target, depth+1)
        node.right = self._build_tree(right_data, right_target, depth+1)
        
        return node
    
    def fit(self, data, target):
        self.root = self._build_tree(data, target, 0)
        
    def predict(self, data):
        predictions = []
        for sample in data:
            node = self.root
            while node.left:
                if sample[node.split_feature] <= node.split_value:
                    node = node.left
                else:
                    node = node.right
            predictions.append(node.prediction)
        return np.array(predictions)

In [3]:
import numpy as np
from sklearn.tree import DecisionTreeRegressor

class AdaboostedDecisionTreeRegressor:
    def __init__(self, n_estimators=50, learning_rate=0.1, max_depth=3, min_samples_split=2):
        self.n_estimators = n_estimators
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []
        self.alpha = []
        
    def _fit_tree(self, data, target, weights):
        tree = DecisionTreeRegressor(max_depth=self.max_depth, min_samples_split=self.min_samples_split)

        # Replace NaN weights with mean weight
        mean_weight = np.nanmean(weights)
        weights[np.isnan(weights)] = mean_weight

        tree.fit(data, target, sample_weight=weights)
        predictions = tree.predict(data)
        mse = np.mean((predictions - target) ** 2)
        alpha = 0.5 * np.log((1 - mse) / mse)
        return tree, alpha

    
    def fit(self, data, target):
        weights = np.ones(len(data)) / len(data)
        
        
        for i in range(self.n_estimators):
            mean_weight = np.nanmean(weights)
            weights[np.isnan(weights)] = mean_weight
            
            tree, alpha = self._fit_tree(data, target, weights)
            self.trees.append(tree)
            self.alpha.append(alpha)
            predictions = tree.predict(data)
            weights *= np.exp(-alpha * (target - predictions))
            weights /= np.sum(weights)
            
    def predict(self, data):
        predictions = np.zeros(len(data))
        for i in range(self.n_estimators):
            predictions += self.alpha[i] * self.trees[i].predict(data)
        return predictions


In [4]:
def confusion_matrix_calc(y_pred, y_true):
    y_pred = pd.Series(y_pred).reset_index(drop=True)
    y_true = pd.Series(y_true).reset_index(drop=True)
    c = [[0, 0], 
            [0, 0]]
    for index in range(len(y_true)):
        if y_pred[index]==y_true[index]:
            if y_pred[index]==1:
                c[0][0] += 1
            else:
                c[1][1] += 1
        else:
            if y_pred[index]==1:
                c[0][1] += 1
            else:
                c[1][0] += 1
    return c

In [5]:
df=pd.read_csv('movies_dataset_processed.csv')
df

Unnamed: 0.1,Unnamed: 0,IMDb-rating,appropriate_for,director,downloads,industry,language,posted_date,release_date,run_time,storyline,title,views,writer,days_to_post,bucket
0,0,4.8,R,John Swab,304,Holywood,English,2023-02-20,2023-01-28,105,Doc\r\n facilitates a fragile truce between th...,Little Dixie,2794,John Swab,23,6.0
1,1,6.4,TV-PG,Paul Ziller,73,Holywood,English,2023-02-20,2023-02-05,84,Caterer\r\n Goldy Berry reunites with detectiv...,Grilling Season: A Curious Caterer Mystery,1002,John Christian Plummer,15,6.0
2,2,5.2,R,Ben Wheatley,1427,Holywood,"English,Hindi",2021-04-20,2021-06-18,107,As the world searches for a cure to a disastro...,In the Earth,14419,Ben Wheatley,59,7.0
3,3,6.5,R,Benjamin Caron,1781,Holywood,English,2023-02-13,2023-02-17,116,"Motivations are suspect, and expectations are ...",Sharper,18225,"Brian Gatewood, Alessandro Tanaka",4,4.0
4,4,6.9,PG-13,Ravi Kapoor,458,Holywood,English,2023-02-18,2022-12-02,80,An\r\n unmotivated South Asian American rapper...,Four Samosas,6912,Ravi Kapoor,78,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9897,9897,7.1,Not Rated,Biren Nag,1932,Bolywood,Hindi,1970-01-01,1962-05-11,158,"After a lusty Thakur rapes a young girl, she k...",Bees Saal Baad,6076,"Dhruva Chatterjee, Dev Kishan",2792,9.0
9898,9898,7.0,G,Guy Hamilton,2544,Holywood,"English,German,Polish,French",1970-01-01,1969-09-17,132,Historical reenactment of the air war in the e...,Battle of Britain,9319,"James Kennaway, Wilfred Greatorex, Derek Dempster",106,8.0
9899,9899,5.6,R,Barbara Topsøe-Rothenborg,12284,Holywood,"Spanish,German,English",2016-05-26,1970-01-01,90,"LOVE AT FIRST HICCUP is a charming, innocent, ...",Love at First Hiccup,36022,"Barbara Topsøe-Rothenborg, Søren Frellesen, De...",16947,10.0
9900,9900,7.1,Not Rated,Biren Nag,1932,Bolywood,Hindi,1970-01-01,1962-05-11,158,"After a lusty Thakur rapes a young girl, she k...",Bees Saal Baad,6077,"Dhruva Chatterjee, Dev Kishan",2792,9.0


In [6]:
cols_to_label_enc = ['appropriate_for', 'director', 'industry']
label_encs = {}
for col in cols_to_label_enc:
    label_encs[col] = {}
    unique_values = df[col].unique()
    for i, value in enumerate(unique_values):
        label_encs[col][value] = i
    df[col] = df[col].apply(lambda x: label_encs[col][x])

In [7]:
df['downloads'] = df['downloads'].apply(lambda x: float(str(x).replace(',', '')))
df['views'] = df['views'].apply(lambda x: float(str(x).replace(',', '')))

In [8]:
X = df[['IMDb-rating', 'appropriate_for', 'director', 'downloads', 'industry', 'run_time', 'views']]
y = df['days_to_post']

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15)

In [10]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [11]:
dt = DecisionTreeRegressor(max_depth=1, min_samples_split=2)
dt.fit(X_train, y_train)
ada_reg = AdaBoostRegressor(dt, n_estimators=10, random_state=42)
ada_reg.fit(X_train, y_train)

y_pred_train = ada_reg.predict(X_train)
y_pred_test = ada_reg.predict(X_test)

In [12]:
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print("MSE (train):", mse_train)
print("MSE (test):", mse_test)


MSE (train): 12797945.616647344
MSE (test): 11888227.544706177


In [13]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test,y_pred_test)
r2

-0.2763190831087299

In [14]:
from sklearn.ensemble import BaggingRegressor

bagging_regressor = BaggingRegressor(base_estimator=dt, n_estimators=10, max_samples=0.8, random_state=42)
bagging_regressor.fit(X_train, y_train)

y_pred = bagging_regressor.predict(X_test)
y_pred_test = bagging_regressor.predict(X_test)


mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

print("MSE (train):", mse_train)
print("MSE (test):", mse_test)

r2 = r2_score(y_test, y_pred)
print('R-squared score:', r2)

MSE (train): 12797945.616647344
MSE (test): 9052681.302750021
R-squared score: 0.028104916687392123




In [None]:
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
for i, ax in enumerate(axs.flat):
    n_estimate = i + 1
    dt_reg = DecisionTreeRegressor(max_depth=8, random_state=42)
    ada_reg = AdaBoostRegressor(dt_reg, n_estimators=n_estimate*100, random_state=42)
    ada_reg.fit(X_train, y_train)
    
    train_scores = list(ada_reg.staged_predict(X_train))
    train_deviances = [np.sum((y_train - y_pred) ** 2) / y_train.shape[0] for y_pred in train_scores]
    
    test_scores = list(ada_reg.staged_predict(X_test))
    test_deivances = [np.sum((y_test - y_pred) ** 2) / y_test.shape[0] for y_pred in test_scores]
    
    ax.plot(range(1, len(train_deviances) + 1), train_deviances, color='blue', label='Train Deviance')
    ax.plot(range(1, len(test_deivances) + 1), test_deivances, color='red', label='Test Deviance')
    ax.set_xlabel('Boosting Iterations')
    ax.set_ylabel('Deviance')
    ax.set_title('n_estimators = {}'.format(n_estimate*100)) 
plt.tight_layout()
plt.show()

In [None]:
fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
for max_depth, ax in zip(range(1,11), axs.flat):
    dt_reg = DecisionTreeRegressor(max_depth=max_depth)
    ada_reg = AdaBoostRegressor(dt_reg, n_estimators=100, random_state=42)
    ada_reg.fit(X_train, y_train)
        
    train_scores = list(ada_reg.staged_predict(X_train))
    train_deviances = [np.sum((y_train - y_pred) ** 2) / y_train.shape[0] for y_pred in train_scores]
    
    test_scores = list(ada_reg.staged_predict(X_test))
    test_deivances = [np.sum((y_test - y_pred) ** 2) / y_test.shape[0] for y_pred in test_scores]
   
    ax.plot(range(1, len(train_deviances) + 1), train_deviances, color='blue', label='Train Deviance')
    ax.plot(range(1, len(test_deivances) + 1), test_deivances, color='red', label='Test Deviance')
    ax.set_xlabel('Boosting Iterations')
    ax.set_ylabel('Deviance')
    ax.set_title('Max Depth = {}'.format(max_depth)) 

plt.tight_layout()
plt.show()