In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import math
import pandas as pd

import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
    

In [2]:

import numpy as np

def sigmoid(x):
    return 1/(1+np.exp(-x))


class SimpleLogisticRegression:
    def __init__(self, learning_rate=0.01, n_iteration=1000):
        self.learning_rate = learning_rate
        self.n_iteration = n_iteration
    
    def fit(self, X, y):
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features = X.shape
        self.weights = np.zeros((n_features, 1))
        self.bias = 0

        for _ in range(self.n_iteration):
            linear_predictions = np.dot(X, self.weights) + self.bias
            prediction = sigmoid(linear_predictions)
            dw = (1/n_samples) * np.dot(X.T, (prediction - y))
            db = (1/n_samples) * np.sum(prediction - y)
            self.weights -= self.learning_rate * np.dot(X.T, (prediction - y)).reshape(-1, 1)
            self.bias -= self.learning_rate * db
    
    def predict(self, X):
        linear_predictions = np.dot(X, self.weights) + self.bias
        prediction = sigmoid(linear_predictions)
        return np.round(prediction)
    
    def compute_loss(self,  X, y):
        y_pred=sigmoid(np.dot(self.w,X.T))
        return -(1/X.shape[0])*np.sum((y*np.log(y_pred))+(1-y)*np.log(1-y_pred))
    
    def accuracy(self,y_pred, y_test):
        return np.sum(y_pred==y_test)/len(y_test)
    

In [3]:
df=pd.read_csv('movies_dataset_processed.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,IMDb-rating,appropriate_for,director,downloads,industry,language,posted_date,release_date,run_time,storyline,title,views,writer,days_to_post,bucket
0,0,4.8,R,John Swab,304,Holywood,English,2023-02-20,2023-01-28,105,Doc\r\n facilitates a fragile truce between th...,Little Dixie,2794,John Swab,23,6.0
1,1,6.4,TV-PG,Paul Ziller,73,Holywood,English,2023-02-20,2023-02-05,84,Caterer\r\n Goldy Berry reunites with detectiv...,Grilling Season: A Curious Caterer Mystery,1002,John Christian Plummer,15,6.0
2,2,5.2,R,Ben Wheatley,1427,Holywood,"English,Hindi",2021-04-20,2021-06-18,107,As the world searches for a cure to a disastro...,In the Earth,14419,Ben Wheatley,59,7.0
3,3,6.5,R,Benjamin Caron,1781,Holywood,English,2023-02-13,2023-02-17,116,"Motivations are suspect, and expectations are ...",Sharper,18225,"Brian Gatewood, Alessandro Tanaka",4,4.0
4,4,6.9,PG-13,Ravi Kapoor,458,Holywood,English,2023-02-18,2022-12-02,80,An\r\n unmotivated South Asian American rapper...,Four Samosas,6912,Ravi Kapoor,78,7.0


In [4]:
df['release_year'] = pd.to_datetime(df['release_date']).dt.year
df['release_year']

0       2023
1       2023
2       2021
3       2023
4       2022
        ... 
9897    1962
9898    1969
9899    1970
9900    1962
9901    1969
Name: release_year, Length: 9902, dtype: int32

In [5]:
# from sklearn import preprocessing 
# label_encoder = preprocessing.LabelEncoder()
# df['appropriate_for']= label_encoder.fit_transform(df['appropriate_for'])
# df['IMDb-rating']= label_encoder.fit_transform(df['IMDb-rating'])
# df['views'] = df['views'].str.replace(',', '').astype(int)
# df['downloads'] = df['downloads'].str.replace(',', '').astype(int)

cols_to_label_enc = ['appropriate_for', 'director', 'industry']
label_encs = {}
for col in cols_to_label_enc:
    label_encs[col] = {}
    unique_values = df[col].unique()
    for i, value in enumerate(unique_values):
        label_encs[col][value] = i
    df[col] = df[col].apply(lambda x: label_encs[col][x])
    
df['downloads'] = df['downloads'].apply(lambda x: float(str(x).replace(',', '')))
df['views'] = df['views'].apply(lambda x: float(str(x).replace(',', '')))

In [6]:
test_size = 0.3
num_test_samples = int(test_size * df.shape[0])
num_train_samples = df.shape[0] - num_test_samples
df_train = df.sample(num_train_samples, random_state=42)
df_test = df.drop(df_train.index)

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [24]:

X_cols = ['IMDb-rating', 'appropriate_for', 'downloads','run_time', 'views', 'release_year']
Y_cols=['days_to_post']

X_train = df_train[X_cols].values
y_train = df_train[Y_cols].values.reshape(-1, 1)

X_test = df_test[X_cols].values
y_test = df_test[Y_cols].values.reshape(-1, 1)

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [25]:
clf = SimpleLogisticRegression(learning_rate = 0.1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

acc = clf.accuracy(y_pred, y_test)
print(acc)

0.21346801346801347


In [26]:
# SKLEARN 
    
from sklearn.linear_model import LogisticRegression

sk_lr=LogisticRegression()
sk_lr.fit(X_train,y_train.ravel())


logreg_train_accuracy = sk_lr.score(X_train,y_train)
logreg_test_accuracy = sk_lr.score(X_test, y_test)

trainDataPred = clf.predict(X_train)
testDataPred = clf.predict(X_test)

train_error = np.sum(trainDataPred != y_train) / float(y_train.shape[0])
test_error = np.sum(testDataPred != y_test) / float(y_test.shape[0])

print("\nScikit-learn logistic regression model with default parameters:")
print("Training accuracy: {:.3f}".format(logreg_train_accuracy))
print("Test accuracy: {:.3f}\n".format(logreg_test_accuracy))
print("Train Error: ",train_error)
print("Test Error: ",test_error)



Scikit-learn logistic regression model with default parameters:
Training accuracy: 0.203
Test accuracy: 0.204

Train Error:  0.7919792267743797
Test Error:  0.7865319865319865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

r2 = r2_score(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)

print(f'Mean squared error: {mse:.2f}')
print(f'R^2 score: {r2:.2f}')

Mean squared error: 10464912.88
R^2 score: -0.09


In [28]:
bestR2 = -0.09
accuracy = 0

for itera in [10, 100,1000,10000]:
    for a in [0.01,0.1, 0.33]:
            
            #INSERT CODE HERE
        clf = SimpleLogisticRegression(learning_rate = a, n_iteration = itera)
        clf.fit(X_train, y_train)
        train_pred = clf.predict(X_test)
        
        logreg_test_accuracy = sk_lr.score(X_test, y_test)
        logreg_train_accuracy = sk_lr.score(X_train, y_train)
        
        print(f"Iterations: {iter}, Learning rate: {a}\nTest accuracy: {logreg_test_accuracy}, Train accuracy: {logreg_train_accuracy}")
         
        r2 = r2_score(y_test,train_pred)    
        print(f'R^2 score: {r2:.5f}\n\n')
        
        if bestR2 < r2:
            bestR2 = r2
            best_iter = iter
            best_a = a
            accuracy = acc
    
    
print("Best Learning Rate is ", best_a, " for ", best_iter, " iteration")
print("Best R2 score obtained is: ", bestR2)

Iterations: 10000, Learning rate: 0.01
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.1
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.33
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.01
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.1
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.33
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.01
Test accuracy: 0.2037037037037037, Train accuracy: 0.20340450086555106
R^2 score: -0.08725


Iterations: 10000, Learning rate: 0.1
Test accuracy: 0.2037037037037037, Train