In [8]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from scipy.optimize import fmin_tnc
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [9]:
class MyLogisticRegression:

    @staticmethod
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    @staticmethod
    def net_input(theta, x):
        return np.dot(x, theta)

    def probability(self, theta, x):
        return self.sigmoid(self.net_input(theta, x))

    def cost_function(self, theta, x, y):
        m = x.shape[0]
        total_cost = -(1 / m) * np.sum(
            y * np.log(self.probability(theta, x)) + (1 - y) * np.log(
                1 - self.probability(theta, x)))
        return total_cost

    def gradient(self, theta, x, y):
        m = x.shape[0]
        return (1 / m) * np.dot(x.T, self.sigmoid(self.net_input(theta, x)) - y)

    def fit(self, x, y, theta):
        opt_weights = fmin_tnc(func=self.cost_function, x0=theta, fprime=self.gradient,
                               args=(x, y.flatten()))
        self.w_ = opt_weights[0]
        return self

    def predict(self, x, probab_threshold=0.5):
        theta = self.w_[:, np.newaxis]
        predicted_classes = self.probability(theta, x)
        predicted_classes = (predicted_classes >= probab_threshold).astype(int)
        predicted_classes = predicted_classes.flatten()
        return predicted_classes


In [10]:
def load_data(path):
    marks_df = pd.read_csv(path)
    return marks_df

In [11]:
data = load_data("datasets/clean_tmdb.csv")

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

In [12]:
data.head()

Unnamed: 0,budget,popularity,gross,duration,num_voted_users,title_year,director_name,actor_1_name,actor_2_name,actor_3_name,...,Romance,Horror,Mystery,History,War,Music,Documentary,Foreign,TV Movie,nice
0,5.107181,4.053183,16.615709,2.438596,8.999729,0.526213,-0.480838,1.148212,1.710379,1.28907,...,-0.478229,-0.348064,-0.27949,-0.20681,-0.175806,-0.200152,-0.153099,-0.084436,-0.040846,1
1,6.654402,3.696258,5.396331,2.748263,3.0862,0.365076,-0.649452,-0.116096,0.826611,0.123064,...,-0.478229,-0.348064,-0.27949,-0.20681,-0.175806,-0.200152,-0.153099,-0.084436,-0.040846,1
2,5.303653,2.699638,4.903054,1.81926,3.058657,1.009625,1.097744,-1.024816,-1.153637,0.384027,...,-0.478229,-0.348064,-0.27949,-0.20681,-0.175806,-0.200152,-0.153099,-0.084436,-0.040846,1
3,5.426449,2.854798,6.15744,2.57131,6.817394,0.767919,-1.243964,-1.153223,0.583511,-0.632063,...,-0.478229,-0.348064,-0.27949,-0.20681,-0.175806,-0.200152,-0.153099,-0.084436,-0.040846,1
4,5.672039,0.705198,1.239734,1.111448,1.161467,0.767919,-1.632068,1.416548,0.349275,1.166917,...,-0.478229,-0.348064,-0.27949,-0.20681,-0.175806,-0.200152,-0.153099,-0.084436,-0.040846,1


In [13]:
X = np.c_[np.ones((X.shape[0], 1)), X]
y = y[:, np.newaxis]
theta = np.zeros((X.shape[1], 1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
model = MyLogisticRegression()
model.fit(X_train, y_train, theta)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("My log reg:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)

My log reg:

Train:

accuracy: 0.7221876942200124
report:
              precision    recall  f1-score   support

           0       0.67      0.68      0.67      1359
           1       0.76      0.75      0.76      1859

    accuracy                           0.72      3218
   macro avg       0.72      0.72      0.72      3218
weighted avg       0.72      0.72      0.72      3218

*******************************************************

Test:

accuracy: 0.7419558359621451
report:
              precision    recall  f1-score   support

           0       0.69      0.70      0.69       657
           1       0.78      0.77      0.78       928

    accuracy                           0.74      1585
   macro avg       0.73      0.74      0.73      1585
weighted avg       0.74      0.74      0.74      1585



In [30]:
# Using scikit-learn
model = LogisticRegression()
model.fit(X_train, y_train)

predicted_classes = model.predict(X_train)
train_accuracy = accuracy_score(predicted_classes, y_train.flatten())
train_report = classification_report(predicted_classes, y_train.flatten())

predicted_classes = model.predict(X_test)
test_accuracy = accuracy_score(predicted_classes, y_test.flatten())
test_report = classification_report(predicted_classes, y_test.flatten())

print("Scikit-learn:")
print("\nTrain:\n\naccuracy: {}".format(train_accuracy))
print("report:")
print(train_report)
print('*'*55)
print("\nTest:\n\naccuracy: {}".format(test_accuracy))
print("report:")
print(test_report)

Scikit-learn:

Train:

accuracy: 0.7688004972032318
report:
              precision    recall  f1-score   support

           0       0.72      0.74      0.73      1349
           1       0.81      0.79      0.80      1869

    accuracy                           0.77      3218
   macro avg       0.76      0.76      0.76      3218
weighted avg       0.77      0.77      0.77      3218

*******************************************************

Test:

accuracy: 0.7753943217665615
report:
              precision    recall  f1-score   support

           0       0.73      0.73      0.73       660
           1       0.81      0.80      0.81       925

    accuracy                           0.78      1585
   macro avg       0.77      0.77      0.77      1585
weighted avg       0.78      0.78      0.78      1585

