<a href="https://colab.research.google.com/github/BaruchG/tfidf-regression-experiments/blob/main/tf_idf_regression_production.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import math
from scipy.sparse import vstack
import pandas as pd
from sklearn.linear_model import LogisticRegression
from scipy.sparse import csr_matrix
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing._data import normalize
from sklearn.utils.validation import _deprecate_positional_args, check_array, FLOAT_DTYPES, check_is_fitted
from sklearn.utils.fixes import _astype_copy_false
import scipy.sparse as sp

In [None]:
def _document_frequency(X):
    """Return, per feature, the number of documents with a stored entry.

    Parameters
    ----------
    X : scipy sparse matrix
        Count matrix with one row per document and one column per feature.

    Returns
    -------
    numpy.ndarray
        One integer count per feature column.
    """
    if not sp.isspmatrix_csr(X):
        # Any non-CSR input is assumed to be column-compressed (CSC), where
        # consecutive ``indptr`` differences are the per-column entry counts.
        return np.diff(X.indptr)
    # In CSR, ``indices`` holds the column of every stored entry, so a
    # histogram over it is the per-column count; ``minlength`` keeps
    # trailing empty columns in the result.
    return np.bincount(X.indices, minlength=X.shape[1])
      
class TfidfTransformerCustom(TransformerMixin, BaseEstimator):
    """Re-weight a term-count matrix with tf-idf.

    ``fit`` learns one idf weight per feature; ``transform`` multiplies the
    raw (un-normalised) counts by those weights and only afterwards applies
    the optional row normalisation.

    Parameters
    ----------
    norm : str or None, default='l2'
        Norm passed to ``normalize`` for the output rows. A falsy value
        skips normalisation.
    use_idf : bool, default=True
        Whether idf weights are learned in ``fit`` and applied in
        ``transform``.
    smooth_idf : bool, default=True
        When true, one is added to every document frequency and to the
        document count before the idf ratio is taken.
    sublinear_tf : bool, default=False
        When true, every stored count ``tf`` becomes ``1 + log(tf)``.
    """

    @_deprecate_positional_args
    def __init__(self, *, norm='l2', use_idf=True, smooth_idf=True,
                 sublinear_tf=False):
        self.norm = norm
        self.use_idf = use_idf
        self.smooth_idf = smooth_idf
        self.sublinear_tf = sublinear_tf

    def fit(self, X, y=None):
        """Learn the idf weight of every feature from count matrix ``X``.

        Parameters
        ----------
        X : array-like or sparse matrix
            Term counts, one row per document and one column per feature.
        y : ignored

        Returns
        -------
        self
        """
        X = check_array(X, accept_sparse=('csr', 'csc'))
        if not sp.issparse(X):
            X = sp.csr_matrix(X)

        # Keep a floating input dtype; anything else is promoted to float64.
        if X.dtype in FLOAT_DTYPES:
            dtype = X.dtype
        else:
            dtype = np.float64

        if not self.use_idf:
            return self

        n_docs, n_terms = X.shape
        doc_freq = _document_frequency(X)
        doc_freq = doc_freq.astype(dtype, **_astype_copy_false(doc_freq))

        # Optional smoothing: add one to every document frequency and to the
        # document count.
        smoothing = int(self.smooth_idf)
        doc_freq += smoothing
        n_docs += smoothing

        # The "+ 1" keeps terms whose log-ratio is zero from being
        # suppressed entirely.
        weights = np.log(n_docs / doc_freq) + 1

        # Stored as a sparse diagonal so that applying the weights is a
        # single sparse matrix product.
        self._idf_diag = sp.diags(
            weights,
            offsets=0,
            shape=(n_terms, n_terms),
            format='csr',
            dtype=dtype,
        )
        return self

    def transform(self, X, copy=True):
        """Apply the tf-idf weighting to count matrix ``X``.

        Parameters
        ----------
        X : array-like or sparse matrix
            Term counts with the same number of features seen in ``fit``.
        copy : bool, default=True
            Forwarded to ``check_array``.

        Returns
        -------
        sparse matrix
            The weighted (and, if ``norm`` is set, row-normalised) matrix.

        Raises
        ------
        ValueError
            If ``use_idf`` is set and ``X`` has a different number of
            features than the fitted idf vector.
        """
        X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, copy=copy)
        if not sp.issparse(X):
            X = sp.csr_matrix(X, dtype=np.float64)

        n_features = X.shape[1]

        if self.sublinear_tf:
            # In place on the stored entries: tf -> log(tf) + 1.
            np.log(X.data, out=X.data)
            X.data += 1

        if self.use_idf:
            # ``idf_`` is a property, so automatic fitted-attribute detection
            # does not pick it up; the attribute is named explicitly.
            check_is_fitted(self, attributes=["idf_"],
                            msg='idf vector is not fitted')

            fitted_width = self._idf_diag.shape[0]
            if fitted_width != n_features:
                raise ValueError(
                    "Input has n_features=%d while the model"
                    " has been trained with n_features=%d"
                    % (n_features, fitted_width)
                )

            # Scale the still un-normalised term frequencies column by
            # column (an in-place ``*=`` does not work here).
            X = X * self._idf_diag

        if self.norm:
            # Normalisation runs after the idf scaling, on the tf-idf rows;
            # the term frequencies themselves are never normalised.
            X = normalize(X, norm=self.norm, copy=False)
        return X

    @property
    def idf_(self):
        """Fitted idf weights as a flat array, one value per feature."""
        # If ``_idf_diag`` is missing this raises AttributeError, which makes
        # ``hasattr(self, "idf_")`` evaluate to False.
        column_sums = self._idf_diag.sum(axis=0)
        return np.ravel(column_sums)

    @idf_.setter
    def idf_(self, value):
        weights = np.asarray(value, dtype=np.float64)
        size = weights.shape[0]
        self._idf_diag = sp.spdiags(weights, diags=0, m=size,
                                    n=size, format='csr')

    def _more_tags(self):
        """Return the extra estimator tags declared by this transformer."""
        tags = {'X_types': 'sparse'}
        return tags

In [None]:
# Load the IMDB reviews and split them by position: the first TRAIN_SIZE rows
# are the training set, every remaining row is held out for validation.
TRAIN_SIZE = 4000
imdb = pd.read_csv("/content/drive/My Drive/datasets/IMDB_Dataset.csv")
trainX = imdb['review'][:TRAIN_SIZE]
trainY = imdb['sentiment'][:TRAIN_SIZE]
# Validation starts exactly where training stops. The previous start of 4001
# silently dropped row 4000 from both splits, so outputs recorded before
# this change report one validation row fewer.
validX = imdb['review'][TRAIN_SIZE:]
validY = imdb['sentiment'][TRAIN_SIZE:]

In [None]:
# Bag-of-words counts over the training reviews: English stop words are
# removed and the vocabulary is limited to 5000 features.
vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)
vectorized = vectorizer.fit_transform(trainX)

# Fit the custom tf-idf weighting (idf enabled, no smoothing) on those counts
# and apply it to the same matrix.
tfidfModel = TfidfTransformerCustom(smooth_idf=False, use_idf=True)
t = tfidfModel.fit(vectorized)
tout = t.transform(vectorized)

# Dense copy of the weighted training matrix.
arr = tout.toarray()

In [None]:
# Train a logistic-regression classifier on the dense tf-idf training matrix.
clf = LogisticRegression(random_state=0)
clf = clf.fit(arr, trainY)

In [None]:
# Push the validation reviews through the vectorizer and tf-idf model that
# were fitted on the training data; only ``transform`` is called here, so
# nothing is refitted.
validVectorized = vectorizer.transform(raw_documents=validX)
validtout = t.transform(X=validVectorized)
# Dense copy of the weighted validation matrix.
arrValid = validtout.toarray()

In [None]:
# Validation metrics: predict on the held-out matrix and print the report.
preds = clf.predict(arrValid)
validation_report = classification_report(validY, preds)
print(validation_report)

              precision    recall  f1-score   support

    negative       0.86      0.85      0.85     22973
    positive       0.85      0.86      0.86     23026

    accuracy                           0.86     45999
   macro avg       0.86      0.86      0.86     45999
weighted avg       0.86      0.86      0.86     45999



In [None]:
# Training metrics: predict on the matrix the classifier was fitted on and
# print the report.
preds = clf.predict(arr)
training_report = classification_report(trainY, preds)
print(training_report)

              precision    recall  f1-score   support

    negative       0.94      0.94      0.94      2027
    positive       0.94      0.94      0.94      1973

    accuracy                           0.94      4000
   macro avg       0.94      0.94      0.94      4000
weighted avg       0.94      0.94      0.94      4000

