In [102]:
from imblearn.pipeline import Pipeline  # Use imblearn's Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import FunctionTransformer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.decomposition import TruncatedSVD
from imblearn.over_sampling import RandomOverSampler
import os

In [98]:
# Load and prepare your data
directory = r"/home/camiloav/Code/HomeSecurity/Classifier/dataset/"
df = pd.DataFrame()
for name in os.listdir(directory):
    with open(os.path.join(directory, name)) as f:
        print(f"Content of '{name}'")
        csvdf = pd.read_csv(f)
        df = pd.concat([df, csvdf])
print(df.shape)


Content of 'output_03.csv'
Content of 'output_02.csv'
Content of 'output_04.csv'
Content of 'output_01.csv'
Content of 'output_05.csv'
Content of 'output_06.csv'
(509073, 15)


In [99]:
values = df[['Rating', 'Content']].copy()
values.rename(columns={'Rating': 'label', 'Content': 'text'}, inplace=True)
values['label'] = values['label'].map({1:0, 2:0, 3:1, 4:1, 5:1, np.nan:1})
values.dropna(subset=['text'], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(
    values['text'], values['label'], test_size=0.2, random_state=42
)


In [103]:

# Define a transformer to convert sparse matrices to dense arrays
to_dense = FunctionTransformer(lambda x: x.todense(), accept_sparse=True)

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('ros', RandomOverSampler()),
    ('clf', LogisticRegression(max_iter=1000))
])

pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
print("RandomOverSampler Pipeline Report:")
print(classification_report(y_test, preds))

RandomOverSampler Pipeline Report:
              precision    recall  f1-score   support

           0       0.96      0.85      0.90     87515
           1       0.45      0.76      0.56     14300

    accuracy                           0.83    101815
   macro avg       0.70      0.81      0.73    101815
weighted avg       0.88      0.83      0.85    101815

