In [1]:
from imblearn.pipeline import Pipeline  # imblearn's pipeline supports resampling steps
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import os

In [2]:
# Load every CSV export in the dataset directory into a single DataFrame.
directory = r"/home/camiloav/Code/HomeSecurity/Classifier/dataset/"
# os.listdir returns entries in arbitrary order; sorting keeps the row order
# (and therefore the seeded train/test split downstream) stable between runs.
# Non-CSV entries are skipped so stray files or sub-directories cannot break the load.
csv_names = sorted(
    name for name in os.listdir(directory) if name.lower().endswith(".csv")
)
frames = []
for name in csv_names:
    print(f"Content of '{name}'")
    frames.append(pd.read_csv(os.path.join(directory, name)))
# Concatenate once instead of growing df inside the loop, which re-copies all
# previously loaded rows on every iteration. ignore_index gives every row a
# unique label instead of repeating each file's 0..n-1 index.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
print(df.shape)


Content of 'output_03.csv'
Content of 'output_02.csv'
Content of 'output_04.csv'
Content of 'output_01.csv'
Content of 'output_05.csv'
Content of 'output_06.csv'
(509073, 15)


In [3]:
# Binary target derived from the star rating: ratings 1-2 become class 0,
# ratings 3-5 and missing ratings become class 1.
rating_to_label = {1: 0, 2: 0, 3: 1, 4: 1, 5: 1, np.nan: 1}

# Built as one chained expression (no in-place mutation) so re-running the cell
# always produces the same frame from df.
values = (
    df[['Rating', 'Content']]
    .rename(columns={'Rating': 'label', 'Content': 'text'})
    .assign(label=lambda frame: frame['label'].map(rating_to_label))
    # Rows without text cannot be vectorised. Rows whose rating is not a key of
    # rating_to_label come out of .map() as NaN and would make the classifier
    # fail on NaN targets, so they are dropped here as well.
    .dropna(subset=['text', 'label'])
    .astype({'label': int})
)

# Stratify on the label so train and test keep the same class proportions;
# a plain random split can shift the minority-class share between the two sets.
X_train, X_test, y_train, y_test = train_test_split(
    values['text'],
    values['label'],
    test_size=0.2,
    random_state=42,
    stratify=values['label'],
)

In [None]:
# Seed for the stochastic steps (randomized SVD, SMOTE sampling); same value as
# the train/test split so the whole experiment is repeatable.
RANDOM_STATE = 42

# Pipeline: TF-IDF features -> truncated SVD (dense, low-dimensional) -> SMOTE
# oversampling -> logistic regression. imblearn's Pipeline applies the SMOTE
# step only while fitting, so predictions are made on un-resampled data.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svd', TruncatedSVD(random_state=RANDOM_STATE)),   # Dimensionality reduction to reduce memory footprint
    ('smote', SMOTE(random_state=RANDOM_STATE)),        # SMOTE oversampling
    ('clf', LogisticRegression(max_iter=1000))
])

# Parameter grid for the vectorizer, SVD and classifier:
# 2 * 2 * 2 * 3 = 24 combinations, each fitted on 3 folds (72 fits plus the final refit).
param_grid = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__max_df': [0.75, 0.85],
    'svd__n_components': [100, 200],
    'clf__C': [0.1, 1, 10]
}

# Exhaustive search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, scoring='f1_macro', n_jobs=-1)

# Run the grid search on the training data only; the test set stays untouched
# until the final evaluation below.
grid_search.fit(X_train, y_train)

# Report the best parameters and evaluate the refitted best pipeline on the test set.
print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_
preds = best_model.predict(X_test)
print("Classification Report on Test Data:")
print(classification_report(y_test, preds))
