In [26]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib


In [27]:
folder_path = 'data/'  # Replace with the path to your folder

# os.listdir() returns directory entries in arbitrary, platform-dependent order.
# Sorting fixes the order in which the files are later concatenated, so the row
# positions that the seeded train/test split shuffles are the same on every run
# and on every machine.
file_list = sorted(os.listdir(folder_path))

# Keep only the CSV files; every other directory entry is ignored.
csv_files = [file_name for file_name in file_list if file_name.endswith('.csv')]

In [28]:
def _read_csv_from_folder(file_name):
    """Read one CSV located in `folder_path` into a DataFrame.

    `low_memory=False` makes pandas parse the file in a single pass instead of
    in chunks.
    """
    csv_path = os.path.join(folder_path, file_name)
    return pd.read_csv(csv_path, low_memory=False)


# Stack every CSV found in the folder into one frame with a fresh 0..n-1 index.
combined_data = pd.concat(
    [_read_csv_from_folder(name) for name in csv_files],
    ignore_index=True,
)

In [29]:
# Preprocessing
# This cell rebinds `combined_data` to the preprocessed frame, so it expects the
# freshly loaded data: running it a second time without reloading fails on the
# drop() because the columns are already gone.

# Remove the columns that are not used as model inputs.
combined_data = combined_data.drop(columns=['AnoCalendario', 'DataArquivamento', 'NumeroCNPJ'])

# Missing consumer sex / age band become an explicit 'N/A' category.
combined_data = combined_data.fillna(value={'SexoConsumidor': 'N/A', 'FaixaEtariaConsumidor': 'N/A'})

categorical_columns = ['CodigoRegiao', 'UF', 'Tipo', 'SexoConsumidor', 'FaixaEtariaConsumidor', 'DescCNAEPrincipal']

# Fit a separate encoder for every column and keep each one in `label_encoders`.
# Refitting a single shared encoder would overwrite its `classes_` on every
# iteration, leaving only the last column's category -> code mapping and making
# it impossible to encode new raw rows (or decode the codes) consistently.
# NOTE(review): the integer codes impose an arbitrary order on nominal
# categories, which the linear model reads as a numeric magnitude - consider
# one-hot encoding instead.
label_encoders = {}
for col in categorical_columns:
    # `label_encoder` ends up bound to the encoder of the last column, which is
    # the state this name had before per-column encoders were kept.
    label_encoder = LabelEncoder()
    combined_data[col] = label_encoder.fit_transform(combined_data[col])
    label_encoders[col] = label_encoder

# Every encoded categorical column is used as a model input.
features = list(categorical_columns)
label = 'Atendida'

X = combined_data[features]
y = combined_data[label]

In [30]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# partition reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [31]:
# Standardise the features. The scaling statistics are learned from the training
# rows only and then applied unchanged to both partitions, so nothing from the
# test rows influences the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [32]:
# Fit a logistic-regression classifier on the standardised training features,
# with the solver's iteration cap raised to 1000.
lr_settings = {'max_iter': 1000}
model = LogisticRegression(**lr_settings)
model.fit(X_train_scaled, y_train)

In [33]:
# Predict a class label for every held-out row from its standardised features.
y_pred = model.predict(X=X_test_scaled)

In [34]:
# Collect the standardised feature vector, the true label and the predicted
# label of each test row as plain Python lists, then build the results table.
# Columns appear in insertion order.
results_columns = {}
results_columns['Features'] = X_test_scaled.tolist()
results_columns['True Labels'] = y_test.tolist()
results_columns['Predicted Labels'] = y_pred.tolist()
results_df = pd.DataFrame(results_columns)

In [35]:
# Show the unscaled test features side by side with the actual and predicted
# labels. assign() works on a copy, so X_test itself is left unchanged;
# y_test is aligned on the row index while y_pred is attached positionally.
input_data = X_test.assign(Actual_Label=y_test, Predicted_Label=y_pred)
print(input_data)

         CodigoRegiao  UF  Tipo  SexoConsumidor  FaixaEtariaConsumidor  \
1157600             4   8     1               0                      6   
597739              1   1     1               0                      5   
1117229             3  17     1               1                      4   
615814              1   4     1               0                      3   
515265              1   9     1               0                      4   
...               ...  ..   ...             ...                    ...   
667133              4  12     1               1                      0   
29867               1   1     1               1                      4   
33711               2  10     1               1                      0   
876795              2  10     1               0                      4   
29935               1   1     1               1                      6   

         DescCNAEPrincipal Actual_Label Predicted_Label  
1157600                506            N              

In [36]:
# Lift the column-width limit so the full feature lists are shown, then print
# the results table.
pd.options.display.max_colwidth = None
print(results_df)

                                                                                                                               Features  \
0        [1.8067415353193315, -0.7940837888983284, 0.05278629093154668, -0.9274421810633475, 0.7939124969480256, -0.010215353958780623]   
1         [-0.8028800372029462, -1.7458500791350475, 0.05278629093154668, -0.9274421810633475, 0.2905761190680759, -1.1193763954035048]   
2          [0.9368676778119055, 0.4296157271203103, 0.05278629093154668, 1.0065345089537876, -0.21276025881187374, -0.3040956298971263]   
3         [-0.8028800372029462, -1.337950240462168, 0.05278629093154668, -0.9274421810633475, -0.7160966366918234, -0.6358959414404198]   
4       [-0.8028800372029462, -0.6581171760073685, 0.05278629093154668, -0.9274421810633475, -0.21276025881187374, -1.2015364725475584]   
...                                                                                                                                 ...   
241210      [1.806741535319

In [37]:
# Per-class precision / recall / F1 of the predictions on the held-out rows.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           N       0.62      0.02      0.04     91359
           S       0.62      0.99      0.77    149856

    accuracy                           0.62    241215
   macro avg       0.62      0.51      0.41    241215
weighted avg       0.62      0.62      0.49    241215



In [38]:
# dump model
# joblib.dump() does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs('models', exist_ok=True)

# The classifier was trained on standardised features, so the fitted scaler is
# stored next to it; without the scaler the saved model cannot score raw
# feature rows.
joblib.dump(scaler, 'models/binary_classification_scaler.joblib')

# Kept as the last expression so the cell still displays the model file path.
joblib.dump(model, 'models/binary_classification.joblib')

['models/binary_classification.joblib']