## Nombre: Alberto José Mendoza Peñaloza
## Prueba Ingeniero de IA Mercately

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#0. Importar librerías

In [2]:
# Libreria de NLP
import nltk
nltk.download('punkt')
nltk.download("stopwords")
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer

# Librerias de manejo de datos
import os
import re
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import unicodedata

# Entrenamiento y Pipeline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


#1. Cargar datos

In [None]:
data_path = "/content/drive/MyDrive/TESTS/Mercately/data/"

In [None]:
def read_txt_file(path):
  textfile = open(path, "r")
  content = textfile.read().split('\n')
  textfile.close()
  filtered_list = list(filter(lambda s: ':' in s, content))
  chat = "".join(filtered_list)
  return chat

In [None]:
data = {'class':[],
        'text':[]}

class0_path = data_path+"class0"
for f in os.listdir(class0_path):
  path = class0_path+'/'+f
  data['text'].append(read_txt_file(path))
  data['class'].append(0)


class1_path = data_path+"class1"
for f in os.listdir(class1_path):
  path = class1_path+'/'+f
  data['text'].append(read_txt_file(path))
  data['class'].append(1)

In [None]:
data = pd.DataFrame(data)

In [None]:
data.sample(5)

Unnamed: 0,class,text
30,1,Vendedor: ¡Hola! Bienvenido a nuestra tienda d...
12,0,Vendedor: ¡Hola! Bienvenido a nuestra tienda d...
17,0,"Vendedor: Hola, ¿en qué puedo ayudarte hoy? Te..."
35,1,Vendedor: ¡Hola! ¿En qué puedo ayudarte hoy?Co...
20,1,Vendedor: ¡Hola! Bienvenido a nuestra tienda. ...


#2. Preparación de datos

## 2.1 Train-Test Split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(['class'],axis=1),data['class'], test_size=0.2, random_state=24)

## 2.2 Limpieza de datos

In [None]:
def quitar_tildes(texto):
    # Normalizar el texto en forma NFD (Normalization Form D)
    texto_normalizado = unicodedata.normalize('NFD', texto)
    # Filtrar los caracteres que no son marcas diacríticas
    texto_sin_tildes = ''.join(c for c in texto_normalizado if unicodedata.category(c) != 'Mn')
    # Devolver el texto sin tildes
    return texto_sin_tildes

In [None]:
# Data Cleaning
text = X_train['text'].iloc[0]
# getting all the text in lower case
text = text.lower()
# get accent marks off
text = quitar_tildes(text)
# Filter punctuation marks and exlude numbers
text = re.sub(r'[^a-z\s]', '', text)

## 2.3 Tokenization


Get unique words

In [None]:
tonekized_text = word_tokenize(text)

## 2.4 Stop Words

In [None]:
stop_words = set(stopwords.words("spanish"))
words = [word for word in tonekized_text if word not in stop_words]

## 2.5 Lematización

In [None]:
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]

## 2.6 Stemming

In [None]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words]

## 2.7 Vectorization

use count vectorizer to convert categorical features in numerical

In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(words)

## 2.8 Pipeline de Preparación de datos

- Haremos un pipeline con stemming y otro pipeline con lemmatizing.

In [None]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
      return self

    def quitar_tildes(self,texto):
      # Normalizar el texto en forma NFD (Normalization Form D)
      texto_normalizado = unicodedata.normalize('NFD', texto)
      # Filtrar los caracteres que no son marcas diacríticas
      texto_sin_tildes = ''.join(c for c in texto_normalizado if unicodedata.category(c) != 'Mn')
      # Devolver el texto sin tildes
      return texto_sin_tildes

    def manage_stop_words(self,tonekized_text):
      stop_words = set(stopwords.words("spanish"))
      words = [word for word in tonekized_text if word not in stop_words]
      return words

    def lemmatization(self,tonekized_text):
      lemmatizer = WordNetLemmatizer()
      words = [lemmatizer.lemmatize(word) for word in tonekized_text]
      return words

    def transform(self, X, y=None):
      X_copy = X.copy()

      # Data Cleaning
      X_copy['text'] = X_copy['text'].str.lower()
      X_copy['text'] = X_copy['text'].apply(lambda x : self.quitar_tildes(x))
      X_copy['text'] = X_copy['text'].apply(lambda x : re.sub(r'[^a-z\s]', '', x))
      #Tokenization
      X_copy['text'] = X_copy['text'].apply(lambda x : word_tokenize(x))
      #stop words
      X_copy['text'] = X_copy['text'].apply(lambda x : self.manage_stop_words(x))
      #lemmatization
      X_copy['text'] = X_copy['text'].apply(lambda x : self.lemmatization(x))

      X_copy['text'] = X_copy['text'].apply(lambda x : " ".join(x))

      vectorizer = CountVectorizer()
      X = vectorizer.fit_transform(X_copy['text'])
      vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

      return vectorized_df

pipe_preparacion_lemm = Pipeline(steps=[
    ('custom_proccesing', CustomTransformer()),
    ('clf', SVC()),
])

In [None]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
      return self

    def quitar_tildes(self,texto):
      # Normalizar el texto en forma NFD (Normalization Form D)
      texto_normalizado = unicodedata.normalize('NFD', texto)
      # Filtrar los caracteres que no son marcas diacríticas
      texto_sin_tildes = ''.join(c for c in texto_normalizado if unicodedata.category(c) != 'Mn')
      # Devolver el texto sin tildes
      return texto_sin_tildes

    def manage_stop_words(self,tonekized_text):
      stop_words = set(stopwords.words("spanish"))
      words = [word for word in tonekized_text if word not in stop_words]
      return words

    def stemming(self,tonekized_text):
      stemmer = PorterStemmer()
      stemmed_words = [stemmer.stem(word) for word in tonekized_text]
      return words

    def transform(self, X, y=None):
      X_copy = X.copy()

      # Data Cleaning
      X_copy['text'] = X_copy['text'].str.lower()
      X_copy['text'] = X_copy['text'].apply(lambda x : self.quitar_tildes(x))
      X_copy['text'] = X_copy['text'].apply(lambda x : re.sub(r'[^a-z\s]', '', x))
      #Tokenization
      X_copy['text'] = X_copy['text'].apply(lambda x : word_tokenize(x))
      #stop words
      X_copy['text'] = X_copy['text'].apply(lambda x : self.manage_stop_words(x))
      #lemmatization
      X_copy['text'] = X_copy['text'].apply(lambda x : self.stemming(x))

      X_copy['text'] = X_copy['text'].apply(lambda x : " ".join(x))

      vectorizer = CountVectorizer()
      X = vectorizer.fit_transform(X_copy['text'])
      vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

      return vectorized_df


pipe_preparacion_stemming = Pipeline(steps=[
    ('custom_proccesing', CustomTransformer()),
    ('clf', SVC()),
])

# 3. Entrenamiento de Modelo

## 3.1 Gridsearch with stemming

In [None]:
# Parameters for grid search for each model
parameters = [
    {
        'clf': [SVC()],
        'clf__kernel': ['linear', 'rbf'],
        'clf__C': [1, 10, 100],
    },
    {
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': [50, 100, 200,100, 1000],
        'clf__max_depth': [None, 10, 20],
    },
    {
        'clf': [LogisticRegression()],
        'clf__C': [0.1, 1, 10],
        'clf__penalty': ['l1', 'l2'],
    },
    {
        'clf': [MultinomialNB()],
        'clf__alpha': [0.01, 0.1, 1.0],
    }
]

In [None]:
# Perform grid search with scoring parameter
grid_search_stem = GridSearchCV(pipe_preparacion_stemming, parameters, scoring='accuracy', verbose=1)

In [None]:
%%time
# Fit the grid search
grid_search_stem.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


15 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_

CPU times: user 1min 4s, sys: 454 ms, total: 1min 5s
Wall time: 1min 8s


## 3.2 Evaluación mejor estimador train

In [None]:
grid_search_stem.best_estimator_.steps

[('custom_proccesing', CustomTransformer()),
 ('clf', RandomForestClassifier(max_depth=10, n_estimators=50))]

Vemos que el mejor model es el random forest

In [None]:
def evaluate_model(y_true, y_pred):
  accuracy = accuracy_score(y_true, y_pred)
  precision = precision_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)

  print("Accuracy: {:.2f}%".format(accuracy * 100))
  print("Precision: {:.2f}%".format(precision * 100))
  print("Recall: {:.2f}%".format(recall * 100))
  print("F1-score: {:.2f}%".format(f1 * 100))

  print("\nClassification Report:")
  print(classification_report(y_true, y_pred))

In [None]:
y_pred = grid_search_stem.best_estimator_.predict(X_train)

In [None]:
evaluate_model(y_train, y_pred)

Accuracy: 50.00%
Precision: 0.00%
Recall: 0.00%
F1-score: 0.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.50      1.00      0.67        16
           1       0.00      0.00      0.00        16

    accuracy                           0.50        32
   macro avg       0.25      0.50      0.33        32
weighted avg       0.25      0.50      0.33        32



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 3.3 Gridsearch with lemmatization

In [None]:
# Parameters for grid search for each model
parameters = [
    {
        'clf': [SVC()],
        'clf__kernel': ['linear', 'rbf'],
        'clf__C': [1, 10, 100],
    },
    {
        'clf': [RandomForestClassifier()],
        'clf__n_estimators': [50, 100, 200,100, 1000],
        'clf__max_depth': [None, 10, 20],
    },
    {
        'clf': [LogisticRegression()],
        'clf__C': [0.1, 1, 10],
        'clf__penalty': ['l1', 'l2'],
    },
    {
        'clf': [MultinomialNB()],
        'clf__alpha': [0.01, 0.1, 1.0],
    }
]

In [None]:
# Perform grid search with scoring parameter
grid_search_lemm = GridSearchCV(pipe_preparacion_lemm, parameters, scoring='accuracy', verbose=1)

In [None]:
%%time
# Fit the grid search
grid_search_lemm.fit(X_train, y_train)

Fitting 5 folds for each of 30 candidates, totalling 150 fits


[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 276, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    return getattr(estimator, method)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 481, in predict
    return self.steps[-1][1].predict(Xt, **predict_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 820, in predict
    y = super().predict(X)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 433, in predict
    X = self._validate_for_predict(X)
  File "/usr/local/lib/python3.10/dist-packa

CPU times: user 52 s, sys: 1.54 s, total: 53.6 s
Wall time: 53.7 s


## 3.4 Evaluación mejor estimador train

In [None]:
grid_search_stem.best_estimator_.steps

[('custom_proccesing', CustomTransformer()),
 ('clf', RandomForestClassifier(max_depth=10, n_estimators=50))]

In [None]:
y_pred = grid_search_lemm.best_estimator_.predict(X_train)

In [None]:
evaluate_model(y_train, y_pred)

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        16

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32



## Conclusión Entrenamiento

La opción de lemmatization funcionó mucho mejor para la clasificación en el caso de estudio que la opción de stemming. Obteniendo un resultado perfecto en métricas para dataset train y test.

# 4. Evalución de Modelo

## Evaluación con el dataset test del mejor modelo

In [None]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
      return self

    def quitar_tildes(self,texto):
      # Normalizar el texto en forma NFD (Normalization Form D)
      texto_normalizado = unicodedata.normalize('NFD', texto)
      # Filtrar los caracteres que no son marcas diacríticas
      texto_sin_tildes = ''.join(c for c in texto_normalizado if unicodedata.category(c) != 'Mn')
      # Devolver el texto sin tildes
      return texto_sin_tildes

    def manage_stop_words(self,tonekized_text):
      stop_words = set(stopwords.words("spanish"))
      words = [word for word in tonekized_text if word not in stop_words]
      return words

    def lemmatization(self,tonekized_text):
      lemmatizer = WordNetLemmatizer()
      words = [lemmatizer.lemmatize(word) for word in tonekized_text]
      return words

    def transform(self, X, y=None):
      X_copy = X.copy()

      # Data Cleaning
      X_copy['text'] = X_copy['text'].str.lower()
      X_copy['text'] = X_copy['text'].apply(lambda x : self.quitar_tildes(x))
      X_copy['text'] = X_copy['text'].apply(lambda x : re.sub(r'[^a-z\s]', '', x))
      #Tokenization
      X_copy['text'] = X_copy['text'].apply(lambda x : word_tokenize(x))
      #stop words
      X_copy['text'] = X_copy['text'].apply(lambda x : self.manage_stop_words(x))
      #lemmatization
      X_copy['text'] = X_copy['text'].apply(lambda x : self.lemmatization(x))

      X_copy['text'] = X_copy['text'].apply(lambda x : " ".join(x))

      vectorizer = CountVectorizer()
      X = vectorizer.fit_transform(X_copy['text'])
      vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

      return vectorized_df

pipe_preparacion_lemm = Pipeline(steps=[
    ('custom_proccesing', CustomTransformer()),
])

In [None]:
X_train_trans = pipe_preparacion_lemm.fit_transform(X_train)
X_test_trans = pipe_preparacion_lemm.transform(X_test)
list_of_columns_train = list(X_train_trans.columns)
list_of_columns_test = list(X_test_trans.columns)
intersection_list = list(set(list_of_columns_train).intersection(set(list_of_columns_test)))
X_test_trans = X_test_trans[intersection_list]

for c in list_of_columns_train:
  if c not in list_of_columns_test:
    X_test_trans[c] = 0

X_test_trans = X_test_trans[list_of_columns_train]

  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_trans[c] = 0
  X_test_t

In [None]:
best_model = RandomForestClassifier(max_depth=10, n_estimators=50)
best_model.fit(X_train_trans, y_train)

In [None]:
y_pred = best_model.predict(X_test_trans)
evaluate_model(y_test, y_pred)

Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1-score: 100.00%

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



**Rendimiento perfecto con el dataset de test**

#5. Creación de scripts solicitados

## 5.1 export model files

In [None]:
with open('model_words.txt', 'w') as file:
  file.writelines(s + '\n' for s in list_of_columns_train)

In [None]:
list_of_words = []
with open('model_words.txt', 'r') as file:
  list_of_words = file.read().split()

In [None]:
joblib.dump(best_model, 'classify.pkl')

['classify.pkl']

In [None]:
joblib.dump(pipe_preparacion_lemm, 'prepare.pkl')

['prepare.pkl']

## 5.2 Intentar con nuevos chats

In [None]:
def match_columns(pipe, data):
  trans_data = pipe.transform(data)
  model_words = []
  with open('model_words.txt', 'r') as file:
    model_words = file.read().split()

  list_of_columns_test = list(trans_data.columns)
  intersection_list = list(set(model_words).intersection(set(list_of_columns_test)))
  X_test_trans = trans_data[intersection_list]

  for c in list_of_columns_train:
    if c not in list_of_columns_test:
      X_test_trans[c] = 0
  X_test_trans = X_test_trans[model_words]

  return X_test_trans

In [None]:
chat1="""
Agente (Laura): ¡Hola! ¿En qué puedo ayudarte hoy?

Cliente (Carlos): Hola, estoy buscando un nuevo smartphone. Mi viejo ya está fallando y necesito uno con una buena cámara y batería duradera.

Agente (Laura): Entiendo, Carlos. Tenemos una amplia gama de smartphones. ¿Tienes alguna marca en mente o alguna característica específica que estés buscando además de una buena cámara y batería?

Cliente (Carlos): He escuchado cosas buenas sobre los modelos de Samsung. También quiero que tenga suficiente espacio de almacenamiento porque tomo muchas fotos y videos.

Agente (Laura): Perfecto, Samsung es una excelente elección. Te recomendaría el Samsung Galaxy S21. Tiene una cámara de 64MP, batería de 4000mAh y viene con 128GB de almacenamiento interno. ¿Qué te parece?

Cliente (Carlos): Suena bien. ¿Cuánto cuesta?

Agente (Laura): Actualmente, el Samsung Galaxy S21 está en oferta por $799. Además, ofrecemos financiación sin intereses si lo prefieres.

Cliente (Carlos): Eso suena genial. ¿Qué otros beneficios ofrece este modelo?

Agente (Laura): Además de la impresionante cámara y batería, el Galaxy S21 cuenta con un procesador muy rápido, pantalla de 6.2 pulgadas con resolución Full HD+ y es resistente al agua y al polvo. También incluye la posibilidad de expandir el almacenamiento con una tarjeta microSD.

Cliente (Carlos): Parece que es justo lo que estoy buscando. ¿Cómo funciona el proceso de financiación?

Agente (Laura): Es muy sencillo. Puedes elegir pagar en cuotas mensuales sin intereses durante 12 meses. Solo necesitas una tarjeta de crédito válida y completar una breve solicitud en línea.

Cliente (Carlos): Perfecto, me interesa la financiación. ¿Cómo podemos proceder?

Agente (Laura): ¡Excelente! Te enviaré un enlace para que completes la solicitud en línea. Una vez aprobada, te enviaremos el smartphone a tu domicilio sin costo adicional en un plazo de 3 a 5 días hábiles. ¿Te parece bien?

Cliente (Carlos): Sí, eso suena perfecto. Gracias por tu ayuda, Laura.

Agente (Laura): Es un placer, Carlos. Te enviaré el enlace de inmediato. ¿Puedes confirmar que recibiste el enlace y que todo está en orden?

Cliente (Carlos): Sí, acabo de recibirlo. Déjame completar la solicitud.

(Pausa mientras Carlos completa la solicitud)

Cliente (Carlos): Listo, ya envié la solicitud.

Agente (Laura): Perfecto, Carlos. Déjame verificar... Sí, tu solicitud ha sido aprobada. Hemos procesado tu pedido y el Samsung Galaxy S21 será enviado a tu dirección. Deberías recibirlo en 3 a 5 días hábiles.

Cliente (Carlos): ¡Genial! Muchas gracias, Laura. Estoy muy emocionado por recibir mi nuevo smartphone.

Agente (Laura): Me alegra escuchar eso, Carlos. Si necesitas algo más o tienes alguna pregunta, no dudes en contactarnos. ¡Que disfrutes tu nuevo teléfono!

Cliente (Carlos): Seguro, gracias de nuevo. ¡Hasta luego!

Agente (Laura): ¡Hasta luego, Carlos! Que tengas un excelente día.
"""

In [None]:
data = pd.DataFrame({"text":[chat1]})
pipe = joblib.load('prepare.pkl')
data = match_columns(pipe, data)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_trans[c] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_trans[c] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_trans[c] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in th

In [None]:
model = joblib.load('classify.pkl')
model.predict(data)

array([1])

In [8]:
!pip freeze | grep scikit-learn

scikit-learn==1.2.2
