In [1]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import OneHotEncoder
from gensim.models import Word2Vec
import sys
sys.path.append("C:/Users/Fernando/Desktop/Proyecto_Final_ML/src")
import preprocess_functions as pr
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("../data/data_train.csv", index_col=0)
data_company_response_binary = data[~data["Company response"].isin(["In progress", "Untimely response"])]
estado_respuesta = {'Closed with monetary relief': 0, 
       'Closed with non-monetary relief':0, 'Closed with explanation': 1, 'Closed': 1}

data_company_response_binary["Company response"] = data_company_response_binary["Company response"].map(estado_respuesta)


X = data_company_response_binary.drop(columns=["Company response"])
y = data_company_response_binary["Company response"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_company_response_binary["Company response"] = data_company_response_binary["Company response"].map(estado_respuesta)


### Funciones Sub-product



In [3]:
# Filling NaN

filling_na = FunctionTransformer(lambda x: x.fillna("No Subproduct"),validate=False)

# data["Sub-product"] = filling_na.transform(data["Sub-product"])

In [4]:
# Codificacion con Target Encoder y sustitución de la columna Sub-issue

class FillEncodeColumn(BaseEstimator, TransformerMixin):
    def __init__(self, column, smooth="auto", cv=5, categories = "auto", random_state=42):
        self.column = column
        self.smooth = smooth
        self.cv = cv
        self.random_state = random_state
        self.categories = categories
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder= TargetEncoder(
            smooth=self.smooth,
            cv=self.cv,
            random_state=self.random_state,
            categories = self.categories 
        )
        self.encoder.fit(X[[self.column]], y)
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        assert X.shape[1] == self.n_features_in_
        encoded = self.encoder.transform(X[[self.column]])
        X[self.column] = encoded
        return X



## Funciones Dates

In [5]:
date_converter_received = FunctionTransformer(pr.date_converter, kw_args={"column": "Date received"}, validate=False)
date_converter_sent = FunctionTransformer(pr.date_converter, kw_args={"column": "Date sent to company"}, validate=False)
day_of_week_month_received = FunctionTransformer(pr.day_of_week_and_month, kw_args={"column_date": "Date received"}, validate = False)
day_of_week_month_sent = FunctionTransformer(pr.day_of_week_and_month, kw_args={"column_date": "Date sent to company"}, validate = False)

# Al contrario que el notebook de preprocessing, he incorporado el drop
# en la función, por lo que elimina la columna original al aplicar la función

## Funciones Product

In [6]:
class ProductEncoder(BaseEstimator, TransformerMixin):

    def __init__(self,column, sparse_output = False, dtype=int):
        self.column = column
        self.sparse_output = sparse_output
        self.dtype = dtype
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=self.sparse_output,
                                     dtype=self.dtype)
        self.encoder.fit(X[[self.column]], y)
        self.n_features_in_ = X.shape[1]
        return self
    
    def transform(self, X):
        assert X.shape[1] == self.n_features_in_
        encoded= self.encoder.transform(X[[self.column]])
        encoded = pd.DataFrame(encoded, 
                               columns = self.encoder.get_feature_names_out([self.column]), 
                               index=X.index)
        X = pd.concat([X, encoded], axis=1)
        return X
        

## Funciones State

In [7]:
regions_and_divisions = FunctionTransformer(pr.region_and_division, kw_args={"column_state": "State"}, validate=False)

# Las columnas de salida se llaman regions y divisions

In [8]:
class RegionDivisionEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self,column, sparse_output = False, dtype=int):
        self.column = column
        self.sparse_output = sparse_output
        self.dtype = dtype
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=self.sparse_output,
                                     dtype=self.dtype)
        self.encoder.fit(X[[self.column]], y)
        self.n_features_in_ = X.shape[1]
        return self
    
    def transform(self, X):
        assert X.shape[1] == self.n_features_in_
        encoded= self.encoder.transform(X[[self.column]])
        encoded = pd.DataFrame(encoded, 
                               columns = self.encoder.get_feature_names_out([self.column]), 
                               index=X.index)
        X = pd.concat([X, encoded], axis=1)
        X = X.drop(columns=[self.column])
        return X
    
    # Recuerda que hay que hacerlo para la columna regions y divisions

## Functions Issue

In [9]:
# Limpieza de texto

text_cleaner = FunctionTransformer(pr.tokenize_column,kw_args={"column": "Issue"}, validate=False)

In [10]:


class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    """
    Convierte una columna de listas de tokens en embeddings promedio usando Word2Vec.
    """
    def __init__(self, column="Issue_tokens", vector_size=25, window=5, min_count=1, workers=4):
        self.column = column
        self.vector_size = vector_size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.model_ = None  # Modelo Word2Vec entrenado

    def fit(self, X, y=None):
        self.model_ = Word2Vec(
            vector_size=self.vector_size,
            window=self.window,
            min_count=self.min_count,
            workers=self.workers
        )
        return self

    def transform(self, X):
        # Función para obtener embedding promedio de una lista de tokens
        def sentence_vector(tokens):
            vectors = [self.model_.wv[word] for word in tokens if word in self.model_.wv]
            if len(vectors) == 0:
                return np.zeros(self.vector_size)
            return np.mean(vectors, axis=0)

        # Aplicar transformación
        embeddings = X[self.column].apply(sentence_vector)
        embedding_matrix = np.vstack(embeddings.values)
        
        # Crear DataFrame de embeddings
        embedding_df = pd.DataFrame(
            embedding_matrix,
            columns=[f"embedding_{i}" for i in range(self.vector_size)],
            index=X.index
        )

        # Concatenar embeddings al DataFrame original (sin la columna de tokens original)
        X_transformed = pd.concat([X.drop(columns=[self.column], errors='ignore'), embedding_df], axis=1)
        return X_transformed

# Hay que hacerlo con la columna Issue_tokens

## Function company

In [11]:
company_type_converter = FunctionTransformer(pr.company_type_converter, kw_args={"column": "Product"})

In [12]:
class Company_type_from_product(BaseEstimator, TransformerMixin):

    def __init__(self,column, sparse_output = False, dtype=int):
        self.column = column
        self.sparse_output = sparse_output
        self.dtype = dtype
        self.encoder = None

    def fit(self, X, y=None):
        self.encoder = OneHotEncoder(sparse_output=self.sparse_output,
                                     dtype=self.dtype)
        self.encoder.fit(X[[self.column]], y)
        self.n_features_in_ = X.shape[1]
        return self
    
    def transform(self, X):
        assert X.shape[1] == self.n_features_in_
        encoded= self.encoder.transform(X[[self.column]])
        encoded = pd.DataFrame(encoded, 
                               columns = self.encoder.get_feature_names_out([self.column]), 
                               index=X.index)
        X = pd.concat([X, encoded], axis=1)
        X = X.drop(columns=[self.column])
        return X
    

# Hay que hacerlo para la columna Company_type

In [13]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns_to_drop=None):
        self.columns_to_drop = columns_to_drop

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=self.columns_to_drop, errors="ignore")

# PIPELINE

In [None]:
pipeline = Pipeline([
    ("fill_na", filling_na),
    ("encoder", FillEncodeColumn(column="Sub-product")),
    ("date_transformation_received", date_converter_received),
    ("date_transformation_sent", date_converter_sent),
    ("day_week_received", day_of_week_month_received),
    ("day_week_sent", day_of_week_month_sent),
    ("product_encoder", ProductEncoder(column="Product")),
    ("reg_and_div", regions_and_divisions),
    ("red_enc", RegionDivisionEncoder(column="regions")),
    ("div_enc", RegionDivisionEncoder(column="divisions")),
    ("cleaner text", text_cleaner),
    ("embedding creation", Word2VecVectorizer(column="Issue_tokens")),
    ("company type", company_type_converter),
    ("company from porduct", Company_type_from_product(column="Company_type")),
    ("Drop columns", ColumnDropper(columns_to_drop=["Complaint ID","Sub-issue", "ZIP code", "Company", 
             "Timely response?", "Consumer disputed?"]))
])

X_train = pipeline.fit_transform(X_train, y_train )


In [15]:
X_train

Unnamed: 0,Sub-product,Date received_day_of_month,Date received_day_week,Date received_weekend,Date sent to company_day_of_month,Date sent to company_day_week,Date sent to company_weekend,Product_Bank account or service,Product_Consumer loan,Product_Credit card,...,embedding_21,embedding_22,embedding_23,embedding_24,Company_type_Bank,Company_type_Bureau,Company_type_Collector,Company_type_Fintech,Company_type_Lender,Company_type_Other
10239,0.898418,12,3,0,12,3,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
26727,0.927206,6,1,0,6,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
22429,0.927206,15,3,0,15,3,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
6631,0.912154,20,4,0,23,0,0,1,0,0,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
23535,0.725821,13,1,0,13,1,0,0,0,1,...,0.0,0.0,0.0,0.0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14063,0.725821,4,2,0,9,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
11874,0.725821,9,0,0,9,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
191,0.725821,14,5,1,14,5,1,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
10786,0.725821,11,2,0,18,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0


In [16]:
X_test = pipeline.transform(X_test)




In [17]:
X_test

Unnamed: 0,Sub-product,Date received_day_of_month,Date received_day_week,Date received_weekend,Date sent to company_day_of_month,Date sent to company_day_week,Date sent to company_weekend,Product_Bank account or service,Product_Consumer loan,Product_Credit card,...,embedding_21,embedding_22,embedding_23,embedding_24,Company_type_Bank,Company_type_Bureau,Company_type_Collector,Company_type_Fintech,Company_type_Lender,Company_type_Other
4795,0.889836,25,2,0,25,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
26352,0.725821,6,1,0,9,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
22570,0.708945,15,3,0,21,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
12375,0.708945,7,5,1,11,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
3783,0.857732,27,4,0,4,2,0,0,1,0,...,0.0,0.0,0.0,0.0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12112,0.725821,9,0,0,9,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
18004,0.725821,27,1,0,27,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0,1,0,0,0,0
3574,0.708945,27,4,0,2,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
25722,0.889155,7,2,0,7,2,0,0,0,0,...,0.0,0.0,0.0,0.0,0,0,1,0,0,0
