In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

class DateParser:
    """Parse date columns and create temporal features"""
    def __init__(self):
        self.date_columns = []

    def fit(self, X, y=None):
        self.date_columns = []
        for col in X.columns:
            try:
                pd.to_datetime(X[col], errors='raise')
                self.date_columns.append(col)
            except:
                pass
        return self

    def transform(self, X):
        """
        Transforms the input data by parsing date columns and creating temporal features.

        Args:
            X (pd.DataFrame): The input data.

        Returns:
            pd.DataFrame: The transformed data with date columns parsed and temporal features added.
        """
        X = X.copy()
        for col in self.date_columns:
            X[col] = pd.to_datetime(X[col], errors='coerce')
            X[f'{col}_year'] = X[col].dt.year
            X[f'{col}_month'] = X[col].dt.month
            X[f'{col}_day'] = X[col].dt.day
        return X.drop(columns=self.date_columns)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
class TextCleaner:
    """Clean and preprocess text columns"""
    def __init__(self, lang='english'):
        self.stop_words = set(stopwords.words(lang))
        self.lemmatizer = WordNetLemmatizer()

    def clean_text(self, text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = text.split()
        tokens = [self.lemmatizer.lemmatize(word) for word in tokens
                 if word not in self.stop_words]
        return ' '.join(tokens)

    def transform(self, X):
        for col in X.select_dtypes(include=['object']).columns:
            X[col] = X[col].apply(self.clean_text)
        return X

In [3]:
# Sample dataset
data = {
    'age': [25, 30, 35, None, 40, 150, 45],
    'salary': [50000, 60000, None, 70000, 80000, 90000, 1000000],
    'gender': ['M', 'F', 'F', 'M', None, 'F', 'M'],
    'city': ['New York', 'Los Angeles', 'NY', 'LA', 'NYC', 'San Fran', ''],
    'purchase_date': ['2023-01-15', '2022-13-01', '2021-07-23',
                    'invalid', '2020-05-12', None, '2023-03-30']
}
df = pd.DataFrame(data)


In [4]:
# 1. Date parsing
date_parser = DateParser()
date_parser.fit(df)  # Fit the DateParser
df = date_parser.transform(df)  # Transform the data

# 2. Text cleaning
text_cleaner = TextCleaner()
df = text_cleaner.transform(df)

# 3. Handle missing values
num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(include='object').columns

# Numerical imputation
num_imputer = SimpleImputer(strategy='median')
df[num_cols] = num_imputer.fit_transform(df[num_cols])

# Categorical imputation
cat_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# 4. Outlier handling using IQR
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower_bound, upper=upper_bound)

# Display cleaned data
print("Cleaned Data:")
print(df)


  pd.to_datetime(X[col], errors='raise')
  pd.to_datetime(X[col], errors='raise')


Cleaned Data:
  gender         city purchase_date  age_year  age_month  age_day  \
0            new york                  1970.0        1.0      1.0   
1      f  los angeles                  1970.0        1.0      1.0   
2      f           ny                  1970.0        1.0      1.0   
3                  la       invalid    1970.0        1.0      1.0   
4   none          nyc                  1970.0        1.0      1.0   
5      f     san fran          none    1970.0        1.0      1.0   
6                                      1970.0        1.0      1.0   

   salary_year  salary_month  salary_day  
0       1970.0           1.0         1.0  
1       1970.0           1.0         1.0  
2       1970.0           1.0         1.0  
3       1970.0           1.0         1.0  
4       1970.0           1.0         1.0  
5       1970.0           1.0         1.0  
6       1970.0           1.0         1.0  
