In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
from scipy.stats import zscore
from datetime import datetime
import google.generativeai as genai
import time


In [None]:
class DataProcessor:
    """End-to-end tabular preprocessing pipeline.

    The processor works on a private copy of the input frame and, via
    :meth:`process`, cleans it, splits it into train/test parts, encodes the
    categorical (object) columns and scales the numeric ones.

    Attributes written by the pipeline:
        df: the cleaned working copy of the input frame.
        X_train, X_test, y_train, y_test: produced by :meth:`split_data` and
            modified in place by :meth:`apply_encoding` / :meth:`apply_scaling`.
        gen_model: the Gemini model handle used to classify categorical
            columns as ordinal or nominal.
    """

    def __init__(self, dataframe, target_column, api_key='your_ip_key_here'):
        """Store a copy of *dataframe* and configure the Gemini client.

        Args:
            dataframe: input ``pandas.DataFrame``; it is copied, never mutated.
            target_column: name of the column to predict.
            api_key: Gemini API key passed to :meth:`gemini_api`.
        """
        self.df = dataframe.copy()
        self.target_column = target_column
        self.gen_model = self.gemini_api(api_key)

    def handle_missing_values(self):
        """Drop sparse feature columns and impute the remaining gaps.

        * Rows whose target value is missing are removed: imputing labels
          would fabricate training data, and the target column itself must
          survive for :meth:`split_data`.
        * Feature columns with more than 40% missing values are dropped.
        * Remaining gaps are filled with the mean (numeric / datetime
          columns) or the most frequent value (everything else).
        """
        if self.target_column in self.df.columns:
            has_target = self.df[self.target_column].notna()
            self.df = self.df.loc[has_target].copy()

        na_ratio = self.df.isna().mean()
        sparse_cols = [
            col for col in self.df.columns
            if col != self.target_column and na_ratio[col] > 0.4
        ]
        self.df = self.df.drop(columns=sparse_cols)

        for col in self.df.columns:
            series = self.df[col]
            if not series.isna().any():
                continue
            averageable = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            ) or pd.api.types.is_datetime64_any_dtype(series)
            fill_value = series.mean() if averageable else series.mode()[0]
            # Assign back instead of fillna(inplace=True) on a column
            # selection: the chained in-place form is deprecated and does not
            # modify the frame under pandas Copy-on-Write.
            self.df[col] = series.fillna(fill_value)

    def lowercase_columns(self):
        """Lower-case every string *value* in the object columns.

        Despite the name, column labels are left untouched. Non-string
        values (numbers, NaN, ...) inside object columns are preserved.
        """
        for col in self.df.select_dtypes(include='object').columns:
            self.df[col] = self.df[col].map(
                lambda value: value.lower() if isinstance(value, str) else value
            )

    def remove_duplicates(self):
        """Remove fully duplicated rows, keeping the first occurrence."""
        self.df = self.df.drop_duplicates()

    def convert_data_types(self):
        """Convert string columns that really hold integers, floats or dates.

        For every object column whose non-missing values are all strings:

        * all values are integers (thousands commas allowed) -> integer,
        * all values are plain decimal numbers -> float,
        * at least 80% of the values match a known date format -> datetime
          (unparseable entries become ``NaT``).

        Anything else is left as is.
        """
        date_formats = (
            "%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y", "%Y/%m/%d", "%d/%m/%Y", "%m/%d/%Y",
            "%Y.%m.%d", "%d.%m.%Y", "%m.%d.%Y", "%d %b %Y", "%d %B %Y", "%b %d, %Y", "%B %d, %Y",
        )

        def isdate(value):
            """Return True if *value* is a string in one of ``date_formats``."""
            if not isinstance(value, str):
                return False
            for date_format in date_formats:
                try:
                    datetime.strptime(value, date_format)
                except ValueError:
                    continue
                return True
            return False

        for column in self.df.select_dtypes(include='object').columns:
            series = self.df[column]
            present = series.dropna()
            if present.empty or not present.map(lambda v: isinstance(v, str)).all():
                continue

            without_commas = present.str.replace(',', '', regex=False)
            if without_commas.str.fullmatch(r'[+-]?[0-9]+').all():
                self.df[column] = pd.to_numeric(series.str.replace(',', '', regex=False))

            # Require a well-formed decimal (at most one dot). Merely stripping
            # all dots would also accept "2020.01.05" or "1.2.3" and then crash
            # in the float conversion, before the date check is ever reached.
            elif present.str.fullmatch(r'[+-]?(?:[0-9]+\.[0-9]*|\.[0-9]+|[0-9]+)').all():
                self.df[column] = pd.to_numeric(series)

            elif present.apply(isdate).mean() >= 0.8:
                self.df[column] = pd.to_datetime(series, errors='coerce')

    def handle_outliers(self, z_threshold=3):
        """Drop rows having a numeric feature with ``|z-score| >= z_threshold``.

        The target column is excluded: z-scoring a numeric label would delete
        rare classes or legitimate extreme targets. Columns whose z-score is
        undefined (constant columns, columns containing NaN) never mark a row
        as an outlier; comparing NaN against the threshold would otherwise
        discard every row.
        """
        numeric_cols = [
            col for col in self.df.select_dtypes(include=[np.number]).columns
            if col != self.target_column
        ]
        if not numeric_cols or self.df.empty:
            return

        z_scores = self.df[numeric_cols].apply(zscore)
        within_bounds = (z_scores.abs() < z_threshold) | z_scores.isna()
        self.df = self.df[within_bounds.all(axis=1)].copy()

    def group_infrequent_categories(self, threshold=0.05):
        """Replace categories rarer than *threshold* with ``'Other'``.

        Applies to object feature columns only; the target column is skipped
        so that class labels are never rewritten.
        """
        for col in self.df.select_dtypes(include='object').columns:
            if col == self.target_column:
                continue
            freq = self.df[col].value_counts(normalize=True)
            infrequent = freq[freq < threshold].index
            if infrequent.empty:
                continue
            self.df[col] = self.df[col].where(~self.df[col].isin(infrequent), 'Other')

    def split_data(self, test_size=0.2, random_state=42):
        """Split ``self.df`` into ``X_train``/``X_test``/``y_train``/``y_test``."""
        X = self.df.drop(columns=[self.target_column])
        y = self.df[self.target_column]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        # Own the data so the later in-place column assignments are not
        # treated as writes to a slice of another frame.
        self.X_train, self.X_test = X_train.copy(), X_test.copy()
        self.y_train, self.y_test = y_train.copy(), y_test.copy()

    def gemini_api(self, api_key='your_ip_key_here'):
        """Configure the Gemini client and return the model handle."""
        genai.configure(api_key=api_key)
        self.gen_model = genai.GenerativeModel("gemini-1.5-flash")

        return self.gen_model

    def _ask_category_kind(self, column, max_retries=3):
        """Ask Gemini whether *column* is 'ordinal' or 'nominal'.

        Retries with exponential back-off (2s, 4s, 8s) when the call fails or
        the answer is not one of the two expected words, and falls back to
        'nominal' (with a warning) once the retries are exhausted.
        """
        import warnings

        entries_str = ", ".join(str(entry) for entry in self.df[column].head(10).tolist())
        prompt = (
            f"Here are the first 10 entries of the column '{column}': {entries_str}. "
            "Type 'ordinal' if these categories are ordinal and type 'nominal' if they are nominal."
        )

        last_error = None
        for attempt in range(max_retries + 1):
            if attempt:
                time.sleep(2 ** attempt)
            try:
                # The API returns a response object; the answer is in `.text`.
                reply = self.gen_model.generate_content(prompt).text
            except Exception as exc:  # API/network failure or a blocked reply
                last_error = exc
                continue
            answer = reply.strip().strip(".'\"`").lower()
            if answer in ('ordinal', 'nominal'):
                return answer

        warnings.warn(
            f"Could not classify column '{column}' as ordinal/nominal "
            f"(last error: {last_error!r}); assuming 'nominal'."
        )
        return 'nominal'

    def _frequency_encode(self, column):
        """Replace categories by their relative frequency in the train split."""
        freq_encoding = self.X_train[column].value_counts(normalize=True)
        self.X_train[column] = self.X_train[column].map(freq_encoding)
        # Map exactly once; categories unseen during training get frequency 0.
        self.X_test[column] = self.X_test[column].map(freq_encoding).fillna(0)

    def apply_encoding(self):
        """Encode every object feature column of the train/test split.

        Strategy per column (``cat_num`` = number of distinct values):

        * ``cat_num < 10`` and ordinal  -> integer label codes,
        * ``cat_num < 10`` and nominal  -> one-hot columns,
        * ``10 <= cat_num < 50`` and nominal -> frequency encoding,
        * otherwise -> target encoding (frequency encoding when the target
          is not numeric, because target encoding needs a numeric ``y``).

        All encoders are fitted on the training split only, and row indexes
        are preserved so ``X_*`` stay aligned with ``y_*``.
        """
        # Use the feature frame, not self.df: the target column is not part
        # of X_train and must not be looked up there.
        object_columns = list(self.X_train.select_dtypes(include='object').columns)

        for column in object_columns:
            response = self._ask_category_kind(column)
            cat_num = self.df[column].nunique()

            if cat_num < 10 and response == 'ordinal':
                # NOTE: codes follow the sorted label order, which is not
                # necessarily the semantic order of the categories.
                le = LabelEncoder()
                self.X_train[column] = le.fit_transform(self.X_train[column])
                codes = {label: code for code, label in enumerate(le.classes_)}
                # Labels unseen during training are encoded as -1 instead of
                # raising inside LabelEncoder.transform.
                self.X_test[column] = self.X_test[column].map(codes).fillna(-1).astype(int)

            elif cat_num < 10 and response == 'nominal':
                train_encoded = pd.get_dummies(self.X_train, columns=[column], dtype=int)
                test_encoded = pd.get_dummies(self.X_test, columns=[column], dtype=int)
                self.X_train = train_encoded
                # Align test to the training layout: categories missing from
                # the test split become all-zero columns, test-only ones go.
                self.X_test = test_encoded.reindex(columns=train_encoded.columns, fill_value=0)

            elif 10 <= cat_num < 50 and response == 'nominal':
                self._frequency_encode(column)

            elif pd.api.types.is_numeric_dtype(self.y_train):
                te = TargetEncoder()
                # Target encoding is supervised: y must be passed to fit.
                self.X_train[column] = te.fit_transform(self.X_train[[column]], self.y_train)[column]
                self.X_test[column] = te.transform(self.X_test[[column]])[column]

            else:
                self._frequency_encode(column)

    def apply_scaling(self):
        """Scale every numeric feature column, fitting on the train split.

        Strongly skewed columns (``|skew| > 1``) are standardised, all others
        are min-max scaled.
        """
        numerical_cols = self.X_train.select_dtypes(include=[np.number]).columns

        for column in numerical_cols:
            if abs(self.X_train[column].skew()) > 1:
                scaler = StandardScaler()
            else:
                scaler = MinMaxScaler()

            self.X_train[column] = scaler.fit_transform(self.X_train[[column]]).ravel()
            self.X_test[column] = scaler.transform(self.X_test[[column]]).ravel()

    def process(self, test_size=0.2, z_threshold=3, category_threshold=0.05, random_state=42):
        """Run the whole pipeline in order.

        Args:
            test_size: fraction of rows put into the test split.
            z_threshold: z-score limit used by :meth:`handle_outliers`.
            category_threshold: frequency limit used by
                :meth:`group_infrequent_categories`.
            random_state: seed forwarded to :meth:`split_data`.
        """
        self.handle_missing_values()
        self.lowercase_columns()
        self.remove_duplicates()
        self.convert_data_types()
        self.handle_outliers(z_threshold)
        self.group_infrequent_categories(threshold=category_threshold)
        self.split_data(test_size=test_size, random_state=random_state)
        self.apply_encoding()
        self.apply_scaling()


    