# Importing Libraries

In [1]:
import pandas as pd
import feature_engine
import numpy as np
#import streamlit as st
import pandas as pd
import dtale
from sklearn.model_selection import train_test_split
from feature_engine.imputation import (
    MeanMedianImputer,
    CategoricalImputer,
    ArbitraryNumberImputer
)
from feature_engine.encoding import WoEEncoder
from feature_engine.selection import DropConstantFeatures
from feature_engine.creation import CyclicalFeatures
from feature_engine.encoding import OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.selection import DropCorrelatedFeatures
from feature_engine.outliers import Winsorizer, ArbitraryOutlierCapper, OutlierTrimmer
import warnings
warnings.filterwarnings('ignore')

# Read Data and its shape

In [2]:
# Load the raw data; the literal string 'Nan' is treated as missing in
# addition to the markers pandas recognises by default.
df = pd.read_csv("dirty_deputies_v2.csv",na_values='Nan')

# Work on a random subset of at most 5000 rows. The seed is fixed so that a
# Restart & Run All reproduces the same sample, and the size is capped at the
# number of available rows so smaller files do not make sample() raise.
df = df.sample(n=min(5000, len(df)), random_state=42)
df.shape

(5000, 18)

# Drop features with more than x% null values

In [3]:
def drop_columns_with_high_nan(df, threshold):
    """Return ``df`` without the columns whose missing share is too high.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.
    threshold : float
        Percentage on a 0-100 scale. A column is removed when its percentage
        of null values is strictly greater than this value.

    Returns
    -------
    pandas.DataFrame
        A new frame with the offending columns removed. The labels of the
        removed columns are printed as a side effect.
    """
    missing_pct = df.isna().mean().mul(100)
    too_sparse = missing_pct.loc[missing_pct > threshold].index
    print(too_sparse)
    return df.drop(columns=too_sparse)

In [4]:
# Drop every column whose percentage of missing values exceeds 60.
df = drop_columns_with_high_nan(df, 60)
df.shape

Index(['party_ideology4'], dtype='object')


(5000, 17)

# Drop Duplicates

In [5]:
# Remove fully duplicated rows in place, then display the remaining shape.
df.drop_duplicates(inplace=True)
df.shape

(4991, 17)

# Change datatypes if required

In [6]:
# Inspect column dtypes and non-null counts to decide which columns need conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4991 entries, 254361 to 46014
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   deputy_name         4991 non-null   object 
 1   deputy_state        4980 non-null   object 
 2   political_party     4980 non-null   object 
 3   refund_description  4991 non-null   object 
 4   company_name        4991 non-null   object 
 5   company_id          4326 non-null   float64
 6   refund_date         4900 non-null   object 
 7   refund_value        4991 non-null   float64
 8   party_pg            4980 non-null   object 
 9   party_en            4980 non-null   object 
 10  party_tse           4980 non-null   float64
 11  party_regdate       4980 non-null   object 
 12  party_nmembers      4980 non-null   float64
 13  party_ideology1     4980 non-null   object 
 14  party_ideology2     4233 non-null   object 
 15  party_ideology3     2463 non-null   object 
 16  party

In [7]:
def change_datatype(df, column_name, new_datatype):
    """Convert ``df[column_name]`` in place to the requested type.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that is modified in place.
    column_name : str
        Column to convert.
    new_datatype : str
        One of 'int', 'float', 'str', 'bool' or 'datetime'. Any other value
        prints "Unsupported data type." and leaves the frame untouched.

    Notes
    -----
    'int', 'float' and 'datetime' coerce unparseable values to NaN/NaT.
    A ``ValueError`` raised during conversion is caught and reported with
    ``print``; the column then keeps its previous contents. This is what
    happens for 'int' when the coerced column still holds missing values.
    Nothing is returned.
    """
    # One converter per supported target type; each takes the current column.
    casters = {
        'int': lambda col: pd.to_numeric(col, errors='coerce').astype(int),
        'float': lambda col: pd.to_numeric(col, errors='coerce').astype(float),
        'str': lambda col: col.astype(str),
        'bool': lambda col: col.astype(bool),
        'datetime': lambda col: pd.to_datetime(col, errors='coerce'),
    }

    cast = casters.get(new_datatype)
    if cast is None:
        print("Unsupported data type.")
        return

    try:
        df[column_name] = cast(df[column_name])
    except ValueError as e:
        print(f"Error converting column '{column_name}' to {new_datatype} : {e}")

# Input required here

In [8]:
# Cast each column to its intended type. change_datatype reports a failed
# conversion with print and leaves that column as it was.
change_datatype(df,'company_id','str')
change_datatype(df,'party_regdate','datetime')
change_datatype(df,'refund_date','datetime')
# The two 'int' casts below fail while these columns still contain missing
# values (see the printed errors), so both columns stay float64.
change_datatype(df,'party_tse','int')
change_datatype(df,'party_nmembers','int')

Error converting column 'party_tse' to int : Cannot convert non-finite values (NA or inf) to integer
Error converting column 'party_nmembers' to int : Cannot convert non-finite values (NA or inf) to integer


In [9]:
# Check the dtypes again now that the conversion calls have run.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4991 entries, 254361 to 46014
Data columns (total 17 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   deputy_name         4991 non-null   object        
 1   deputy_state        4980 non-null   object        
 2   political_party     4980 non-null   object        
 3   refund_description  4991 non-null   object        
 4   company_name        4991 non-null   object        
 5   company_id          4991 non-null   object        
 6   refund_date         4497 non-null   datetime64[ns]
 7   refund_value        4991 non-null   float64       
 8   party_pg            4980 non-null   object        
 9   party_en            4980 non-null   object        
 10  party_tse           4980 non-null   float64       
 11  party_regdate       4980 non-null   datetime64[ns]
 12  party_nmembers      4980 non-null   float64       
 13  party_ideology1     4980 non-null   object     

# Split the Dataset

# Input required

In [10]:
def split_data(df,target_column,test_size=0.2,random_state=42):
    """Split ``df`` into train/test features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Full dataset including the target column.
    target_column : str
        Name of the column to predict.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int or None, default 42
        Seed passed to ``train_test_split``. The default keeps the split that
        was previously hard-coded.

    Returns
    -------
    X_train, X_test, y_train, y_test
        Test rows containing any missing value (in a feature or the target)
        are removed. Training rows are returned as split, including rows
        with missing values.
    """
    X = df.drop(columns=[target_column])
    y = df[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Filter features and target together so they stay row-aligned.
    xy_test = pd.concat([X_test, y_test], axis=1).dropna()
    X_test = xy_test.drop(columns=[target_column])
    y_test = xy_test[target_column]
    return X_train, X_test, y_train, y_test

# Hold out 20% of the rows, with party_position as the target column.
X_train, X_test, y_train, y_test = split_data(df,"party_position",test_size=0.2)

In [11]:
# Report the shapes of the four pieces returned by split_data.
print("Training set (X):", X_train.shape)
print("Testing set (X):", X_test.shape)
print("Training set (y):", y_train.shape)
print("Testing set (y):", y_test.shape)

Training set (X): (3992, 16)
Testing set (X): (449, 16)
Training set (y): (3992,)
Testing set (y): (449,)


# EDA through Dtale Library

In [12]:
# Re-attach the target to the training features so both can be explored together.
train_eda = pd.concat([X_train,y_train],axis=1)
# Start a D-Tale session on the training frame and open it in the browser.
d = dtale.show(train_eda)
d.open_browser()

In [13]:
#d._url #(get url for the browser)

## Shutting down D-Tale process
#d.kill()

# Drop Low variance features

# Input required

In [14]:
def low_variance_features(X_train,X_test,variables=None,threshold=0.9):
    """Drop constant / quasi-constant features found on the training frame.

    A ``DropConstantFeatures`` selector is fitted on ``X_train`` with
    ``tol=threshold`` and ``missing_values='ignore'``. The columns it removes
    from the training frame are then dropped from ``X_test`` as well.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
    variables : list or None
        Forwarded to ``DropConstantFeatures``.
    threshold : float, default 0.9
        Forwarded to ``DropConstantFeatures`` as ``tol``.

    Returns
    -------
    (X_train_, X_test_) : tuple of pandas.DataFrame
    """
    selector = DropConstantFeatures(tol=threshold, missing_values='ignore', variables=variables)
    X_train_ = selector.fit_transform(X_train)

    # Columns present before fitting but absent afterwards were removed.
    surviving = set(X_train_.columns)
    removed = [name for name in X_train.columns if name not in surviving]

    X_test_ = X_test.drop(removed, axis=1)
    return X_train_, X_test_

In [15]:
# Apply the constant-feature filter to all columns with a tolerance of 0.9.
X_train,X_test = low_variance_features(X_train,X_test,variables=None,threshold=0.9)

In [16]:
# Display both shapes to see whether any column was removed.
X_train.shape,X_test.shape

((3992, 16), (449, 16))

# Drop Highly correlated features

# Input required

In [17]:
def high_correlation_features(X_train,X_test,variables=None,threshold=0.9):
    """Drop correlated features found on the training frame.

    A ``DropCorrelatedFeatures`` selector is fitted on ``X_train`` with the
    given ``threshold`` and ``missing_values='ignore'``. The columns it
    removes from the training frame are then dropped from ``X_test`` as well.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
    variables : list or None
        Forwarded to ``DropCorrelatedFeatures``.
    threshold : float, default 0.9
        Forwarded to ``DropCorrelatedFeatures``.

    Returns
    -------
    (X_train_, X_test_) : tuple of pandas.DataFrame
    """
    selector = DropCorrelatedFeatures(threshold=threshold, missing_values='ignore', variables=variables)
    X_train_ = selector.fit_transform(X_train)

    # Columns present before fitting but absent afterwards were removed.
    surviving = set(X_train_.columns)
    removed = [name for name in X_train.columns if name not in surviving]

    X_test_ = X_test.drop(removed, axis=1)
    return X_train_, X_test_

In [18]:
# Apply the correlation filter with a threshold of 0.8 (the function's default is 0.9).
X_train,X_test = high_correlation_features(X_train,X_test,variables=None,threshold=0.8)

In [19]:
# Display both shapes to see whether any column was removed.
X_train.shape,X_test.shape

((3992, 16), (449, 16))

# Null values

In [20]:
null_cols = X_train.columns[X_train.isnull().any()].tolist() # names of all training columns that contain at least one null

In [21]:
def column_datatypes(df, columns):
    """
    Group the given columns of ``df`` by dtype.

    Returns a dictionary with two lists: ``'object'`` holds the columns whose
    dtype is object, ``'numerical'`` holds those whose dtype is int64 or
    float64. Columns of any other dtype appear in neither list.
    """
    grouped = {'object': [], 'numerical': []}
    for name in columns:
        kind = df[name].dtype
        if kind == 'object':
            bucket = 'object'
        elif kind == 'int64' or kind == 'float64':
            bucket = 'numerical'
        else:
            # Any other dtype (e.g. datetime) is skipped.
            continue
        grouped[bucket].append(name)
    return grouped

In [22]:
# Split the null-containing columns into object and numerical groups.
impute_dict = column_datatypes(X_train,null_cols) 

# Numerical value imputation

# Input required

In [23]:
def numerical_missing_imputation(df,only_numerical,method = "median"):
    """Fill missing values of the given numerical columns with a fitted statistic.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; the listed columns are overwritten in place.
    only_numerical : list of str
        Columns to impute. When empty, ``df`` is returned unchanged.
    method : str, default "median"
        Forwarded to ``MeanMedianImputer`` as ``imputation_method``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns imputed.
    """
    # Nothing to impute: return early instead of handing the imputer an
    # empty variable list, which it rejects.
    if len(only_numerical) == 0:
        return df

    mmi = MeanMedianImputer(imputation_method=method,variables=only_numerical)
    df[only_numerical] = mmi.fit_transform(df[only_numerical])
    return df

In [24]:
# Impute the numerical columns that contain nulls with the median; only X_train is imputed here.
only_numerical = impute_dict['numerical']
X_train = numerical_missing_imputation(X_train,only_numerical,method = "median")

# Categorical value Imputation

# Input required

In [25]:
def categorical_missing_imputation(df,variables,method = "frequent",fill_value=None):
    """Fill missing values of categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; the listed columns are overwritten in place.
    variables : list of str
        Columns to impute.
    method : {"frequent", "missing"}, default "frequent"
        "frequent" uses ``CategoricalImputer(imputation_method='frequent')``.
        "missing" uses ``CategoricalImputer(imputation_method='missing')``
        with ``fill_value`` as the replacement.
    fill_value : str or number, optional
        Replacement used by the "missing" method. When omitted, "Missing" is
        used so the imputer is never handed ``None``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns imputed.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values. Previously the
        function returned ``None`` in that case, which callers would have
        assigned back over their frame.
    """
    if method == "frequent":
        ci = CategoricalImputer(imputation_method='frequent',variables=variables)
    elif method == "missing":
        if fill_value is None:
            fill_value = "Missing"
        ci = CategoricalImputer(imputation_method='missing',fill_value=fill_value ,variables=variables)
    else:
        raise ValueError("Unsupported imputation method. Please use 'frequent' or 'missing'.")

    df[variables] = ci.fit_transform(df[variables])
    return df

In [26]:
# Columns to fill with their most frequent category.
frequent_variables = ['deputy_state', 'political_party','party_pg','party_en','party_ideology1']
# Columns to fill with a fixed placeholder value instead.
empty_string_variables = ["party_ideology2","party_ideology3"]

In [27]:
# Fill the first group with the most frequent category.
X_train = categorical_missing_imputation(X_train,frequent_variables,method = "frequent")
# Fill the second group with a single space (" "), not an empty string.
X_train = categorical_missing_imputation(X_train,empty_string_variables,method = "missing",fill_value=" ")

# Outlier Capping (Winsorization)

In [28]:
def outlier_imputation(df,outliers_list):
    """Cap outliers in the given columns with a ``Winsorizer``.

    The capper is configured with ``capping_method='gaussian'``,
    ``tail='both'`` and ``fold=3``. It is fitted on ``df[outliers_list]`` and
    the capped values are written back into ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
    outliers_list : list of str
        Columns to cap.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the columns capped.
    """
    capper = Winsorizer(capping_method='gaussian', tail='both', fold=3, variables=outliers_list)
    selected = df[outliers_list]
    capper.fit(selected)
    df[outliers_list] = capper.transform(selected)
    return df

In [29]:
# Only the numerical columns that contained nulls (impute_dict["numerical"]) are capped, and only in X_train.
outliers_list = impute_dict["numerical"]
X_train = outlier_imputation(X_train,outliers_list)

# Categorical Encoding:

In [30]:
# Group every training column into 'object' / 'numerical' lists by reusing
# the helper defined for the null-value step.
result = column_datatypes(X_train, list(X_train.columns))

In [31]:
# Map each object column to its number of distinct categories. The count is
# taken on the training frame (the data the encoders are fitted on) rather
# than on the full, unsplit `df`, so held-out rows do not influence the
# choice of encoder. Missing values are not counted as a category.
encode_dict = {col: X_train[col].nunique() for col in result["object"]}

# Input required

In [32]:
def categorical_encoding(X_train,X_test,dict_,unique_count=30):
    """Encode categorical columns, choosing the encoder by cardinality.

    Columns with at most ``unique_count`` distinct values are one-hot
    encoded; columns with more are replaced by their category frequency.
    Both encoders are fitted on ``X_train`` and applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
    dict_ : dict
        Maps column name to its number of distinct values.
    unique_count : int, default 30
        Cardinality cut-off between the two encoders.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame
        Always a tuple. When ``dict_`` is empty the inputs are returned
        unchanged.
    """
    OHE_cols = [col for col, n_unique in dict_.items() if n_unique <= unique_count]
    frequency_count_cols = [col for col, n_unique in dict_.items() if n_unique > unique_count]

    # Report which encoders will run (same messages as before).
    if OHE_cols and frequency_count_cols:
        print("OHE & freq",True)
    elif frequency_count_cols:
        print("count_f",True)
    elif OHE_cols:
        print("OHE",True)

    # Each encoder runs only when it has columns to work on. The previous
    # branching tested the combined count (> 2) and repeated the
    # frequency-only condition for the one-hot branch, so the one-hot-only
    # case was unreachable and some inputs returned None.
    if OHE_cols:
        encoder_ohe = OneHotEncoder(variables=OHE_cols,ignore_format=True)
        X_train = encoder_ohe.fit_transform(X_train)
        X_test = encoder_ohe.transform(X_test)

    if frequency_count_cols:
        encoder_freq = CountFrequencyEncoder(encoding_method='frequency',variables=frequency_count_cols,ignore_format=True)
        X_train = encoder_freq.fit_transform(X_train)
        X_test = encoder_freq.transform(X_test)

    return X_train,X_test

In [33]:
# Encode the object columns using the default cardinality cut-off of 30.
X_train,X_test = categorical_encoding(X_train,X_test,encode_dict)

OHE & freq True


# Cyclical Encoding

# Input required

In [34]:
def cyclical_encoding(Xy_train,Xy_test,date_col,target_column = list(pd.DataFrame(y_train).columns)[0]):
    """Add cyclical month/day features derived from ``date_col``.

    Rows whose ``date_col`` is missing are removed from both frames. Month
    and day-of-month are extracted and passed through ``CyclicalFeatures``
    (fitted on the training part, ``drop_original=True``), and features and
    target are separated again. ``date_col`` itself is kept in the output.

    Parameters
    ----------
    Xy_train, Xy_test : pandas.DataFrame
        Features and target combined. The caller's frames are not modified.
    date_col : str
        Column holding the dates.
    target_column : str
        Name of the target column. NOTE: the default is evaluated once, when
        this function is defined, from the global ``y_train`` that exists at
        that moment. Pass it explicitly if the target may have changed since.

    Returns
    -------
    X_train, X_test, y_train, y_test
    """
    # Drop rows without a date first and work on copies, so the derived
    # columns contain no NaN and the caller's frames are left untouched.
    Xy_train = Xy_train.dropna(subset=[date_col]).copy()
    Xy_test = Xy_test.dropna(subset=[date_col]).copy()

    for frame in (Xy_train, Xy_test):
        dates = pd.DatetimeIndex(frame[date_col])
        frame["Month"] = dates.month
        frame["Day"] = dates.day

    X_train = Xy_train.drop(columns=[target_column])
    y_train = Xy_train[target_column]

    X_test = Xy_test.drop(columns=[target_column])
    y_test = Xy_test[target_column]

    cyclical = CyclicalFeatures(variables=["Month","Day"], drop_original=True)
    X_train = cyclical.fit_transform(X_train)
    X_test = cyclical.transform(X_test)

    return X_train,X_test,y_train,y_test

In [35]:
# cyclical_encoding drops rows with a missing date, so features and target
# are joined first to keep them row-aligned, then split again by the function.
Xy_train = pd.concat([X_train,y_train],axis=1)
Xy_test = pd.concat([X_test,y_test],axis=1)
X_train,X_test,y_train,y_test = cyclical_encoding(Xy_train,Xy_test,"party_regdate")

# Normalize/standardize the data:

# Input required

In [36]:
def scale_features(X_train,X_test, features, method='normalization'):
    """Scale the given feature columns, fitting on the training frame only.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        The listed columns are overwritten in place.
    features : list of str
        Columns to scale. When empty, both frames are returned unchanged.
    method : {'normalization', 'standardization'}, default 'normalization'
        'normalization' uses ``MinMaxScaler``, 'standardization' uses
        ``StandardScaler``.

    Returns
    -------
    (X_train, X_test) : tuple of pandas.DataFrame

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """
    # Validate the method before doing any work.
    if method not in ('normalization', 'standardization'):
        raise ValueError("Unsupported scaling method. Please use 'normalization' or 'standardization'.")

    # A scaler cannot be fitted on zero columns; treat that as a no-op.
    if len(features) == 0:
        return X_train,X_test

    scaler = MinMaxScaler() if method == 'normalization' else StandardScaler()

    X_train[features] = scaler.fit_transform(X_train[features])
    X_test[features] = scaler.transform(X_test[features])

    return X_train,X_test


In [37]:
# Min-max scale only the columns recorded as numerical in `result`, which was built before categorical encoding.
X_train,X_test = scale_features(X_train,X_test, result['numerical'], method='normalization')

# Label encode target class (OHE/Labelencoder):

# Input required

In [38]:
def encode_target_variable(y_train, y_test, method='label_encoding'):
    """Encode the target, fitting the encoder on the training labels only.

    Parameters
    ----------
    y_train, y_test : pandas.Series or array-like
        Target labels.
    method : {'label_encoding', 'one_hot_encoding'}, default 'label_encoding'
        'label_encoding' uses ``LabelEncoder`` and returns single-column
        DataFrames. 'one_hot_encoding' uses ``OneHotEncoder`` on the labels
        wrapped in a DataFrame.

    Returns
    -------
    (y_train_encoded, y_test_encoded) : tuple of pandas.DataFrame
        For label encoding the frames keep the index of the input labels
        (when the input has one), so they stay aligned with the feature
        frames by index.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """
    if method == 'label_encoding':
        label_encoder = LabelEncoder()

        y_train_encoded = label_encoder.fit_transform(y_train)
        y_test_encoded = label_encoder.transform(y_test)

        # The encoder returns bare arrays; re-attach the original index
        # instead of letting DataFrame create a fresh 0..n-1 one.
        return (
            pd.DataFrame(y_train_encoded, index=getattr(y_train, "index", None)),
            pd.DataFrame(y_test_encoded, index=getattr(y_test, "index", None)),
        )

    elif method == 'one_hot_encoding':
        one_hot_encoder = OneHotEncoder(ignore_format=False)
        y_train_encoded = one_hot_encoder.fit_transform(pd.DataFrame(y_train))
        y_test_encoded = one_hot_encoder.transform(pd.DataFrame(y_test))
        return y_train_encoded, y_test_encoded

    else:
        raise ValueError("Unsupported encoding method. Please use 'label_encoding' or 'one_hot_encoding'.")


In [39]:
# Label-encode the target; the encoder is fitted on y_train and reused for y_test.
y_train_encoded, y_test_encoded= encode_target_variable(y_train, y_test, method='label_encoding')

In [40]:
# Final check: feature and target frames should have matching row counts per split.
print("Training set (X):", X_train.shape)
print("Testing set (X):", X_test.shape)
print("Training set (y):", y_train_encoded.shape)
print("Testing set (y):", y_test_encoded.shape)

Training set (X): (3982, 185)
Testing set (X): (449, 185)
Training set (y): (3982, 1)
Testing set (y): (449, 1)


In [41]:
# Persist the processed splits to the working directory. to_csv is called
# with its defaults, so each frame's index is written as the first column.
X_train.to_csv("X_train.csv")
X_test.to_csv("X_test.csv")
y_train_encoded.to_csv("y_train.csv")
y_test_encoded.to_csv("y_test.csv")

In [42]:
# Drop columns (Optional)