In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Importing the Required Libraries

In [None]:
import warnings  # warning handling

import lightgbm as lgb  # gradient boosting (baseline cross-validation model)
import matplotlib.pyplot as plt  # data visualization
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns  # enhanced data visualization
from catboost import CatBoostClassifier  # gradient boosting algorithm
from sklearn.metrics import roc_auc_score  # evaluation metric
from sklearn.model_selection import StratifiedKFold  # cross-validation
from sklearn.preprocessing import OneHotEncoder, LabelEncoder  # data preprocessing
from tqdm import tqdm  # progress bar

tqdm.pandas()

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.options.mode.chained_assignment = None  # suppress chained assignment warning
pd.set_option('display.float_format', lambda x: '%.3f' % x)  # formatting float values
plt.rcParams["figure.figsize"] = (12, 8)  # default figure size for plots
pd.set_option('display.max_columns', None)  # display all columns in DataFrame

### Function to Plot Feature Importance

In [None]:
# Plot the feature importances
def plotImp(model, X, num=30, fig_size=(60, 30),
            title='Catboost Features (avg over folds)',
            save_path='cb_importances-01.png'):
    """Draw a bar chart of the most important features and return the ranking.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    X : DataFrame whose ``columns`` label the importances, in the same order.
    num : number of top-ranked features to draw.
    fig_size : matplotlib figure size in inches.
    title : plot title.
        NOTE(review): the default text mentions CatBoost and fold averaging,
        but the function plots whatever ``model.feature_importances_`` holds.
    save_path : file the figure is written to; pass ``None`` to skip saving.

    Returns
    -------
    DataFrame with columns ``Value`` and ``Feature`` for every feature,
    sorted by descending importance.
    """
    # Sort once and reuse the result for both the plot and the return value.
    ranked = pd.DataFrame(
        {'Value': model.feature_importances_, 'Feature': X.columns}
    ).sort_values(by="Value", ascending=False)

    plt.figure(figsize=fig_size)
    # sns.set changes the seaborn theme globally, so later plots inherit
    # the enlarged font scale as well.
    sns.set(font_scale=5)
    sns.barplot(x="Value", y="Feature", data=ranked.head(num))
    plt.title(title)
    plt.tight_layout()  # keep long feature names inside the canvas
    if save_path is not None:
        plt.savefig(save_path)
    plt.show()
    return ranked

### Read the Train and Test files into the Notebook

In [None]:
# Root folder of the competition data; change it here when running outside Kaggle.
DATA_DIR = "/kaggle/input/fraud-detection-in-electricity-and-gas-consumption"

# Client tables (train/test) and their invoice tables (itrain/itest).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
train = pd.read_csv(f"{DATA_DIR}/train/client_train.csv", low_memory=False)
test = pd.read_csv(f"{DATA_DIR}/test/client_test.csv", low_memory=False)
itrain = pd.read_csv(f"{DATA_DIR}/train/invoice_train.csv", low_memory=False)
itest = pd.read_csv(f"{DATA_DIR}/test/invoice_test.csv", low_memory=False)

In [None]:
train

In [None]:
itrain

### Function to Reduce Memory Usage

In [None]:
#Reduce Memory Usage
def reduce_memory_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to the narrowest safe dtype.

    * object / string columns become ``category``;
    * signed and unsigned integer columns shrink to the smallest integer type
      of the same signedness whose range contains the column's min and max
      (bounds inclusive);
    * float columns shrink to float16 (only if ``use_float16``) or float32
      when their min and max fit;
    * everything else (bool, datetime, timedelta, existing categoricals and
      other pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : DataFrame to shrink; it is modified in place.
    use_float16 : allow float16 as a target. float16 keeps only about three
        significant decimal digits, so pass ``False`` when precision matters.

    Returns
    -------
    The same ``df`` object, for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        dtype = df[col].dtype

        if dtype == object or isinstance(dtype, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue
        if not isinstance(dtype, np.dtype):
            # category, nullable and tz-aware dtypes: nothing safe to do here
            continue

        # Dispatch on the numpy kind rather than on the dtype name, so bool
        # and unsigned columns are not mistaken for floats.
        if dtype.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif dtype.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif dtype.kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            continue

        c_min, c_max = df[col].min(), df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # empty or all-NaN column: no range to fit
            continue

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= dtype.itemsize:
                break  # only ever narrow, never widen
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    return df

### Sort the invoice datasets by client id and date, and convert the date columns of all datasets to datetime format

In [None]:
# Put each client's invoices next to each other, ordered by invoice date.
# NOTE(review): the sort runs on the raw column before it is parsed, so the
# order is chronological only if the raw values already sort that way
# (e.g. ISO-formatted strings) - confirm against the CSV.
invoice_order = ['client_id', 'invoice_date']
itrain = itrain.sort_values(invoice_order).reset_index(drop=True)
itest = itest.sort_values(invoice_order).reset_index(drop=True)

# Cast the date column of every table to datetime64[ns].
for dataset, date_col in ((itrain, 'invoice_date'), (itest, 'invoice_date'),
                          (train, 'creation_date'), (test, 'creation_date')):
    dataset[date_col] = dataset[date_col].astype('datetime64[ns]')

In [None]:
def visualize_data(data, max_categories=None, sample_size=None, random_state=0):
    """Plot a quick visual overview of every column in ``data``.

    Draws, in this order: a count plot per object/category column, a scatter
    plot for every pair of int64/float64 columns, and a histogram (with KDE)
    per int64/float64 column. Each figure is shown and then closed so that
    figures do not accumulate in memory.

    Parameters
    ----------
    data : DataFrame to explore.
    max_categories : if given, categorical columns with more distinct values
        than this are skipped (a count plot with thousands of bars is
        unreadable and slow). ``None`` plots every categorical column.
    sample_size : if given and smaller than ``len(data)``, plots are drawn
        from a random sample of that many rows. ``None`` uses all rows.
    random_state : seed for the optional row sampling.
    """
    if sample_size is not None and len(data) > sample_size:
        data = data.sample(n=sample_size, random_state=random_state)

    # Separating numerical and categorical columns
    numerical_cols = data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    categorical_cols = data.select_dtypes(include=['object', 'category']).columns.tolist()

    # Count plots for categorical columns
    for col in categorical_cols:
        if max_categories is not None and data[col].nunique() > max_categories:
            continue
        fig = plt.figure(figsize=(8, 6))
        sns.countplot(x=col, data=data)
        plt.title(f'Countplot of {col}')
        plt.xticks(rotation=45)
        plt.show()
        plt.close(fig)

    # Scatter plots for every unordered pair of numerical columns
    for position, x_col in enumerate(numerical_cols):
        for y_col in numerical_cols[position + 1:]:
            fig = plt.figure(figsize=(8, 6))
            sns.scatterplot(x=x_col, y=y_col, data=data)
            plt.title(f'Scatter plot between {x_col} and {y_col}')
            plt.show()
            plt.close(fig)

    # Histograms for numerical columns
    for col in numerical_cols:
        fig = plt.figure(figsize=(8, 6))
        sns.histplot(data[col], kde=True)
        plt.title(f'Histogram of {col}')
        plt.show()
        plt.close(fig)

# Visualizing the data



In [None]:
itrain.shape

In [None]:
visualize_data(train)

In [None]:
visualize_data(itrain)

### Create new date features

In [None]:
def add_date_features(frame, date_col):
    """Replace a datetime column of ``frame`` with numeric features, in place.

    Adds ``<date_col>_Date_Int`` (seconds since the Unix epoch, as float),
    ``<date_col>_Day``, ``<date_col>_Month`` and ``<date_col>_Year``, then
    drops ``date_col``. Does nothing when ``date_col`` is already gone, so the
    cell can be re-run safely.
    """
    if date_col not in frame.columns:
        return
    dates = frame[date_col]
    # The column was cast to datetime64[ns] earlier, so the int64 view counts
    # nanoseconds; 1e-9 rescales that to seconds.
    frame[f'{date_col}_Date_Int'] = dates.astype(np.int64) * 1e-9
    frame[f'{date_col}_Day'] = dates.dt.day
    frame[f'{date_col}_Month'] = dates.dt.month
    frame[f'{date_col}_Year'] = dates.dt.year
    frame.drop(date_col, inplace=True, axis=1)


for dataset in (itrain, itest):
    add_date_features(dataset, 'invoice_date')

for dataset in (train, test):
    add_date_features(dataset, 'creation_date')

# Only the (large) invoice tables are downcast; the client tables are left as-is.
itrain = reduce_memory_usage(itrain)
itest = reduce_memory_usage(itest)

In [None]:
train

In [None]:
itrain['counter_statue'].dtype

In [None]:
itrain['counter_statue'].value_counts()

In [None]:
# Stray counter_statue codes and the regular code each one is folded into.
# Same mapping the notebook already applied to the training invoices,
# expressed on the string form of the values.
COUNTER_STATUE_FIXES = {'A': '0', '769': '5', '618': '5',
                        '269375': '5', '46': '5', '420': '5'}


def string(dataset):
    """Normalise ``counter_statue`` to clean string labels, in place.

    Values are converted to ``str`` first (the raw column can mix integers and
    text) and the stray codes in ``COUNTER_STATUE_FIXES`` are then replaced.
    The same cleanup is applied to whichever invoice table is passed, so the
    train and test invoices end up with the same label set.
    """
    dataset['counter_statue'] = (
        dataset['counter_statue'].astype(str).replace(COUNTER_STATUE_FIXES)
    )


train['target'] = train['target'].astype(int)

string(itrain)
string(itest)

In [None]:
itrain['counter_statue'].value_counts()

In [None]:
itrain['counter_statue'].dtype

In [None]:
ID = test['client_id']

# Feature Engineering

In [None]:
# Key column(s) the invoice rows are grouped by when building client features.
Aggs_based = ['client_id']

# Invoice columns that only receive the numeric statistics.
Aggs_num = [
    'consommation_level_1',
    'consommation_level_2',
    'consommation_level_3',
    'consommation_level_4',
    'months_number',
]

# Invoice columns that are label-encoded and then receive both the numeric
# statistics and the mode / distinct-count aggregations.
Aggs_cat = [
    'reading_remarque',
    'counter_coefficient',
    'tarif_type',
    'counter_number',
    'counter_statue',
    'counter_code',
    'old_index',
    'new_index',
    'counter_type',
    'invoice_date_Date_Int',
    'invoice_date_Day',
    'invoice_date_Month',
    'invoice_date_Year',
]

In [None]:
# Difference between the new and old index values of each invoice.
for dataset in [itrain, itest]:
    dataset['NewDiffOld'] = dataset['new_index'] - dataset['old_index']

# Register the feature only once, so re-running the cell does not queue the
# same column for encoding and aggregation twice.
if 'NewDiffOld' not in Aggs_cat:
    Aggs_cat.append('NewDiffOld')

In [None]:
itrain

In [None]:
# Bucket the region code into three bands: <=100 -> 1, >=300 -> 3, otherwise 2.
# np.select evaluates the conditions on the whole column at once instead of
# calling a Python lambda per row; missing values fall into the default band 2,
# exactly as they did with the row-wise comparison.
for dataset in [train, test]:
    region = dataset['region']
    dataset['region_bins'] = np.select([region <= 100, region >= 300], [1, 3], default=2)

In [None]:
itrain.counter_statue.value_counts()

In [None]:
itest.counter_statue.value_counts()

In [None]:
# Account age in months relative to the reference year: an account created in
# December of the year before REFERENCE_YEAR gets 0, one created a year
# earlier gets 12, and so on.
REFERENCE_YEAR = 2023

# The column must carry the same name in both tables; a one-letter difference
# between train and test makes the feature all-NaN on the other side once the
# two tables are concatenated.
for dataset in [train, test]:
    dataset['MonthSinceAccountCreation'] = (
        (REFERENCE_YEAR - dataset['creation_date_Year']) * 12
        - dataset['creation_date_Month']
    )

In [None]:
def _as_int64(series):
    """Widen a numpy signed-integer column to int64; return anything else unchanged.

    The invoice tables were downcast earlier, so a product of two narrow
    integer columns could silently wrap around without this.
    """
    if isinstance(series.dtype, np.dtype) and series.dtype.kind == 'i':
        return series.astype(np.int64)
    return series


# Arithmetic combinations of the counter code and counter number.
for dataset in [itrain, itest]:
    code = _as_int64(dataset['counter_code'])
    number = _as_int64(dataset['counter_number'])
    dataset['counter_code_number_add'] = code + number
    dataset['counter_code_number_sub'] = code - number
    dataset['counter_code_number_prod'] = code * number
    # A zero counter_number yields +/-inf; store NaN instead so the later
    # aggregations and the median fill treat it as a missing value.
    dataset['counter_code_number_div'] = (code / number).replace([np.inf, -np.inf], np.nan)

# Register the new columns once, even if the cell is re-run.
for new_feature in ['counter_code_number_add', 'counter_code_number_sub',
                    'counter_code_number_prod', 'counter_code_number_div']:
    if new_feature not in Aggs_num:
        Aggs_num.append(new_feature)

In [None]:
Aggs_cat

In [None]:
# Label-encode every column listed in Aggs_cat. The encoder is fitted on the
# train and test values together so both tables share one code table.
# Only the column being encoded is concatenated, which avoids materialising a
# full copy of both invoice tables.
le = LabelEncoder()
for f in Aggs_cat:
    le.fit(pd.concat([itrain[f], itest[f]], ignore_index=True))
    itrain[f] = le.transform(itrain[f])
    itest[f] = le.transform(itest[f])

### Aggregations (Numerical)

In [None]:
# Statistics computed per group; each becomes one client-level column.
NUMERIC_STATS = ['mean', 'median', 'std', 'min', 'max', 'sum']


def Agg(Feature):
    """Add per-group statistics of an invoice column to the client tables.

    For each key in ``Aggs_based`` the invoice rows are grouped by that key
    and ``Feature`` is summarised with mean, median, std, min, max and sum.
    The results are mapped onto ``train`` (from ``itrain``) and ``test``
    (from ``itest``) as ``<Feature>_Agg_<key>_<stat>``, plus a ``_range``
    column equal to max minus min. Both client tables are modified in place.
    """
    # Pair each client table with its invoice table explicitly instead of
    # comparing whole DataFrames to find out which one is being processed.
    for client, dataset in ((train, itrain), (test, itest)):
        for feat_1 in Aggs_based:
            # One groupby pass yields all statistics. observed=True keeps
            # only keys that actually occur when the key is categorical.
            stats = dataset.groupby(feat_1, observed=True)[Feature].agg(NUMERIC_STATS)
            prefix = f'{Feature}_Agg_{feat_1}'
            for stat in NUMERIC_STATS:
                client[f'{prefix}_{stat}'] = client[feat_1].map(stats[stat].to_dict())
            client[f'{prefix}_range'] = client[f'{prefix}_max'] - client[f'{prefix}_min']


for feat in tqdm(Aggs_num + Aggs_cat):
    Agg(feat)

### Aggregations (Categorical)

In [None]:
def Agg(Feature):
    """Add the per-group mode and distinct count of an invoice column.

    For each key in ``Aggs_based`` the invoice rows are grouped by that key
    and two columns are mapped onto ``train`` (from ``itrain``) and ``test``
    (from ``itest``): ``<Feature>_Agg_<key>_mode`` (first mode of the group)
    and ``<Feature>_Agg_<key>_nunique``. Both client tables are modified in
    place.

    NOTE: this rebinds the name ``Agg`` that the numeric-aggregation cell also
    defines, so the cells must be run in notebook order.
    """
    # Pair each client table with its invoice table explicitly instead of
    # comparing whole DataFrames to find out which one is being processed.
    for client, dataset in ((train, itrain), (test, itest)):
        for feat_1 in Aggs_based:
            # observed=True skips unused categories of a categorical key, for
            # which the mode lookup would have nothing to return.
            grouped = dataset.groupby(feat_1, observed=True)[Feature]
            prefix = f'{Feature}_Agg_{feat_1}'
            modes = grouped.agg(lambda values: pd.Series.mode(values)[0])
            client[f'{prefix}_mode'] = client[feat_1].map(modes.to_dict())
            client[f'{prefix}_nunique'] = client[feat_1].map(grouped.nunique().to_dict())


for feat in tqdm(Aggs_cat):
    Agg(feat)

### Drop Client ID

In [None]:
# The client id is only a join key (the test ids were saved in ID earlier).
# errors='ignore' lets the cell be re-run after the column is already gone.
train = train.drop(columns='client_id', errors='ignore')
test = test.drop(columns='client_id', errors='ignore')

## Feature Encoding

### Label Encoding

In [None]:
# Integer-encode whatever object/category columns remain in the client table.
# Each encoder is fitted on train and test stacked together, so both tables
# share the same code for the same value.
feats = train.select_dtypes(include=['object', 'category']).columns.tolist()
df = pd.concat([train, test])
le = LabelEncoder()
for f in feats:
    print(f)  # report which column is being encoded
    le.fit(df[f])
    train[f] = le.transform(train[f])
    test[f] = le.transform(test[f])

### One-Hot Encoding

In [None]:
# One-hot encode the listed columns on train and test stacked together, so
# both tables get the same set of dummy columns. The original columns are kept.
feats = ['region', 'disrict']
n_train = train.shape[0]  # remembered so the stacked frame can be split again

df = pd.concat([train, test])
# prefix=feat names each dummy '<feat>_<value>'.
dummies = [pd.get_dummies(df[feat], prefix=feat) for feat in feats]
df = pd.concat([df] + dummies, axis=1)

# Split by position and take real copies: later cells modify train and test,
# which must not be views into df.
train = df.iloc[:n_train].copy()
test = df.iloc[n_train:].drop(columns='target')

### Drop Duplicates and Constant Features

In [None]:
print('Features Before Dropping: ', train.shape)

# Drop duplicate features: a column is a duplicate when it is identical (in
# train) to a column that appears before it. Each pair is compared once, and
# 'target' is never discarded even if a feature happens to equal it.
cols = list(train.columns)
dup = []
seen_dup = set()
for position, feat_1 in enumerate(tqdm(cols)):
    if feat_1 in seen_dup:
        continue
    for feat_2 in cols[position + 1:]:
        if feat_2 in seen_dup or feat_2 == 'target':
            continue
        if train[feat_1].equals(train[feat_2]):
            dup.append(feat_2)
            seen_dup.add(feat_2)
train = train.drop(columns=dup)
test = test.drop(columns=dup, errors='ignore')

# Drop constant features: a column is dropped when it has at most one distinct
# non-missing value in train or in test. "<= 1" also catches columns that are
# entirely missing, which a "== 1" test lets through.
constant = [feat for feat in tqdm(test.columns)
            if train[feat].nunique() <= 1 or test[feat].nunique() <= 1]
train = train.drop(columns=constant)
test = test.drop(columns=constant)

print('Features After Dropping: ', train.shape)

### Fill missing values with their Median

In [None]:
# Impute with the TRAINING medians in both tables: the test set must be
# transformed with statistics learned from the training data, otherwise the
# same missing value is filled differently at train and at prediction time.
train_medians = train.median(numeric_only=True)
train = train.fillna(train_medians)
# 'target' exists only in train, so it is removed from the fill values for test.
test = test.fillna(train_medians.drop(labels='target', errors='ignore'))

### Defining Model Parameters

In [None]:
# CatBoost hyper-parameters used for validation and for the final model.
cb_params = {
    'depth': 8,
    'iterations': 5000,
    'learning_rate': 0.0164391346853785,
    'task_type': 'GPU',  # training runs on the GPU; a GPU runtime is required
    'reg_lambda': 21.97780539780917,
    'verbose': 0,  # no per-iteration log output
}
# Fixed seed so repeated runs build the same model.
cb = CatBoostClassifier(random_state=42, **cb_params)

### Validation Score

In [None]:
# Baseline: 10-fold stratified cross-validation of a default LightGBM model,
# scored with ROC AUC.
print('Validating...')
X = train.drop('target', axis=1).values
y = train['target'].values

scores = []
for fold, (train_index, test_index) in enumerate(StratifiedKFold(n_splits=10).split(X, y)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Use a dedicated name: binding this model to `cb` would replace the
    # CatBoost classifier that the following cells validate, plot and train.
    lgb_model = lgb.LGBMClassifier(random_state=42)
    lgb_model.fit(X_train, y_train)
    y_pred = lgb_model.predict_proba(X_test)[:, 1]  # probability of the positive class
    score = roc_auc_score(y_test, y_pred)
    scores.append(score)
    print(score)

print("\nMean:", np.mean(scores), "\nSTD:", np.std(scores))

In [None]:
# 10-fold stratified cross-validation of the CatBoost model, scored with ROC AUC.
print('Validating...')

X = train.drop('target',axis=1).values
y = train['target'].values

# Rebuild the classifier from cb_params instead of trusting the global `cb`:
# the LightGBM validation cell assigns its own model to that name, so without
# this line the loop below would score LightGBM again.
cb = CatBoostClassifier(**cb_params, random_state=42)

scores = []
for fold, (train_index, test_index) in enumerate(StratifiedKFold(n_splits=10).split(X, y)):
    X_Train, X_Test = X[train_index], X[test_index]
    y_Train, y_Test = y[train_index], y[test_index]
    cb.fit(X_Train,y_Train)
    y_pred = cb.predict_proba(X_Test)[:,1]  # probability of the positive class
    scores.append(roc_auc_score(y_Test,y_pred))
    print(scores[-1])

# After the loop `cb` holds the model fitted on the last fold; the feature
# importance plot below reads its importances.
print("\nMean:",np.mean(scores),"\nSTD: ", np.std(scores))

### Plot Feature Importance

In [None]:
imps = plotImp(cb,train.drop('target',axis=1))

### Drop Features with low importance

In [None]:
# Remove every feature whose importance in `imps` is exactly zero,
# from both the training and the test table.
useless_features = imps.loc[imps['Value'] == 0, 'Feature'].values
for dataset in (train, test):
    dataset.drop(useless_features, inplace=True, axis=1)

### Training and Test Prediction

In [None]:
# Fit on the full training table, then score the test table.
X = train.drop('target',axis=1)
y = train['target']

cb.fit(X,y)
# Select exactly the training feature columns, in training order. This keeps
# the inputs aligned with the model and makes the cell re-runnable: on a
# second run `test` already contains the 'target' column written below.
test['target'] = cb.predict_proba(test[X.columns])[:,1]

### Submission

In [None]:
# Pair each saved test client id with its predicted positive-class
# probability and write the submission file.
predicted = test["target"].to_numpy()
submission = pd.DataFrame({"ID": ID, "Target": predicted})
submission.to_csv("FEWT.csv", index=False)
