In [None]:
import pandas as pd
import numpy as np
import missingno as msno 
import seaborn as sns
import matplotlib.pyplot as plt 

#import warnings
#warnings.filterwarnings('ignore')



In [None]:
# Paths of the two training tables (client data and invoice data).
CLIENT_CSV = 'data/client_train.csv'
INVOICE_CSV = 'data/invoice_train.csv'

# Read each table into its own DataFrame.
df_clients = pd.read_csv(CLIENT_CSV)
df_invoice = pd.read_csv(INVOICE_CSV)

In [None]:
# Preview the first five rows of the client table.
df_clients.head()

In [None]:
# Preview the first five rows of the invoice table.
df_invoice.head()

## Column descriptions:

### Client Data

* Client_id: Unique id for client
* District: District where the client is
* Client_catg: Category client belongs to
* Region: Area where the client is
* Creation_date: Date client joined
* Target: fraud:1 , not fraud: 0

### Invoice Data

* Client_id: Unique id for the client
* Invoice_date: Date of the invoice
* Tarif_type: Type of tax
* Counter_number:
* Counter_statue: takes up to 5 values such as working fine, not working, on hold status, etc.
* Counter_code:
* Reading_remarque: notes that the STEG agent takes during his visit to the client (e.g: If the counter shows something wrong, the agent gives a bad score)
* Counter_coefficient: An additional coefficient to be added when standard consumption is exceeded
* Consommation_level_1: Consumption_level_1
* Consommation_level_2: Consumption_level_2
* Consommation_level_3: Consumption_level_3
* Consommation_level_4: Consumption_level_4
* Old_index: Old index
* New_index: New index
* Months_number: Month number
* Counter_type: Type of counter

The consumption level refers to the consumption threshold to which a certain price is attributed.

In [None]:
#Y = coffee_quality["quality_score"]

# Data cleaning and feature engineering

In [None]:
# Column dtypes, non-null counts and memory usage of the client table.
df_clients.info()

In [None]:
# Column dtypes, non-null counts and memory usage of the invoice table.
df_invoice.info()

In [None]:
# Number of missing values in each invoice column.
df_invoice.isnull().sum()

In [None]:
# Summary statistics of the numeric invoice columns, transposed so that each column becomes a row.
df_invoice.describe().T

In [None]:
# Left join on client_id: every client row is kept and repeated once per matching invoice.
df_joined = df_clients.merge(df_invoice, how="left", on="client_id")

In [None]:
# Preview the first 20 rows of the joined client/invoice table.
df_joined.head(20)

In [None]:
# Distinct clients (printed) and distinct counter numbers (cell output) in the joined table.
print(df_joined['client_id'].nunique())
df_joined['counter_number'].nunique()

In [None]:
# Number of distinct values in every column of the joined table.
df_joined.nunique()

In [None]:
# Distinct values of counter_type.
df_joined['counter_type'].unique()

In [None]:
# Distinct values of client_catg.
df_joined['client_catg'].unique()

In [None]:
# Distinct values of tarif_type.
df_joined['tarif_type'].unique()

In [None]:
# Non-null count of every other column for each counter_statue value.
df_joined.groupby('counter_statue').count()

In [None]:
# Non-null count of every other column for each counter_code value.
df_joined.groupby('counter_code').count()

In [None]:
# Distinct values of counter_code.
df_joined['counter_code'].unique()

In [None]:
# Distinct values of reading_remarque.
df_joined['reading_remarque'].unique()

In [None]:
# Non-null count of every other column for each reading_remarque value.
df_joined.groupby('reading_remarque').count()

In [None]:
# Distinct values of counter_coefficient.
df_joined['counter_coefficient'].unique()

In [None]:
# Distinct values of old_index.
df_joined['old_index'].unique()

In [None]:
# Column dtypes, non-null counts and memory usage of the joined table.
df_joined.info()

## To Dos

* remove client_id column
* turn into dummies: client_catg, tarif_type
* change target to 0,1 - DONE
* turn creation_date, invoice_date  into datetime or something else - DONE
* feature engineering: number of counters per client. then drop client_id, counter_number
* counter_statue: turn strings 0-5 into int, check percentage of values not 0-5, check for pattern, remove - DONE
* counter_code: either dummies or find info on steg site or drop?
* reading_remarque: clean? turn into dummies
* counter_coefficient: try both?: treat numerically and as dummies?
* rescale consommation_level1 ... _4
* drop index old and new
* rescale months_number
* turn into dummy counter_type

In [None]:
# Count fully duplicated rows. The count is printed explicitly because a
# notebook cell only displays its last expression, and the last statement
# here is the de-duplication itself.
n_duplicates = int(df_joined.duplicated().sum())
print(f"duplicated rows: {n_duplicates}")

# Keep the first occurrence of each duplicated row (an earlier run found 11 duplicate rows).
df_joined = df_joined.drop_duplicates()

In [None]:
# Distinct raw values of counter_statue, inspected before cleaning.
df_joined['counter_statue'].unique()

In [None]:
# counter_statue: turn strings 0-5 into numbers and mark everything else as missing.
#
# The column holds a mix of ints and strings. Valid statuses are the integers
# 0-5; any other value ('A', 618, 269375, ...) becomes NaN. Parsing the values
# numerically, rather than listing each raw value in a lookup table, also
# handles string codes such as '2' or '3' and any junk value not seen so far.
VALID_COUNTER_STATUES = list(range(6))

# errors='coerce' turns non-numeric entries such as 'A' into NaN.
counter_statue_numeric = pd.to_numeric(df_joined['counter_statue'], errors='coerce')

# Keep only values inside the valid range; where() replaces the rest with NaN.
df_joined['counter_statue'] = counter_statue_numeric.where(
    counter_statue_numeric.isin(VALID_COUNTER_STATUES)
)
df_joined['counter_statue'].nunique()

In [None]:
# check for missing distribution

#msno.matrix(df_joined)

In [None]:
# Share of rows with a missing counter_statue, i.e. the data lost by dropping them (0.001% when first computed).
print(f"numbers of rows : {df_joined.shape[0]}")
print(f"missing values in counter statue : {round(df_joined.counter_statue.isna().sum()/df_joined.shape[0]*100,4)} %")

# Build the cleaned frame as an independent copy so df_joined stays untouched.
# dropna removes every row that has a missing value in ANY column, not only counter_statue.
df_processed = df_joined.dropna(axis=0).copy()


In [None]:
# Missing values per column before dropping rows.
df_joined.isna().sum()

In [None]:
# Missing values per column in the cleaned frame.
df_processed.isna().sum()

In [None]:
# Cast the label from float to integer classes (0, 1) and show the resulting values.
df_processed['target'] = df_processed['target'].astype(int)
df_processed['target'].unique()

In [None]:
# Preview the first five rows of the cleaned frame.
df_processed.head()

In [None]:
# Parse the two date columns; each one is stored in its own text format.
date_formats = {
    'invoice_date': '%Y-%m-%d',
    'creation_date': '%d/%m/%Y',
}
for date_column, date_format in date_formats.items():
    df_processed[date_column] = pd.to_datetime(df_processed[date_column], format=date_format)

## Establishing Baseline Model

In [None]:
# Column names of the cleaned frame.
df_processed.columns

In [None]:
# Correlation heatmap of the features and the target.
heatmap_columns = [
    'disrict', 'client_catg', 'region', 'creation_date',
    'target', 'invoice_date', 'tarif_type', 'counter_number',
    'counter_statue', 'counter_code', 'reading_remarque',
    'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
    'consommation_level_3', 'consommation_level_4', 'old_index',
    'new_index', 'months_number',
]

# Correlation is only defined for numeric data. Since pandas 2.0, corr() no
# longer drops other dtypes by default, so the two datetime columns are
# filtered out explicitly before computing it.
corr_matrix = df_processed[heatmap_columns].select_dtypes(include='number').corr()

fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Pairwise correlation of numeric columns');



In [None]:
# The pairplot took a long time to render, so we have not looked at it yet and leave it disabled for now.
# Every line of the call has to stay commented out: if only the first line is
# commented, the indented continuation lines are parsed as code and the cell
# fails with an IndentationError.

# pairplot

#fig = plt.figure(figsize=(20,10))
#sns.pairplot(df_processed[['disrict', 'client_catg', 'region', 'creation_date', 'invoice_date', 'tarif_type', 'counter_number',
#       'counter_statue', 'counter_code', 'reading_remarque',
#       'counter_coefficient', 'consommation_level_1', 'consommation_level_2',
#       'consommation_level_3', 'consommation_level_4', 'months_number', 'target']], hue='target')

In [None]:
# The target correlates most strongly with the client category, so look at
# how many rows fall into each category.
df_processed['client_catg'].value_counts()

In [None]:
# Target distribution per client category, one plot per category, in
# descending order of category size. This cell: category 11.
catg_11_rows = df_processed.query('client_catg == 11')

fig = plt.figure(figsize=(20,10))
sns.histplot(data=catg_11_rows, x='client_catg', hue='target', stat='percent');

In [None]:
# Target distribution (in percent) for client category 51.
catg_51_rows = df_processed.query('client_catg == 51')

fig = plt.figure(figsize=(20,10))
sns.histplot(data=catg_51_rows, x='client_catg', hue='target', stat='percent')

In [None]:
# Target distribution (in percent) for client category 12.
catg_12_rows = df_processed.query('client_catg == 12')

fig = plt.figure(figsize=(20,10))
sns.histplot(data=catg_12_rows, x='client_catg', hue='target', stat='percent')

In [None]:
# the highest amount of fraud is committed by clients assigned to category 51, so our very first guess for a hypothesis and baseline model is:

# client category is the best predictor for fraud

## Next Steps
* run baseline model and print the different scores for it
* Drop old and new index
* Train-Test-Split
* Define Target y, and Features X
* Feature Engineering
* Dummy Creating
* Rescaling based on train, apply to test
* dropping other unused columns
* export X_test, X_train, y_test and y_train to a new notebook for modeling (find library to help us with that)
* individual modeling

## Splitting data for testing 

In [None]:
from sklearn.model_selection import train_test_split

# X and Y are built here because no earlier cell defines them. Without them
# the split fails with a NameError on a fresh kernel. The label is the fraud
# flag `target`; the features are all remaining columns of the cleaned frame.
X = df_processed.drop(columns=['target'])
Y = df_processed['target']

# NOTE(review): rows are invoices joined to clients, so a plain row-wise split
# can place invoices of the same client in both sets. Consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
# Add the natural log of the mean altitude, then drop the raw altitude column
# together with the 'Quakers' and 'Unnamed: 0' columns.
# NOTE(review): none of these column names appear in the df_processed column
# list used for the heatmap earlier in this notebook — confirm this cell
# belongs to this dataset.
X_train["altitude_mean_log"] = np.log(X_train["altitude_mean_meters"])
for unused_column in ('altitude_mean_meters', 'Quakers', 'Unnamed: 0'):
    X_train.drop([unused_column], axis=1, inplace=True)

In [None]:
# Column dtypes and non-null counts of the training features after the transformations above.
X_train.info()

In [None]:
# Column means taken from the training split only. They are reused later as
# fill values for both the training and the test features.
altitude_low_meters_mean, altitude_high_meters_mean, altitude_mean_log_mean = (
    X_train[altitude_column].mean()
    for altitude_column in ("altitude_low_meters", "altitude_high_meters", "altitude_mean_log")
)

In [None]:
# Replace missing altitude values with the corresponding training-set mean.
train_fill_values = {
    "altitude_low_meters": altitude_low_meters_mean,
    "altitude_high_meters": altitude_high_meters_mean,
    "altitude_mean_log": altitude_mean_log_mean,
}
for altitude_column, fill_value in train_fill_values.items():
    X_train[altitude_column] = X_train[altitude_column].fillna(fill_value)

In [None]:
# Report the training-set means that are used as fill values for missing altitudes.
print(f"altitude low meters mean is {altitude_low_meters_mean}")
print(f"altitude_high_meters_mean is {altitude_high_meters_mean}")
print(f"altitude_mean_log_mean is {altitude_mean_log_mean}")

## Training the model

In [None]:
# Save the held-out split (X_test and y_test) to CSV so that prediction can be demonstrated on it later.
# NOTE(review): this runs before the cell that transforms X_test further down,
# so the saved features are the untransformed ones — confirm that is intended.
X_test.to_csv("data/X_test.csv")
y_test.to_csv("data/y_test.csv")

In [None]:
# Fit a linear regression on the training split.
# NOTE(review): the notebook's label is described as a 0/1 fraud flag —
# confirm that a regressor (rather than a classifier) is intended here.
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train);

In [None]:
from sklearn.metrics import mean_squared_error

# Training error: mean squared error between the model's predictions and the training labels.
y_train_pred = reg.predict(X_train)
mse = mean_squared_error(y_true=y_train, y_pred=y_train_pred)
print(mse)

In [None]:
# Apply the same feature preparation to the test split as to the training split:
# log-transform the mean altitude and drop the columns that are not used.
# NOTE(review): these column names do not appear in the df_processed column
# list used earlier in this notebook — confirm this cell belongs to this dataset.
X_test["altitude_mean_log"] = np.log(X_test["altitude_mean_meters"])
for unused_column in ('altitude_mean_meters', 'Quakers', 'Unnamed: 0'):
    X_test.drop([unused_column], axis=1, inplace=True)

# Fill missing values with the means computed on the TRAINING split, so no
# statistic is taken from the test data.
test_fill_values = {
    "altitude_low_meters": altitude_low_meters_mean,
    "altitude_high_meters": altitude_high_meters_mean,
    "altitude_mean_log": altitude_mean_log_mean,
}
for altitude_column, fill_value in test_fill_values.items():
    X_test[altitude_column] = X_test[altitude_column].fillna(fill_value)

In [None]:
# Test error: mean squared error between the model's predictions and the held-out labels.
y_test_pred = reg.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
print(mse)