<a href="https://colab.research.google.com/github/AhmedAmine98/AhmedAmine98-Tunisian_Fraud_Detection.ipynb/blob/master/Ahmed_Amine_Fatnassi_notebok_Tunisian_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **IMPORT DATA**

In [None]:
# Mount Google Drive inside the Colab filesystem at /content/drive (only works when run in Colab).
# NOTE(review): the CSV files loaded later in this notebook are read from /content, not from
# /content/drive -- confirm whether this mount is actually needed.
 
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", 160)  # to display all columns of our data 

import lightgbm as lgb                             # Modeling
from math import sqrt
from sklearn.metrics import make_scorer, mean_squared_error  #scoring
from sklearn.model_selection import train_test_split  
from sklearn.model_selection import cross_val_score

import warnings
warnings.simplefilter('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:

# Load the training set, the test set and the sample submission file, in that order.
train, test, Submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/SUPCOM_Train.csv',
        '/content/SUPCOM_Test.csv',
        '/content/SUPCOM_SampleSubmission.csv',
    )
)

In [None]:
# (rows, columns) of the training and test frames, shown side by side.
(train.shape, test.shape)

# **EDA**

In [None]:
# Scatter of target against TVA_TOTDUE on a random subset of rows (keeps plotting fast).
# - random_state pins the subset so the figure is identical after Restart & Run All.
# - the subset size is capped at len(train) so a frame with fewer than 10000 rows does not raise.
# `sample` is reused by the next plotting cell.
sample = train.sample(n=min(10000, len(train)), random_state=42)

fig, ax = plt.subplots()
ax.scatter(sample['TVA_TOTDUE'], sample['target'], alpha=0.3)
ax.set(xlabel='TVA_TOTDUE', ylabel='target', title='target vs TVA_TOTDUE (random sample)');

In [None]:
# Scatter of target against CTR_MATFIS, using the random subset `sample` drawn in the previous cell.
# Explicit axes plus labels/title so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(sample['CTR_MATFIS'], sample['target'], alpha=0.3)
ax.set(xlabel='CTR_MATFIS', ylabel='target', title='target vs CTR_MATFIS (random sample)');

In [None]:
# Plot trend year-on-year
train.groupby('EXE_EXERCI').mean().reset_index().plot(y='target', x='EXE_EXERCI', kind='bar')
                                                #ylim=(0, 0.03))

In [None]:
# Plot trend year-on-year
train.groupby('RES_ANNIMP').mean().reset_index().plot(y='target', x='RES_ANNIMP', kind='bar')
                                                #ylim=(0, 0.03))

In [None]:
# Overlay the mean target and the mean TVA_TOTDUE per RES_ANNIMP value on one set of axes.
# The per-group means are computed once, and only for the two plotted columns, so non-numeric
# columns are never averaged (which raises a TypeError with pandas >= 2.0).
means_by_annimp = train.groupby('RES_ANNIMP', as_index=False)[['target', 'TVA_TOTDUE']].mean()

ax = means_by_annimp.plot(x='RES_ANNIMP', y='target')
means_by_annimp.plot(x='RES_ANNIMP', y='TVA_TOTDUE', ax=ax)

# **Feature  Engineering**

In [None]:
# Encode CTR_CATEGO_X as integer codes.
# One encoder is fitted on the values of BOTH frames and then applied to each of them.
# Fitting a separate encoder per frame (as before) assigns codes from each frame's own sorted
# set of categories, so the same category can end up with different codes in train and test.

from sklearn.preprocessing import LabelEncoder

catego_encoder = LabelEncoder().fit(
    pd.concat([train['CTR_CATEGO_X'], test['CTR_CATEGO_X']], ignore_index=True)
)
train['CTR_CATEGO_X'] = catego_encoder.transform(train['CTR_CATEGO_X'])
test['CTR_CATEGO_X'] = catego_encoder.transform(test['CTR_CATEGO_X'])

In [None]:
# Names of the training columns that have more than 17800 missing values, in column order.
missing_counts = train.isnull().sum()
cols_with_missing = missing_counts[missing_counts > 17800].index.tolist()

In [None]:
# Remove the sparsely populated columns from both frames, in place (train first, then test).
for frame in (train, test):
    frame.drop(columns=cols_with_missing, inplace=True)

In [None]:
# Keep the label and the test identifiers aside, then remove the non-feature columns so that
# train and test expose the same feature columns ('target' only exists in train).
target = train['target']   # label to predict
testID = test['id']        # identifiers needed to build the submission file

# `columns=` replaces the positional axis argument (`drop(labels, 1, ...)`), which was removed
# in pandas 2.0 and raises a TypeError there.
train = train.drop(columns=['id', 'target'])
test = test.drop(columns=['id'])

In [None]:
# Preview the first rows of the feature frame instead of rendering the whole DataFrame.
train.head()

### Fill NaNs with the column mean — not the best approach, but a simple and systematic one

In [None]:
# Columns whose missing values are replaced by the sentinel -1 and which are then stored as int16.
To_convert_to_int = ['CTR_OFODEP', 'CTR_OFODET', 'CTR_OBLAUT', 'CTR_OBLASS', 'CTR_ODTIMB',
                     'CTR_OBLTCL', 'CTR_OBLTHO', 'CTR_OBLDLI', 'CTR_OBLTVI']

# Same sentinel fill + cast for both frames.
for frame in (train, test):
    frame[To_convert_to_int] = frame[To_convert_to_int].fillna(-1).astype('int16')

# Fill every remaining NaN with the column mean.
# - The means are computed on the TRAINING frame only and reused for the test frame, so both
#   frames are imputed with the same values and no test-set statistic enters the pipeline.
# - numeric_only=True skips any non-numeric column; without it DataFrame.mean() raises a
#   TypeError on such columns with pandas >= 2.0.
train_means = train.mean(numeric_only=True)
train.fillna(train_means, inplace=True)
test.fillna(train_means, inplace=True)

# **Modeling : Cross-Validation-LGBM + Prediction**



In [None]:
# LightGBM parameters passed to lgb.train for every run of the training loop.
params = {
    'colsample_bytree': 0.85,
    'learning_rate': 0.03,
    'max_depth': 8,
    'n_estimators': 3000,
    'num_leaves': 150,
    'silent': False,
    'metric': 'rmse',
    'objective': 'regression',
}


In [None]:
# Set-up for the repeated train/validation runs.
n_iters = 7                     # number of random train/validation splits

preds_buf, err_buf = [], []     # per-run test predictions / per-run validation errors
X, y = train, target            # feature frame and label

# Columns handed to LightGBM as categorical features.
categ_features = [
    'BCT_CODBUR', 'CTR_MATFIS', 'FJU_CODFJU', 'CTR_CESSAT',
    'ACT_CODACT', 'CTR_OBLDIR', 'CTR_OBLACP', 'CTR_OBLRES',
    'CTR_OBLFOP', 'CTR_OBLTFP', 'CTR_OBLDCO', 'CTR_OBLTVA',
    'CTR_OFODEP', 'CTR_OFODET', 'CTR_OBLAUT', 'CTR_OBLASS',
    'CTR_ODTIMB', 'CTR_OBLTCL', 'CTR_OBLTHO', 'CTR_OBLDLI',
    'CTR_OBLTVI',
]


In [None]:
# Repeated hold-out validation: for each of the n_iters seeds, split off 15% of the training
# data, train LightGBM with early stopping on that hold-out, record the hold-out RMSE and
# predict the test set. The test predictions of all runs are averaged at the end.
# Predictions are clipped to [0, 100] -- presumably the valid range of the target; confirm.
for i in range(n_iters):
    x_train, x_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=i)
    d_train = lgb.Dataset(x_train, label=y_train, categorical_feature=categ_features)
    d_valid = lgb.Dataset(x_valid, label=y_valid, categorical_feature=categ_features)

    # Early stopping and periodic logging are passed as callbacks: the equivalent
    # `early_stopping_rounds=` / `verbose_eval=` keyword arguments were removed from lgb.train
    # in LightGBM 4.0 and raise a TypeError there.
    model = lgb.train(
        params,
        d_train,
        valid_sets=[d_train, d_valid],
        callbacks=[
            lgb.early_stopping(stopping_rounds=100),
            lgb.log_evaluation(period=100),
        ],
    )

    # Hold-out error at the best iteration found by early stopping.
    preds = np.clip(model.predict(x_valid, num_iteration=model.best_iteration), a_min=0, a_max=100)

    # This is the root mean squared error (no log transform is applied), so it is reported as
    # RMSE rather than RMSLE.
    err = sqrt(mean_squared_error(y_valid, preds))
    err_buf.append(err)
    print('RMSE = ' + str(err))

    # Test-set predictions of this run.
    preds = np.clip(model.predict(test, num_iteration=model.best_iteration), a_min=0, a_max=100)
    preds_buf.append(preds)

print('Mean RMSE = ' + str(np.mean(err_buf)) + ' +/- ' + str(np.std(err_buf)))
# Average the test predictions over all runs.
preds = np.mean(preds_buf, axis=0)

# **Create a submission**

In [None]:
# Assemble the submission: one row per test id with the averaged prediction clipped to [0, 100].
Submission_Cross_val_LGB = pd.DataFrame({
    'client_id': testID,
    'target': np.clip(preds, a_min=0, a_max=100),
})

# Quick sanity check on the range of the submitted values.
submitted_target = Submission_Cross_val_LGB['target']
for label, value in (
    ('min', submitted_target.min()),
    ('max', submitted_target.max()),
    ('mean : ', submitted_target.mean()),
):
    print(label, value)

Submission_Cross_val_LGB.to_csv('Submission_Cross_val_LGB_over_17800.csv', index=False)