In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Content
1. [Import all the libraries.](#libraries)
2. [EDA.](#EDA)
3. [A curious case of where does the money go?](#data)

<a id='libraries'></a>

# Importing Libraries and loading up the data

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the training transactions into `df` and preview the first rows.
TRAIN_CSV = '/kaggle/input/plentina-challenge/transactions_train.csv'
df = pd.read_csv(TRAIN_CSV)
df.head()

<a id='EDA'></a>

# EDA

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the columns describe() picks by default.
summary_stats = df.describe()
summary_stats

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

In [None]:
# Skewness of each numeric column, most positively skewed first.
# Numeric columns are selected explicitly: the frame also holds string columns
# (e.g. 'type'), and pandas >= 2.0 raises on them instead of silently dropping
# them the way older versions did.
skew = df.select_dtypes(include='number').skew().sort_values(ascending=False)
skew_df = pd.DataFrame({'skew': skew})
skew_df.head(10)

In [None]:
# Variance of each numeric column, smallest first.
# Numeric columns are selected explicitly: the frame also holds string columns
# (e.g. 'type'), and pandas >= 2.0 raises on them instead of silently dropping
# them the way older versions did.
var = df.select_dtypes(include='number').var().sort_values(ascending=True)
var_df = pd.DataFrame({'var': var})
var_df.head(10)

In [None]:
# Pairwise relationship grid coloured by the fraud label.
# Only the first 50,000 rows are plotted -- presumably to keep rendering time manageable.
sns.pairplot(df.head(50000), hue='isFraud')

In [None]:
# Correlation heatmap of the numeric columns, with each coefficient annotated.
# Non-numeric columns are excluded up front: DataFrame.corr no longer drops them
# implicitly on pandas >= 2.0 and fails on the string columns (e.g. 'type').
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Number of transactions per transaction type.
sns.countplot(data=df, x='type')

In [None]:
# Bar chart of how many rows fall in each isFraud class.
df['isFraud'].value_counts().plot.bar()

In [None]:
# Line plot of oldbalanceOrig against the row index.
df['oldbalanceOrig'].plot(kind='line')

In [None]:
# Line plot of amount against the row index.
df['amount'].plot(kind='line')

In [None]:
# Line plot of newbalanceOrig against the row index.
df['newbalanceOrig'].plot(kind='line')

In [None]:
# Line plot of oldbalanceDest against the row index.
df['oldbalanceDest'].plot(kind='line')

In [None]:
# Line plot of newbalanceDest against the row index.
df['newbalanceDest'].plot(kind='line')

In [None]:
# Split the feature columns (everything except the isFraud target) into
# int64/float64 columns (num_col) and all remaining columns (cat_col).
numeric_dtypes = ['int64', 'float64']
predictors = df.drop(columns=['isFraud'])
num_col = predictors.select_dtypes(include=numeric_dtypes).columns
cat_col = predictors.select_dtypes(exclude=numeric_dtypes).columns
# Only the column indexes are needed; release the temporary copy of the frame.
del predictors

In [None]:
# Box plot of every numerical feature column on one wide figure.
fig = plt.figure(figsize=(18, 6))
# First colour of the viridis palette, as a hex string, used for all boxes.
v0 = sns.color_palette(palette='viridis').as_hex()[0]
sns.boxplot(data=df[num_col], color=v0, saturation=.5);
plt.xticks(fontsize=14)
plt.title('Box plot of train numerical columns', fontsize=16)

In [None]:
# Bar chart of the number of distinct values in each non-numeric column.
fig = plt.figure(figsize=(10, 5))
# Count the unique values once and reuse the result for both axes.
cat_nunique = df[cat_col].nunique()
sns.barplot(x=cat_nunique.index, y=cat_nunique.values, color='black', alpha=.5)
plt.xticks(rotation=0)
plt.title('Number of categorical unique values', fontsize=16);

In [None]:
# Pie chart of the share of each transaction type in the full data.
counts = df['type'].value_counts()
# Slice labels are the category levels of 'type'.
labels = list(df['type'].astype('category').cat.categories)
# Look the counts up by label so sizes line up with labels.
sizes = [counts.loc[category] for category in labels]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)  # autopct prints the percentage on each slice
ax1.axis('equal')
plt.show()

**Target Analysis**
- We have a seriously imbalanced dataset: less than 1% of the data is fraudulent and the rest is not.
- All the fraudulent transactions are of type cash_out or transfer, which is an interesting finding.

In [14]:
# Class counts of the target and the percentage of rows labelled as fraud (isFraud == 1).
fraud_counts = df['isFraud'].value_counts()
pct_fraud = (fraud_counts.loc[1] * 100) / len(df)
print(fraud_counts)
print(pct_fraud)

In [10]:
# Bar chart of the isFraud class counts, showing the imbalance.
df['isFraud'].value_counts().plot.bar()

In [7]:
# Bar chart of transaction types among the fraudulent rows only.
df.loc[df['isFraud'] == 1, 'type'].value_counts().plot.bar()

In [8]:
# Count of each transaction type among the fraudulent rows only.
df.loc[df['isFraud'].eq(1), 'type'].value_counts()

<a id='data'></a>

# A curious case of where does the money go?
I have observed a few interesting things about the data:
- Whenever an amount is transferred, the receiver's (nameDest) old balance (oldbalanceDest) should increase, i.e. newbalanceDest = oldbalanceDest + amount.
  - Conditions used to find violations -> (amount>0) and (newbalanceDest<oldbalanceDest)
- However, newbalanceDest is lower than oldbalanceDest for 1236956 rows, which is roughly 20% of the whole data. Among those, 27 are fraudulent and the rest are not. 96% of these transactions are of type cash_in.
- I dug a little deeper and made the condition stricter: this time the amount must also have been deducted from the sender's balance.
   - Conditions used -> (amount>0) and (newbalanceOrig<oldbalanceOrig) and
      (newbalanceDest<oldbalanceDest)

  This time I got 21819 rows, of which 27 are fraudulent, and most of the transactions are of type cash_out.
- Conclusion
    - There are 1236956 data points (20% of the whole data) where an amount was transferred but the receiver's balance went down instead of up.
      - Among those, 27 are fraudulent and the majority of the transactions are of type cash_in.
    - There are 21819 data points (well under 1% of the whole data, roughly 0.35%) where the amount was also deducted from the sender's account but the receiver's balance still went down.
      - Among those, 27 are fraudulent and the majority of the transactions are of type cash_out.

**where does the money go?**
       

**1st part of the observation**

In [5]:
# Rows with a positive amount where the destination balance went DOWN
# (newbalanceDest strictly below oldbalanceDest).
has_amount = df['amount'].gt(0)
dest_balance_fell = df['newbalanceDest'].lt(df['oldbalanceDest'])
um_df = df.loc[has_amount & dest_balance_fell]
um_df

In [None]:
# Percentage of all rows that have a positive amount but a decreased destination balance.
dest_dropped = (df['amount'] > 0) & (df['newbalanceDest'] < df['oldbalanceDest'])
# Summing the boolean mask counts the matching rows without copying them.
unmatched = int(dest_dropped.sum())
total = len(df)
pct = (unmatched * 100) / total
pct

In [None]:
# Fraud vs. non-fraud counts among the rows whose destination balance decreased
# despite a positive amount.
dest_fell_mask = (df['amount'] > 0) & (df['newbalanceDest'] < df['oldbalanceDest'])
df.loc[dest_fell_mask, 'isFraud'].value_counts()

In [None]:
# Pie chart of the share of each transaction type within um_df.
counts = um_df['type'].value_counts()
# Slice labels are the category levels of 'type' present in um_df.
labels = list(um_df['type'].astype('category').cat.categories)
# Look the counts up by label so sizes line up with labels.
sizes = [counts.loc[category] for category in labels]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)  # autopct prints the percentage on each slice
ax1.axis('equal')
plt.show()

In [None]:
# Bar chart of transaction-type counts within um_df.
um_df['type'].value_counts().plot.bar()

**2nd part of the observation**

In [None]:
# Stricter selection: positive amount, sender balance decreased, AND
# destination balance decreased as well.
positive_amount = df['amount'].gt(0)
sender_debited = df['newbalanceOrig'].lt(df['oldbalanceOrig'])
receiver_decreased = df['newbalanceDest'].lt(df['oldbalanceDest'])
um_df2 = df.loc[positive_amount & sender_debited & receiver_decreased]
um_df2

In [None]:
# Percentage of all rows that fall in the stricter um_df2 selection.
total = len(df)
unmatched2 = len(um_df2)
pct2 = (unmatched2 * 100) / total
pct2

In [None]:
# Bar chart of transaction-type counts within um_df2.
um_df2['type'].value_counts().plot.bar()

In [None]:
# Pie chart of transaction-type shares within um_df2.
um_df2['type'].value_counts().plot.pie()