In [4]:
# remove warnings
import warnings
# filesystem paths
from pathlib import Path

# data reading, writing, exploration, cleaning
import pandas as pd
# array creation and operation
import numpy as np 
# plotting library
import matplotlib.pyplot as plt 
# data visualization library
import seaborn as sns

warnings.filterwarnings('ignore')

In [5]:
# read the payments CSV into a DataFrame and show it
# NOTE(review): machine-specific absolute path - adjust when running elsewhere
payments_csv = "C:/Users/Dell/OneDrive/Desktop/Data Engineer Project/Project-1/payment_dataset.csv"
df = pd.read_csv(filepath_or_buffer=payments_csv)
df

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,70b7e94ea46d3e8b5bc12a50186edaf0,1,credit_card,24,274840.0
1,859f516f2fc3f95772e63c5757ab0d5b,1,credit_card,24,609560.0
2,ff36cbc44b8f228e0449c92ef089c843,1,credit_card,24,756490.0
3,2b7dbe9be72b8f9733844c31055c0825,1,credit_card,24,345390.0
4,6ae2e8b8fac02522481d2a2f4ca4412c,1,credit_card,24,433430.0
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,blipay,1,363310.0
103882,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47770.0
103883,28bbae6599b09d39ca406b747b6632b1,1,blipay,1,191580.0
103884,744bade1fcf9ff3f31d860ace076d422,2,credit_card,0,58690.0


In [6]:
# column labels of the loaded frame
df.columns

Index(['order_id', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value'],
      dtype='object')

In [7]:
# per-column dtype and non-null count, plus overall memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103886 entries, 0 to 103885
Data columns (total 5 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   order_id              103886 non-null  object 
 1   payment_sequential    103886 non-null  int64  
 2   payment_type          103886 non-null  object 
 3   payment_installments  103886 non-null  int64  
 4   payment_value         103886 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 4.0+ MB


In [8]:
# change of datatype: payment_value float64 -> integer
# Use an explicit 64-bit type: plain `int` resolves to the platform C long,
# which is only 32 bits on Windows with NumPy < 2.
# NOTE: the cast truncates toward zero, so any fractional part is dropped.
df['payment_value'] = df['payment_value'].astype('int64')
df

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,70b7e94ea46d3e8b5bc12a50186edaf0,1,credit_card,24,274840
1,859f516f2fc3f95772e63c5757ab0d5b,1,credit_card,24,609560
2,ff36cbc44b8f228e0449c92ef089c843,1,credit_card,24,756490
3,2b7dbe9be72b8f9733844c31055c0825,1,credit_card,24,345390
4,6ae2e8b8fac02522481d2a2f4ca4412c,1,credit_card,24,433430
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,blipay,1,363310
103882,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47770
103883,28bbae6599b09d39ca406b747b6632b1,1,blipay,1,191580
103884,744bade1fcf9ff3f31d860ace076d422,2,credit_card,0,58690


In [9]:
# statistical summary of the numerical columns
df.describe()

Unnamed: 0,payment_sequential,payment_installments,payment_value
count,103886.0,103886.0,103886.0
mean,1.092679,2.853349,154100.4
std,0.706584,2.687051,217494.1
min,1.0,0.0,0.0
25%,1.0,1.0,56790.0
50%,1.0,1.0,100000.0
75%,1.0,4.0,171837.5
max,29.0,24.0,13664080.0


In [10]:
# number of missing values in each column
null_counts = df.isnull().sum()
null_counts

order_id                0
payment_sequential      0
payment_type            0
payment_installments    0
payment_value           0
dtype: int64

In [11]:
# how often each payment method occurs (most frequent first)
payment_type_counts = df['payment_type'].value_counts()
payment_type_counts

credit_card    76795
blipay         19784
voucher         5775
debit_card      1529
not_defined        3
Name: payment_type, dtype: int64

In [12]:
# show the frame before the 'not_defined' rows are removed
df

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,70b7e94ea46d3e8b5bc12a50186edaf0,1,credit_card,24,274840
1,859f516f2fc3f95772e63c5757ab0d5b,1,credit_card,24,609560
2,ff36cbc44b8f228e0449c92ef089c843,1,credit_card,24,756490
3,2b7dbe9be72b8f9733844c31055c0825,1,credit_card,24,345390
4,6ae2e8b8fac02522481d2a2f4ca4412c,1,credit_card,24,433430
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,blipay,1,363310
103882,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47770
103883,28bbae6599b09d39ca406b747b6632b1,1,blipay,1,191580
103884,744bade1fcf9ff3f31d860ace076d422,2,credit_card,0,58690


In [13]:
# keep only the rows whose payment method is something other than 'not_defined'
has_known_type = df['payment_type'] != 'not_defined'
df = df[has_known_type]

In [14]:
# show the frame after the 'not_defined' filter
df

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,70b7e94ea46d3e8b5bc12a50186edaf0,1,credit_card,24,274840
1,859f516f2fc3f95772e63c5757ab0d5b,1,credit_card,24,609560
2,ff36cbc44b8f228e0449c92ef089c843,1,credit_card,24,756490
3,2b7dbe9be72b8f9733844c31055c0825,1,credit_card,24,345390
4,6ae2e8b8fac02522481d2a2f4ca4412c,1,credit_card,24,433430
...,...,...,...,...,...
103881,0406037ad97740d563a178ecc7a2075c,1,blipay,1,363310
103882,32609bbb3dd69b3c066a6860554a77bf,1,credit_card,1,47770
103883,28bbae6599b09d39ca406b747b6632b1,1,blipay,1,191580
103884,744bade1fcf9ff3f31d860ace076d422,2,credit_card,0,58690


In [88]:
# recount payment types; 'not_defined' should no longer be listed
df['payment_type'].value_counts()

credit_card    76795
blipay         19784
voucher         5775
debit_card      1529
Name: payment_type, dtype: int64

In [15]:
# column labels of the frame that is about to be exported
df.columns

Index(['order_id', 'payment_sequential', 'payment_type',
       'payment_installments', 'payment_value'],
      dtype='object')

In [89]:
# Export the processed payments frame to CSV (row index omitted from the file).
# pandas raises OSError when the target folder is missing, so create it first.
output_csv = Path('C:/Users/Dell/OneDrive/Desktop/Data Engineer Project/Project-1/Original dataset/payment.csv')
output_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_csv, index=False)