In [1]:
import numpy as np
import pandas as pd

In [39]:
data = pd.read_excel('data.xlsx', index = False)
data[:5]

Unnamed: 0,Unique Id,Client Name,Account ID,Legal Entity,Currency,Payment Type,Paid Amount,Payment Date,Payment Status,Pending Amount,Comments
0,1,J P Morgan,101008,CitiBank New York Branch,GBP,Receive,45678,2020-03-11,Fully paid,0,Transaction complete
1,2,J P Morgan,105049,CitiBank New York Branch,USD,Receive,765327,2020-03-12,Partially Paid,1008,Waiting for pending amount
2,3,DEUTSCHE BANK,105050,CitiBank HongKong,CAD,Receive,637229,2020-03-13,Partially Paid,34566,Waiting for pending amount
3,4,DEUTSCHE BANK,105058,CitiBank New York Branch,USD,Receive,22345566,2020-03-14,Partially Paid,1000000,Waiting for pending amount
4,5,DEUTSCHE BANK,105059,CitiBank New York Branch,EUR,Receive,553322,2020-03-15,Rejected,553322,Payment Failed


In [40]:
data['Account ID'].describe(), data.shape

(count        20.000000
 mean     104901.450000
 std         917.150247
 min      101008.000000
 25%      105060.500000
 50%      105117.500000
 75%      105128.250000
 max      105170.000000
 Name: Account ID, dtype: float64, (20, 11))

In [41]:
import os

location = '/home/aditya/Documents/Project1-CITI-INTERNSHIP/Data'
files_list = os.listdir(location)
DATASET = data.drop(columns=['Unique Id'], axis = 1)
DATASET['Source'] = "0"
# DATASET.rename(columns={'ClientÂ  Name' : 'ClientName'}, inplace = True)

print(DATASET.columns)

for i, file in enumerate(files_list):
    name = os.path.join(location, file)
    df = pd.read_excel(name, index = False)
    df['Source'] = str(i+1)
    DATASET = pd.concat([DATASET, df], axis = 0, ignore_index=True)


Index(['Client Name', 'Account ID', 'Legal Entity', 'Currency', 'Payment Type',
       'Paid Amount', 'Payment Date', 'Payment Status', 'Pending Amount',
       'Comments', 'Source'],
      dtype='object')


In [42]:
print(DATASET['Payment Status'].unique(), DATASET['Comments'].unique())

['Fully paid' 'Partially Paid' 'Rejected' 'Processing'] ['Transaction complete' 'Waiting for pending amount' 'Payment Failed'
 'Payment is in progress']


## Preprocessing

#### 1) Removing redundant ACCOUNT IDs.
#### 2) Aligning the Payment Status with Paid Amount and Comments.

    I WHEN PAYMENT STATUS = Fully paid, COMMENTS = 'Transaction Complete' & PENDING AMOUNT = 0
    II WHEN PAYMENT STATUS = Partially Paid, COMMENTS = 'Waiting for pending amount' 
    III WHEN PAYMENT STATUS = Rejected, COMMENTS = 'Payment Failed' & PENDING AMOUNT = PAID AMOUNT
    IV WHEN PAYMENT STATUS = Processing, COMMENTS = 'TPayment is in progress' & PENDING AMOUNT = PAID AMOUNT
    
#### 3) Resetting the index

In [43]:
DATASET.drop_duplicates(['Account ID'], inplace=True)

# FIRST CASE

DATASET.loc[DATASET["Payment Status"] == 'Fully paid', 'Pending Amount'] = 0
DATASET.loc[DATASET["Payment Status"] == 'Fully paid', 'Comments'] = 'Transaction complete'

# SECOND CASE

DATASET.loc[DATASET["Payment Status"] == 'Partially Paid', 'Comments'] = 'Waiting for pending amount'

# THIRD CASE

DATASET.loc[DATASET["Payment Status"] == 'Rejected', 'Pending Amount'] = DATASET.loc[DATASET["Payment Status"] == 'Rejected', 'Paid Amount']
DATASET.loc[DATASET["Payment Status"] == 'Rejected', 'Comments'] = 'Payment Failed'

# FOURTH CASE

DATASET.loc[DATASET["Payment Status"] == 'Processing', 'Pending Amount'] = DATASET.loc[DATASET["Payment Status"] == 'Processing', 'Paid Amount']
DATASET.loc[DATASET["Payment Status"] == 'Processing', 'Comments'] = 'Payment is in progress'

DATASET.reset_index(drop=True, inplace=True)

### Processing the Date Column

###### Converting the DATETIME datatype into string for usage later.

In [44]:
from datetime import datetime

DATASET['Payment Date'][20:] = DATASET['Payment Date'][20:].apply(lambda data : datetime.strptime(str(data), "%Y/%m/%d"))
DATASET['Payment Date'] = DATASET['Payment Date'].apply(lambda data : str(data)[:11])
DATASET['Account ID'] = DATASET['Account ID'].apply(lambda data : str(data)[:11])

type(DATASET['Payment Date'][11])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


str

In [45]:
type(DATASET['Source'][24])

str

In [46]:
DATASET.to_csv('dataset.csv')

In [47]:
print(DATASET["Client Name"].unique())
print(DATASET["Legal Entity"].unique())
print(DATASET["Currency"].unique())
print(DATASET["Payment Type"].unique())
print(DATASET["Payment Status"].unique())
print(DATASET["Comments"].unique())

['J P Morgan' 'DEUTSCHE BANK' 'BNY Mellon' 'HSBC' 'Goldman Sachs'
 'Morgan Stanley' 'Wells Fargo' 'BNP Paribas']
['CitiBank New York Branch' 'CitiBank HongKong' 'CitiBank London'
 'CitiBank Singapore' 'CitiBank Bangaluru' 'CitiBank Pune']
['GBP' 'USD' 'CAD' 'EUR' 'INR']
['Receive' 'Deliver']
['Fully paid' 'Partially Paid' 'Rejected' 'Processing']
['Transaction complete' 'Waiting for pending amount' 'Payment Failed'
 'Payment is in progress']


In [48]:
type(DATASET['Account ID'][0])

str

In [49]:
DATASET[20:50]

Unnamed: 0,Client Name,Account ID,Legal Entity,Currency,Payment Type,Paid Amount,Payment Date,Payment Status,Pending Amount,Comments,Source
20,HSBC,103560,CitiBank Bangaluru,INR,Receive,2421,2020-02-23,Fully paid,0,Transaction complete,1
21,J P Morgan,104644,CitiBank Pune,INR,Receive,6879,2020-02-26,Processing,6879,Payment is in progress,1
22,Goldman Sachs,104136,CitiBank London,CAD,Deliver,6633,2020-02-24,Partially Paid,5987,Waiting for pending amount,1
23,Morgan Stanley,103256,CitiBank Bangaluru,USD,Receive,7332,2020-04-28,Processing,7332,Payment is in progress,1
24,Wells Fargo,104112,CitiBank Bangaluru,USD,Deliver,484,2020-04-20,Partially Paid,6030,Waiting for pending amount,1
25,Morgan Stanley,101369,CitiBank New York Branch,GBP,Receive,7332,2020-04-17,Fully paid,0,Transaction complete,1
26,BNY Mellon,102729,CitiBank Pune,GBP,Deliver,1667,2020-04-07,Partially Paid,9533,Waiting for pending amount,1
27,Goldman Sachs,103197,CitiBank London,EUR,Receive,622,2020-05-21,Fully paid,0,Transaction complete,1
28,Goldman Sachs,104425,CitiBank Singapore,EUR,Deliver,1778,2020-04-11,Fully paid,0,Transaction complete,1
29,Wells Fargo,103600,CitiBank London,USD,Deliver,5374,2020-05-05,Fully paid,0,Transaction complete,1


In [11]:
questions1 = pd.read_pickle('/home/aditya/Downloads/questions.pkl')
questions2 = pd.read_pickle('/home/aditya/Downloads/q3.pkl')
solutions1 = pd.read_pickle('/home/aditya/Downloads/solution.pkl')
solutions2 = pd.read_pickle('/home/aditya/Downloads/s3.pkl')

In [12]:
questions1 = questions1.tolist()
questions = questions1 + questions2

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer(ngram_range=(1,1))

vec = CV.fit_transform(questions)

CV.vocabulary_

vec.shape

(19000, 2000)

In [15]:
pd.to_pickle(vec, "/home/aditya/Documents/Project1-CITI-INTERNSHIP/vec.pkl")
pd.to_pickle(vec, "/home/aditya/Documents/Project1-CITI-INTERNSHIP/dict.pkl")