In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_excel('data.xlsx', index = False)
data[:5]

Unnamed: 0,Unique Id,Client Name,Account ID,Legal Entity,Currency,Payment Type,Paid Amount,Payment Date,Payment Status,Pending Amount,Comments
0,1,J P Morgan,101008,CitiBank New York Branch,GBP,Receive,45678,2020-03-11,Fully paid,0,Transaction complete
1,2,J P Morgan,105049,CitiBank New York Branch,USD,Receive,765327,2020-03-12,Partially Paid,1008,Waiting for pending amount
2,3,DEUTSCHE BANK,105050,CitiBank HongKong,CAD,Receive,637229,2020-03-13,Partially Paid,34566,Waiting for pending amount
3,4,DEUTSCHE BANK,105058,CitiBank New York Branch,USD,Receive,22345566,2020-03-14,Partially Paid,1000000,Waiting for pending amount
4,5,DEUTSCHE BANK,105059,CitiBank New York Branch,EUR,Receive,553322,2020-03-15,Rejected,553322,Payment Failed


In [4]:
data['Account ID'].describe(), data.shape

(count        20.000000
 mean     104901.450000
 std         917.150247
 min      101008.000000
 25%      105060.500000
 50%      105117.500000
 75%      105128.250000
 max      105170.000000
 Name: Account ID, dtype: float64, (20, 11))

In [5]:
import os

location = '/home/aditya/Documents/Project1-CITI-INTERNSHIP/Data'
files_list = os.listdir(location)
DATASET = data.drop(columns=['Unique Id'], axis = 1)
# DATASET.rename(columns={'ClientÂ  Name' : 'ClientName'}, inplace = True)

print(DATASET.columns)

for file in files_list:
    name = os.path.join(location, file)
    df = pd.read_excel(name, index = False)
    DATASET = pd.concat([DATASET, df], axis = 0, ignore_index=True)


Index(['Client Name', 'Account ID', 'Legal Entity', 'Currency', 'Payment Type',
       'Paid Amount', 'Payment Date', 'Payment Status', 'Pending Amount',
       'Comments'],
      dtype='object')


In [6]:
print(DATASET['Payment Status'].unique(), DATASET['Comments'].unique())

['Fully paid' 'Partially Paid' 'Rejected' 'Processing'] ['Transaction complete' 'Waiting for pending amount' 'Payment Failed'
 'Payment is in progress']


## Preprocessing

#### 1) Removing redundant ACCOUNT IDs.
#### 2) Aligning the Payment Status with Paid Amount and Comments.

    I WHEN PAYMENT STATUS = Fully paid, COMMENTS = 'Transaction Complete' & PENDING AMOUNT = 0
    II WHEN PAYMENT STATUS = Partially Paid, COMMENTS = 'Waiting for pending amount' 
    III WHEN PAYMENT STATUS = Rejected, COMMENTS = 'Payment Failed' & PENDING AMOUNT = PAID AMOUNT
    IV WHEN PAYMENT STATUS = Processing, COMMENTS = 'TPayment is in progress' & PENDING AMOUNT = PAID AMOUNT
    
#### 3) Resetting the index

In [7]:
DATASET.drop_duplicates(['Account ID'], inplace=True)

# FIRST CASE

DATASET.loc[DATASET["Payment Status"] == 'Fully paid', 'Pending Amount'] = 0
DATASET.loc[DATASET["Payment Status"] == 'Fully paid', 'Comments'] = 'Transaction complete'

# SECOND CASE

DATASET.loc[DATASET["Payment Status"] == 'Partially Paid', 'Comments'] = 'Waiting for pending amount'

# THIRD CASE

DATASET.loc[DATASET["Payment Status"] == 'Rejected', 'Pending Amount'] = DATASET.loc[DATASET["Payment Status"] == 'Rejected', 'Paid Amount']
DATASET.loc[DATASET["Payment Status"] == 'Rejected', 'Comments'] = 'Payment Failed'

# FOURTH CASE

DATASET.loc[DATASET["Payment Status"] == 'Processing', 'Pending Amount'] = DATASET.loc[DATASET["Payment Status"] == 'Processing', 'Paid Amount']
DATASET.loc[DATASET["Payment Status"] == 'Processing', 'Comments'] = 'Payment is in progress'

DATASET.reset_index(drop=True, inplace=True)

### Processing the Date Column

###### Converting the DATETIME datatype into string for usage later.

In [9]:
from datetime import datetime

DATASET['Payment Date'][20:] = DATASET['Payment Date'][20:].apply(lambda data : datetime.strptime(str(data), "%Y/%m/%d"))
DATASET['Payment Date'] = DATASET['Payment Date'].apply(lambda data : str(data)[:11])

type(DATASET['Payment Date'][11])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


str

In [10]:
DATASET.to_csv('dataset.csv')

In [11]:
questions1 = pd.read_pickle('/home/aditya/Downloads/questions.pkl')
questions2 = pd.read_pickle('/home/aditya/Downloads/q3.pkl')
solutions1 = pd.read_pickle('/home/aditya/Downloads/solution.pkl')
solutions2 = pd.read_pickle('/home/aditya/Downloads/s3.pkl')

In [12]:
questions1 = questions1.tolist()
questions = questions1 + questions2

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
CV = CountVectorizer(ngram_range=(1,1))

vec = CV.fit_transform(questions)

CV.vocabulary_

vec.shape

(19000, 2000)

In [15]:
pd.to_pickle(vec, "/home/aditya/Documents/Project1-CITI-INTERNSHIP/vec.pkl")
pd.to_pickle(vec, "/home/aditya/Documents/Project1-CITI-INTERNSHIP/dict.pkl")

In [17]:
CV.vocabulary_

{'who': 1996,
 'is': 1959,
 'the': 1987,
 'client': 1937,
 'with': 1998,
 'account': 1922,
 'id': 1955,
 '101008': 13,
 '105049': 892,
 '105050': 893,
 '105058': 898,
 '105059': 899,
 '105061': 900,
 '105071': 902,
 '105101': 916,
 '105102': 917,
 '105115': 920,
 '105126': 926,
 '105120': 922,
 '105121': 923,
 '105122': 924,
 '105125': 925,
 '105135': 929,
 '105140': 931,
 '105146': 932,
 '105150': 934,
 '105170': 936,
 '103443': 567,
 '102387': 325,
 '104466': 761,
 '101686': 166,
 '104461': 760,
 '104768': 833,
 '103354': 554,
 '101680': 164,
 '101568': 142,
 '103705': 616,
 '102454': 350,
 '102803': 432,
 '101677': 162,
 '104133': 700,
 '101926': 218,
 '104007': 681,
 '101860': 207,
 '105095': 914,
 '102032': 240,
 '103723': 621,
 '102509': 362,
 '102437': 345,
 '103255': 536,
 '104564': 783,
 '102691': 409,
 '101602': 148,
 '101738': 183,
 '103699': 613,
 '101472': 116,
 '102956': 462,
 '104795': 839,
 '101726': 179,
 '101161': 43,
 '104424': 753,
 '101371': 90,
 '102428': 341,
 '1