In [1]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)
bank_dataset = pd.read_csv('bank-data.csv')
print(bank_dataset.head())

   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  duration  campaign  pdays  previous     poutcome   y  
0   may         mon       261         1    999         0  nonexistent  no  
1   may         mon       149         1    999         0  nonexistent  no  
2   may         mon       226         1    999         0  nonexistent  no  
3   may         mon       151         1    999         0  nonexistent  no  
4   may         mon       307         1    999         0  nonexistent  no  


In [2]:
print(bank_dataset.shape)

(41188, 16)


In [3]:
columns = list(bank_dataset.columns)
print(columns)

['age', 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']


In [4]:
print("Column datatypes: ")
print(bank_dataset.dtypes)

Column datatypes: 
age             int64
job            object
marital        object
education      object
default        object
housing        object
loan           object
contact        object
month          object
day_of_week    object
duration        int64
campaign        int64
pdays           int64
previous        int64
poutcome       object
y              object
dtype: object


In [5]:
print(bank_dataset.describe()) 

               age      duration      campaign         pdays      previous
count  41188.00000  41188.000000  41188.000000  41188.000000  41188.000000
mean      40.02406    258.285010      2.567593    962.475454      0.172963
std       10.42125    259.279249      2.770014    186.910907      0.494901
min       17.00000      0.000000      1.000000      0.000000      0.000000
25%       32.00000    102.000000      1.000000    999.000000      0.000000
50%       38.00000    180.000000      2.000000    999.000000      0.000000
75%       47.00000    319.000000      3.000000    999.000000      0.000000
max       98.00000   4918.000000     56.000000    999.000000      7.000000


In [6]:
print(bank_dataset.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   age          41188 non-null  int64 
 1   job          41188 non-null  object
 2   marital      41188 non-null  object
 3   education    41188 non-null  object
 4   default      41188 non-null  object
 5   housing      41188 non-null  object
 6   loan         41188 non-null  object
 7   contact      41188 non-null  object
 8   month        41188 non-null  object
 9   day_of_week  41188 non-null  object
 10  duration     41188 non-null  int64 
 11  campaign     41188 non-null  int64 
 12  pdays        41188 non-null  int64 
 13  previous     41188 non-null  int64 
 14  poutcome     41188 non-null  object
 15  y            41188 non-null  object
dtypes: int64(5), object(11)
memory usage: 5.0+ MB
None


In [7]:
print(bank_dataset.isnull().sum()) 


age            0
job            0
marital        0
education      0
default        0
housing        0
loan           0
contact        0
month          0
day_of_week    0
duration       0
campaign       0
pdays          0
previous       0
poutcome       0
y              0
dtype: int64


In [8]:
def countUnknownValues(y):
    count = 0
    for x in bank_dataset[y]:
        if x == "unknown":
            count = count  + 1
    return count



print(f'Housing: {countUnknownValues("housing")}\nLoan: {countUnknownValues("loan")}\nJob: {countUnknownValues("job")}\nMarital: {countUnknownValues("marital")}\nEducation: {countUnknownValues("education")}\nDefault: {countUnknownValues("default")}\nContact: {countUnknownValues("contact")}\nMonth: {countUnknownValues("month")}\nDay_of_week: {countUnknownValues("day_of_week")}\nPoutcome: {countUnknownValues("poutcome")}\nY: {countUnknownValues("y")}')


Housing: 990
Loan: 990
Job: 330
Marital: 80
Education: 1731
Default: 8597
Contact: 0
Month: 0
Day_of_week: 0
Poutcome: 0
Y: 0


In [9]:
def replace_unknown_with_mode(column_name):
    mode_value = bank_dataset[column_name].mode()[0]
    bank_dataset.replace({column_name: "unknown"}, mode_value, inplace=True)


replace_unknown_with_mode("job")
replace_unknown_with_mode("marital")
replace_unknown_with_mode("education")
replace_unknown_with_mode("default")
replace_unknown_with_mode("housing")
replace_unknown_with_mode("loan")

In [10]:
def countUnknownValues(y):
    count = 0
    for x in bank_dataset[y]:
        if x == "unknown":
            count = count  + 1
    return count



print(f'Housing: {countUnknownValues("housing")}\nLoan: {countUnknownValues("loan")}\nJob: {countUnknownValues("job")}\nMarital: {countUnknownValues("marital")}\nEducation: {countUnknownValues("education")}\nDefault: {countUnknownValues("default")}\nContact: {countUnknownValues("contact")}\nMonth: {countUnknownValues("month")}\nDay_of_week: {countUnknownValues("day_of_week")}\nPoutcome: {countUnknownValues("poutcome")}\nY: {countUnknownValues("y")}')


Housing: 0
Loan: 0
Job: 0
Marital: 0
Education: 0
Default: 0
Contact: 0
Month: 0
Day_of_week: 0
Poutcome: 0
Y: 0


In [11]:

def replaceWithBinaryValues(column_name):
    bank_dataset.replace({column_name: {"yes": 1, "no": 0}}, inplace = True)

columns = ["housing", "loan", "default", "y"]

for x in columns:
    replaceWithBinaryValues(x)
    print(f'Replaced Successfully')

Replaced Successfully
Replaced Successfully
Replaced Successfully
Replaced Successfully


In [12]:
print(bank_dataset.head())

   age        job  marital    education default housing loan    contact month  \
0   56  housemaid  married     basic.4y       0       0    0  telephone   may   
1   57   services  married  high.school       0       0    0  telephone   may   
2   37   services  married  high.school       0       1    0  telephone   may   
3   40     admin.  married     basic.6y       0       0    0  telephone   may   
4   56   services  married  high.school       0       0    1  telephone   may   

  day_of_week  duration  campaign  pdays  previous     poutcome  y  
0         mon       261         1    999         0  nonexistent  0  
1         mon       149         1    999         0  nonexistent  0  
2         mon       226         1    999         0  nonexistent  0  
3         mon       151         1    999         0  nonexistent  0  
4         mon       307         1    999         0  nonexistent  0  


bank_dataset['education'] = bank_dataset['education'].replace({'primary': 0, 'secondary': 1, 'tertiary': 2, 'unknown': 3})

bank_dataset['marital'] = bank_dataset['marital'].replace({'married': 1, 'single': 0, "divorced": 2})

bank_dataset['month'] = bank_dataset['month'].replace({'mar': 3, 'apr': 4, '})
print(bank_dataset.head())


In [14]:
# def convertToCsv(dataset,filename):
#     dataset.to_csv(filename, header = True, index = False)

# convertToCsv(bank_dataset, "exp1.csv")

In [16]:
# #Checking for duplicates
new_df = bank_dataset
print(new_df.duplicated().any()) 
print(new_df.duplicated())
print(new_df.shape) 

True
0        False
1        False
2        False
3        False
4        False
         ...  
41183    False
41184    False
41185    False
41186    False
41187    False
Length: 41188, dtype: bool
(41188, 16)


In [20]:
for x in new_df.columns:
    print(f'Column name: {x} : {new_df[x].unique()}')
    print("\n")

Column name: age : [56 57 37 40 45 59 41 24 25 29 35 54 46 50 39 30 55 49 34 52 58 32 38 44
 42 60 53 47 51 48 33 31 43 36 28 27 26 22 23 20 21 61 19 18 70 66 76 67
 73 88 95 77 68 75 63 80 62 65 72 82 64 71 69 78 85 79 83 81 74 17 87 91
 86 98 94 84 92 89]


Column name: job : ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'entrepreneur' 'student']


Column name: marital : [1 0 2]


Column name: education : ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'university.degree' 'illiterate']


Column name: default : [0 1]


Column name: housing : [0 1]


Column name: loan : [0 1]


Column name: contact : ['telephone' 'cellular']


Column name: month : ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']


Column name: day_of_week : ['mon' 'tue' 'wed' 'thu' 'fri']


Column name: duration : [ 261  149  226 ... 1246 1556 1868]


Column name: campaign : [ 1  2  3  4  5  6  7  8  9 10 11 12 13 19 