## Import Packages

In [14]:
# Import Libraries
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

## Read Dataset

In [36]:
data=pd.read_csv('/content/Social Development Bank Loans For 2019.csv')
data.head()

Unnamed: 0,ID,bank branch,funding type,funding classification,customer sector,financing value,installment value,cashing date,sex,age,social status,special needs,number of family members,saving loan,income
0,1.0,Tabūk,social,family,government employee,60000.0,>= 1000,2019/02,MALE,>= 30,married,No,>= 05,No,< 5000
1,2.0,Hail,project,solution,,160000.0,>= 1000,2019/01,MALE,< 30,single,No,< 02,No,< 5000
2,3.0,Tabūk,social,marriage,government employee,60000.0,>= 1000,2019/02,MALE,< 30,married,No,>= 02,No,>= 7500
3,4.0,Medina,social,marriage,employee of a government company,60000.0,< 1000,2019/03,MALE,< 30,married,No,>= 10,No,>= 5000
4,5.0,Medina,social,family,private sector employee,60000.0,>= 1000,2019/02,FEMALE,>= 30,divorced,No,>= 02,No,>= 10000


In [37]:
print('Shape of dataset is ', data.shape)

Shape of dataset is  (11175, 15)


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11175 entries, 0 to 11174
Data columns (total 15 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   ID                        11175 non-null  float64
 1   bank branch               11175 non-null  object 
 2   funding type              11175 non-null  object 
 3   funding classification    11175 non-null  object 
 4   customer sector           7225 non-null   object 
 5   financing value           11175 non-null  float64
 6   installment value         11175 non-null  object 
 7   cashing date              11175 non-null  object 
 8   sex                       11175 non-null  object 
 9   age                       11169 non-null  object 
 10  social status             11175 non-null  object 
 11  special needs             11175 non-null  object 
 12  number of family members  11132 non-null  object 
 13  saving loan               11175 non-null  object 
 14  income

### Data Preprocessing

#### Handle missing values

In [39]:
#Checking for the null values in the dataset
data.isnull().sum()

ID                             0
bank branch                    0
funding type                   0
funding classification         0
customer sector             3950
financing value                0
installment value              0
cashing date                   0
sex                            0
age                            6
social status                  0
special needs                  0
number of family members      43
saving loan                    0
income                       114
dtype: int64

In [40]:
# Fill in the Missing Values using the Simple Imputer with the Most Frequent strategy
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer = imputer.fit(data[['customer sector', 'income', 'number of family members', 'age']])
data[['customer sector', 'income','number of family members', 'age']] = imputer.transform(
    data[['customer sector', 'income', 'number of family members', 'age']])

In [41]:
# Check if missing values is filling in each column 
print('Customer sector missing values after handling = ', data['customer sector'].isnull().sum())
print('Income missing values after handling = ', data['income'].isnull().sum())
print('Number of family members missing values after handling = ', data['number of family members'].isnull().sum())
print('Age missing values after handling = ', data['age'].isnull().sum())

Customer sector missing values after handling =  0
Income missing values after handling =  0
Number of family members missing values after handling =  0
Age missing values after handling =  0


#### Delete unimportant columns

In [42]:
# Delete the unneeded coulmns.
data.drop(['ID'], axis=1, inplace=True)
data.drop(['cashing date'], axis=1, inplace=True)
data.drop(['social status'], axis=1, inplace=True) 
data.drop(['special needs'], axis=1, inplace=True) 
data.head(5)

Unnamed: 0,bank branch,funding type,funding classification,customer sector,financing value,installment value,sex,age,number of family members,saving loan,income
0,Tabūk,social,family,government employee,60000.0,>= 1000,MALE,>= 30,>= 05,No,< 5000
1,Hail,project,solution,government employee,160000.0,>= 1000,MALE,< 30,< 02,No,< 5000
2,Tabūk,social,marriage,government employee,60000.0,>= 1000,MALE,< 30,>= 02,No,>= 7500
3,Medina,social,marriage,employee of a government company,60000.0,< 1000,MALE,< 30,>= 10,No,>= 5000
4,Medina,social,family,private sector employee,60000.0,>= 1000,FEMALE,>= 30,>= 02,No,>= 10000


## MapReduce Part

In [43]:
!pip install mrjob

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Convert the dataframe into text file

In [44]:
data.to_csv("/content/SocialBankLoans.txt", index = False)

### 1st MapReduce

#### We create a MapReduce class to count the occurrence of each loan type in order to find the most requested type from the social development bank loans.

#### Create a text file containing one column from the full data text

In [45]:
# funding Types text file to find the most requested types of loan 
data['funding type'].to_csv("/content/fundingType.txt", index = False)

#### Create the Class

In [48]:
%%file LoanTypesCount.py
# %%file is an Ipython magic function that saves the code cell as a file

from mrjob.job import MRJob # import the mrjob.job library
from mrjob.step import MRStep # import the mrjob.step library

class SA_LoanTypesCount(MRJob):
    def steps(self):
      return [
          MRStep(mapper=self.mapper_get_loanType,
                 reducer=self.reducer_count_loanTypes)
      ]
    

    def mapper_get_loanType(self, _, loan_type):
        # output each line as a tuple of (loan_type, 1) 
        yield (loan_type, 1)

    # the reduce step: combine all tuples with the same key. In this case, the key is the loan
    # then sum all the values of the tuple, which will give the most loann type requested
    def reducer_count_loanTypes(self, key, values):
        yield (key, sum(values))
        
if __name__ == "__main__":
  SA_LoanTypesCount.run()

Overwriting LoanTypesCount.py


In [49]:
!python LoanTypesCount.py fundingType.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/LoanTypesCount.root.20221130.110600.038980
Running step 1 of 1...
job output is in /tmp/LoanTypesCount.root.20221130.110600.038980/output
Streaming final output from /tmp/LoanTypesCount.root.20221130.110600.038980/output...
"funding type"	1
"project"	656
"social"	10384
"transfer"	135
Removing temp directory /tmp/LoanTypesCount.root.20221130.110600.038980...


**We can conclude from these results that the most requested loan type is the social type, with 10384 occurrences.**

### 2nd MapReduce

#### MapReduce is also used to find the specific loan type that is most commonly requested within social loans.

#### Create a text file containing one column from the full data text

In [50]:
# funding classification text file to find the exact classification of the most requested loan.  
data['funding classification'].to_csv("/content/fundingClassification.txt", index = False)

#### Create the Class

In [51]:
%%file ClassOfLoanCount.py
# %%file is an Ipython magic function that saves the code cell as a file

from mrjob.job import MRJob # import the mrjob.job library
from mrjob.step import MRStep # import the mrjob.step library

class SA_ClassOfLoanCount(MRJob):
    def steps(self):
      return [
          MRStep(mapper=self.mapper_get_ClassOfLoan,
                 reducer=self.reducer_count_ClassOfLoans)
      ]
    

    def mapper_get_ClassOfLoan(self, _, loan_class):
        # output each line as a tuple of (loan_class, 1) 
        yield (loan_class, 1)

    # the reduce step: combine all tuples with the same key. In this case, the key is the loanClass
    # then sum all the values of the tuple, which will give the most loan class requested
    def reducer_count_ClassOfLoans(self, key, values):
        yield (key, sum(values))
        
if __name__ == "__main__":
  SA_ClassOfLoanCount.run()

Writing ClassOfLoanCount.py


In [52]:
!python ClassOfLoanCount.py fundingClassification.txt

No configs found; falling back on auto-configuration
No configs specified for inline runner
Creating temp directory /tmp/ClassOfLoanCount.root.20221130.110623.485606
Running step 1 of 1...
job output is in /tmp/ClassOfLoanCount.root.20221130.110623.485606/output
Streaming final output from /tmp/ClassOfLoanCount.root.20221130.110623.485606/output...
"emerging"	458
"excellence"	6
"family"	3273
"private"	7
"renovation"	103
"solution"	161
"taxi cab"	135
"telecom"	4
"food trucks"	7
"fresh graduate"	19
"funding classification"	1
"invention"	1
"marriage"	7001
Removing temp directory /tmp/ClassOfLoanCount.root.20221130.110623.485606...


**Based on our results, marriage has been the most commonly requested classification for a social loan.**