# GRANT ACQUISITION ANALYSIS

## The value propositions of the grant model are:
1. To increase research revenue in Lagos Business School through grant acquisition
2. To reduce the risk of grant application failure and minimise effort of application.

In [358]:
import pandas as pd

In [359]:
grant = pd.read_csv("LBS_Grant.csv")

In [360]:
grant

Unnamed: 0,GrantID,Grantor,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat,Grant
0,1,Sunref Nigeria & Access bank,energy,2022,312623.2,3.0,partnership,1
1,2,Ford Foundation and Innocent Chukwuma Social I...,sustainability,2021,1000000.12,5.0,long-term,1
2,3,Ford Foundation,sustainability,2017,25400.45,5.0,long-term,1
3,4,Templeton Charity Foundation,virtual reality,2021,234000.12,5.0,long-term,1
4,5,Meta,virtual reality,2022,500000.45,5.0,long-term,1
5,6,Bank of Industry,entrepreneurship,2022,1184834.23,5.0,partnership,1
6,7,Aspire Coronation Trust Foundation,sustainability,2021,71000.0,5.0,partnership,1
7,8,BRAC University,digital financial inclusion,2022,,1.0,partnership,1
8,9,SAMS,academia,2022,,1.0,partnership,1
9,10,CIPE,business ethics,2021,,1.0,partnership,1


In [362]:
grant.head(5)

Unnamed: 0,GrantID,Grantor,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat,Grant
0,1,Sunref Nigeria & Access bank,energy,2022,312623.2,3.0,partnership,1
1,2,Ford Foundation and Innocent Chukwuma Social I...,sustainability,2021,1000000.12,5.0,long-term,1
2,3,Ford Foundation,sustainability,2017,25400.45,5.0,long-term,1
3,4,Templeton Charity Foundation,virtual reality,2021,234000.12,5.0,long-term,1
4,5,Meta,virtual reality,2022,500000.45,5.0,long-term,1


In [363]:
grant.shape

(51, 8)

In [364]:
grant.columns #to check the columns or features

Index(['GrantID', 'Grantor', 'Rsch_Area', 'App_yr', 'Grant_Amt ($)',
       'Project_Tl', 'Fund_Cat', 'Grant'],
      dtype='object')

In [366]:
grant.isnull().sum() #to check for missing values

GrantID           0
Grantor           0
Rsch_Area         0
App_yr            0
Grant_Amt ($)    14
Project_Tl        8
Fund_Cat          0
Grant             0
dtype: int64

## Data Preprocessing

In [367]:
grant.isnull().sum() #to check for missing values

GrantID           0
Grantor           0
Rsch_Area         0
App_yr            0
Grant_Amt ($)    14
Project_Tl        8
Fund_Cat          0
Grant             0
dtype: int64

### Handling missing values by replacement with means

In [368]:
grant['Project_Tl'].fillna(value = grant['Project_Tl'].mean()) # For project timeline column

0      3.000000
1      5.000000
2      5.000000
3      5.000000
4      5.000000
5      5.000000
6      5.000000
7      1.000000
8      1.000000
9      1.000000
10     1.000000
11     1.000000
12     1.000000
13     1.000000
14    10.000000
15     6.000000
16     1.000000
17     1.000000
18     2.000000
19     1.000000
20     2.000000
21     2.000000
22     1.000000
23     1.000000
24     1.000000
25     1.000000
26     5.000000
27     1.000000
28     1.000000
29     3.000000
30     1.000000
31     1.000000
32     1.000000
33     3.000000
34     2.000000
35     5.000000
36     5.000000
37     1.000000
38     2.000000
39     2.000000
40     3.000000
41     3.000000
42     1.000000
43     2.534884
44     2.534884
45     2.534884
46     2.534884
47     2.534884
48     2.534884
49     2.534884
50     2.534884
Name: Project_Tl, dtype: float64

### Handling missing values by dropping - "Grant Amount" missing values

In [369]:
grant.dropna(axis =0, how="any", inplace=True)
grant.isna().sum()

GrantID          0
Grantor          0
Rsch_Area        0
App_yr           0
Grant_Amt ($)    0
Project_Tl       0
Fund_Cat         0
Grant            0
dtype: int64

# Features Engineering

In [401]:
grant_features = grant[['Rsch_Area', 'App_yr', 'Grant_Amt ($)','Project_Tl','Fund_Cat']]
grant_features

Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,energy,2022,312623.2,3.0,partnership
1,sustainability,2021,1000000.12,5.0,long-term
2,sustainability,2017,25400.45,5.0,long-term
3,virtual reality,2021,234000.12,5.0,long-term
4,virtual reality,2022,500000.45,5.0,long-term
5,entrepreneurship,2022,1184834.23,5.0,partnership
6,sustainability,2021,71000.0,5.0,partnership
10,women empowerment,2022,40199.12,1.0,contract
12,women empowerment,2022,54471.15,1.0,contract
13,digital financial inclusion,2022,25764.56,1.0,contract


**Features Definitions**
 - Rsch_Area = Research Area: subject of interest
     - The grantors determine the subject matter such as women empowerment, sustainability, etc.
 - App_yr = Application year: when the grant was secured.
 - Grant_Amt  = Grant Amount secured  in DOLLAR.
 - Project_Tl = Project timeline: time given to complete grant research.
 - Fund_Cat = Funding Category: category of grant: short-term, long-term, partnership or contract.

In [390]:
from sklearn.preprocessing import LabelEncoder # importing the Scikit-learn package

In [391]:
LabelEncoder

sklearn.preprocessing._label.LabelEncoder

In [392]:
le = LabelEncoder() #initializing LabelEncoder

In [402]:
grant_features.columns

Index(['Rsch_Area', 'App_yr', 'Grant_Amt ($)', 'Project_Tl', 'Fund_Cat'], dtype='object')

### Converting some features to numerical variables

In [410]:
le.fit_transform(grant_features['Rsch_Area']) # for research area
grant_features['Rsch_Area'] = le.fit_transform(grant_features['Rsch_Area'])
grant_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grant_features['Rsch_Area'] = le.fit_transform(grant_features['Rsch_Area'])


Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,8,6,312623.2,3.0,partnership
1,11,5,1000000.12,5.0,long-term
2,11,1,25400.45,5.0,long-term
3,12,5,234000.12,5.0,long-term
4,12,6,500000.45,5.0,long-term
5,9,6,1184834.23,5.0,partnership
6,11,5,71000.0,5.0,partnership
10,13,6,40199.12,1.0,contract
12,13,6,54471.15,1.0,contract
13,6,6,25764.56,1.0,contract


In [411]:
grant_features['Rsch_Area'].value_counts()

8     8
11    6
5     4
13    3
12    2
9     2
6     2
10    2
2     2
1     2
3     1
0     1
7     1
4     1
Name: Rsch_Area, dtype: int64

In [412]:
grant_features['App_yr'].value_counts()

5    11
6     8
1     7
4     5
0     3
2     2
3     1
Name: App_yr, dtype: int64

In [413]:
grant_features['Project_Tl'].value_counts()

1.0     16
5.0      8
2.0      6
3.0      5
10.0     1
6.0      1
Name: Project_Tl, dtype: int64

In [414]:
grant_features['Fund_Cat'].value_counts()

long-term      16
contract        9
short-term      8
partnership     4
Name: Fund_Cat, dtype: int64

In [415]:
grant_features['App_yr'] = le.fit_transform(grant_features['App_yr'])
grant_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grant_features['App_yr'] = le.fit_transform(grant_features['App_yr'])


Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,8,6,312623.2,3.0,partnership
1,11,5,1000000.12,5.0,long-term
2,11,1,25400.45,5.0,long-term
3,12,5,234000.12,5.0,long-term
4,12,6,500000.45,5.0,long-term
5,9,6,1184834.23,5.0,partnership
6,11,5,71000.0,5.0,partnership
10,13,6,40199.12,1.0,contract
12,13,6,54471.15,1.0,contract
13,6,6,25764.56,1.0,contract


In [416]:
grant_features['Fund_Cat'] = le.fit_transform(grant_features['Fund_Cat'])
grant_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grant_features['Fund_Cat'] = le.fit_transform(grant_features['Fund_Cat'])


Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,8,6,312623.2,3.0,2
1,11,5,1000000.12,5.0,1
2,11,1,25400.45,5.0,1
3,12,5,234000.12,5.0,1
4,12,6,500000.45,5.0,1
5,9,6,1184834.23,5.0,2
6,11,5,71000.0,5.0,2
10,13,6,40199.12,1.0,0
12,13,6,54471.15,1.0,0
13,6,6,25764.56,1.0,0


--- the target variable (grant) recorded as coded: 1 for yes and 0 for no ---

In [417]:
grant_target = grant["Grant"]
grant_target

0     1
1     1
2     1
3     1
4     1
5     1
6     1
10    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    0
34    0
36    0
38    0
39    1
40    1
41    1
42    1
Name: Grant, dtype: int64

### BUILDING MODEL: Splitting the dataset into test data and train data for building model

In [418]:
from sklearn.model_selection import train_test_split

In [420]:
grant_features # to reconfirm the features (train data)

Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,8,6,312623.2,3.0,2
1,11,5,1000000.12,5.0,1
2,11,1,25400.45,5.0,1
3,12,5,234000.12,5.0,1
4,12,6,500000.45,5.0,1
5,9,6,1184834.23,5.0,2
6,11,5,71000.0,5.0,2
10,13,6,40199.12,1.0,0
12,13,6,54471.15,1.0,0
13,6,6,25764.56,1.0,0


In [421]:
grant_target # to confirm the target variable (test data)

0     1
1     1
2     1
3     1
4     1
5     1
6     1
10    1
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
24    1
25    1
26    1
27    1
28    1
29    1
30    1
31    1
32    1
33    0
34    0
36    0
38    0
39    1
40    1
41    1
42    1
Name: Grant, dtype: int64

In [432]:
feature_train, feature_test, target_train, target_test = train_test_split(grant_features, grant_target, test_size=0.2, random_state=0)  #assigning train and test data to each variable, also fixing the random state to zero to avoid reshuffling of accuracy score.

In [433]:
target_train

2     1
29    1
34    0
38    0
42    1
40    1
33    0
39    1
12    1
17    1
5     1
21    1
18    1
10    1
31    1
1     1
16    1
30    1
28    1
6     1
27    1
4     1
22    1
25    1
23    1
13    1
41    1
3     1
0     1
Name: Grant, dtype: int64

### Using Support Vector Machine (SVM) for model binary classification

In [443]:
from sklearn.svm import SVC

In [444]:
classifier = SVC()

In [445]:
feature_train

Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
2,11,1,25400.45,5.0,1
29,0,3,4915.01,3.0,2
34,8,4,93204.21,2.0,1
38,11,4,45000.23,2.0,3
42,4,1,9657.11,1.0,3
40,13,1,8170.56,3.0,1
33,8,4,1142.1,3.0,1
39,9,4,22880.44,2.0,3
12,13,6,54471.15,1.0,0
17,8,5,10977.37,1.0,0


In [446]:
target_train

2     1
29    1
34    0
38    0
42    1
40    1
33    0
39    1
12    1
17    1
5     1
21    1
18    1
10    1
31    1
1     1
16    1
30    1
28    1
6     1
27    1
4     1
22    1
25    1
23    1
13    1
41    1
3     1
0     1
Name: Grant, dtype: int64

In [447]:
model = classifier.fit(feature_train,target_train) # fitting the model

In [448]:
prediction = model.predict(feature_test) # making prediction

In [449]:
from sklearn.metrics import accuracy_score # using ACCURACY SCORE METRIC

In [450]:
accuracy_score(target_test, prediction)

0.875

### To further check the model performance, the confusion matrix can be used.

In [453]:
from sklearn.metrics import confusion_matrix, classification_report

In [454]:
print(confusion_matrix(target_test, prediction))

[[0 1]
 [0 7]]


#### Interpreting the _confusion matrix_ above:
1. The model classified NO likely as likely
2. classified 7 Unlikely as Unlikely
3. classified 1 Likely as Unlikely - type II error (false -ve)
4. classified No unlikely as likely - type I error (false +ve)

__this shows a poor prediction power__ with _only 7 correct predictions, inspite of an accuracy of 87.5%_
**Optionally, the model can be improved by dimensionality reduction or hyperparametrization!**

In [456]:
print(classification_report(target_test, prediction)) # precision report

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.88      1.00      0.93         7

    accuracy                           0.88         8
   macro avg       0.44      0.50      0.47         8
weighted avg       0.77      0.88      0.82         8



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Hyperparameterization

In [490]:
classifier = SVC()

In [491]:
classifier = SVC(kernel ='poly') #polynomial kernel

In [492]:
model = classifier.fit(feature_train,target_train)

In [493]:
prediction = model.predict(feature_test)

In [494]:
accuracy_score(target_test, prediction)

0.875

**The model score is still 87.5%. This could be due to the randomization of my data proportions to 0, during data splitting.**

## ALTENATIVELY: BUILDING A REGRESSION MODEL

In [495]:
grant_features

Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,8,6,312623.2,3.0,2
1,11,5,1000000.12,5.0,1
2,11,1,25400.45,5.0,1
3,12,5,234000.12,5.0,1
4,12,6,500000.45,5.0,1
5,9,6,1184834.23,5.0,2
6,11,5,71000.0,5.0,2
10,13,6,40199.12,1.0,0
12,13,6,54471.15,1.0,0
13,6,6,25764.56,1.0,0


In [496]:
x = grant_features # assigned the selected features to x, without the target variable, 'Grant'

In [497]:
x

Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
0,8,6,312623.2,3.0,2
1,11,5,1000000.12,5.0,1
2,11,1,25400.45,5.0,1
3,12,5,234000.12,5.0,1
4,12,6,500000.45,5.0,1
5,9,6,1184834.23,5.0,2
6,11,5,71000.0,5.0,2
10,13,6,40199.12,1.0,0
12,13,6,54471.15,1.0,0
13,6,6,25764.56,1.0,0


In [498]:
y = grant_features["Rsch_Area"] # indexing research areas

**In research, it is believed that the research areas you are exploring determine your grant success**

In [499]:
y

0      8
1     11
2     11
3     12
4     12
5      9
6     11
10    13
12    13
13     6
14    10
15     6
16     8
17     8
18     8
19     8
20     8
21    10
22     5
23     5
24     5
25     5
26    11
27     2
28     3
29     0
30     1
31     2
32     1
33     8
34     8
36     7
38    11
39     9
40    13
41    11
42     4
Name: Rsch_Area, dtype: int64

In [500]:
x.corr() #checking for the assumption of multicollinearity via correlation

Unnamed: 0,Rsch_Area,App_yr,Grant_Amt ($),Project_Tl,Fund_Cat
Rsch_Area,1.0,0.337713,0.003524,0.420169,-0.450155
App_yr,0.337713,1.0,-0.174092,-0.136583,-0.535625
Grant_Amt ($),0.003524,-0.174092,1.0,0.442515,-0.039012
Project_Tl,0.420169,-0.136583,0.442515,1.0,-0.042527
Fund_Cat,-0.450155,-0.535625,-0.039012,-0.042527,1.0


_from the correlation matrix, Research area is positively correlated with most of other independent variables_

### Building the model

In [501]:
from sklearn.model_selection import train_test_split

In [502]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)

In [503]:
from sklearn.linear_model import LinearRegression #importing regression package

In [504]:
regressor = LinearRegression()

In [505]:
model = regressor.fit(x_train, y_train) # training the model

In [507]:
prediction = model.predict(x_test) # making prediction

In [508]:
Dictionary = {"Actual outcome": y_test, "Predicted outcome": prediction, "Error": y_test-prediction}

In [509]:
Dictionary

{'Actual outcome': 26    11
 24     5
 20     8
 14    10
 36     7
 32     1
 19     8
 15     6
 Name: Rsch_Area, dtype: int64,
 'Predicted outcome': array([11.        ,  5.        ,  8.        ,  9.99999999,  7.        ,
         1.        ,  8.        ,  5.99999995]),
 'Error': 26   -1.062238e-09
 24   -1.099976e-09
 20   -1.050353e-09
 14    8.395004e-09
 36   -7.363568e-10
 32   -1.135782e-09
 19   -1.123151e-09
 15    5.441908e-08
 Name: Rsch_Area, dtype: float64}

In [510]:
pd.DataFrame(Dictionary) #adding the Dictionary of predicted and actual values into DataFrame

Unnamed: 0,Actual outcome,Predicted outcome,Error
26,11,11.0,-1.062238e-09
24,5,5.0,-1.099976e-09
20,8,8.0,-1.050353e-09
14,10,10.0,8.395004e-09
36,7,7.0,-7.363568e-10
32,1,1.0,-1.135782e-09
19,8,8.0,-1.123151e-09
15,6,6.0,5.441908e-08


### Evaluating the regression model performance

In [513]:
from sklearn import metrics

In [514]:
pd.DataFrame(Dictionary) # Adding the prediction and actual outcomes into DataFrame

Unnamed: 0,Actual outcome,Predicted outcome,Error
26,11,11.0,-1.062238e-09
24,5,5.0,-1.099976e-09
20,8,8.0,-1.050353e-09
14,10,10.0,8.395004e-09
36,7,7.0,-7.363568e-10
32,1,1.0,-1.135782e-09
19,8,8.0,-1.123151e-09
15,6,6.0,5.441908e-08


In [515]:
metrics.r2_score(y_test, prediction)

1.0

**This shows a perfect correlation between the dependent variable (Grant) and the independent variable (Research Area).**

_It is to be noted that correlation does not imply causation._

### SUMMARY

- Both the Classification and Regression model can be used to predict the outcomes of grant application
     - To increase research revenue in Lagos Business School, &
     - To save application time and minimize risk of grant failure or rejection

**Chinonso Amamchukwu completed and submitted this end-to-end project on 22-Dec-2022**