In [10]:
%run  Data_processing.ipynb
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from scipy import stats

In [11]:
# Show final data
final.head()

Unnamed: 0,Country,Code,ATMs per 1000 Adults,Starbucks Locations,GEI Score
0,Argentina,AR,58.841633,108.0,26.0
1,Australia,AU,160.13778,22.0,73.1
2,Austria,AT,119.094724,18.0,64.9
3,Azerbaijan,AZ,35.082586,4.0,32.1
4,Belgium,BE,93.653161,19.0,62.2


# 1- Linear Regression

In [12]:
# Subset data into features and labels 
x = final[["ATMs per 1000 Adults","Starbucks Locations"]]
y = final[["GEI Score"]]

# Standardizing the variables
stdsc = StandardScaler()
x_std = stdsc.fit_transform(x)
y_std = stdsc.fit_transform(y)

model = linear_model.LinearRegression(fit_intercept=True)
model.fit(x_std, y_std)

print("R^2 is", round(model.score(x_std, y_std)*100), "%")

R^2 is 22.0 %


R2 is a percentage of how close the data are to be fitted on a regression line so 22% is clearly a low score. However, this is can tell use that the data is non-linear and so the variables need to be adjusted or a new nonlinear algorithm needs to be used.

GEI scores are calculated by combining a chunk of elements. One of the elements is people's attitudes and perception towards entrepreneurship. In fact, human psychology is always very hard to predict and usually always has a low R2. However, this can be fixed in the other mode

In [13]:
# Renaming coloumns
x_std = pd.DataFrame(x_std)
x_std.columns = ["ATMs per 1000 Adults","Starbucks Locations"]

# OLS Regression Results using Scipy
res = sm.OLS(y_std, x_std).fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.223
Model:                            OLS   Adj. R-squared (uncentered):              0.188
Method:                 Least Squares   F-statistic:                              6.450
Date:                Wed, 14 Oct 2020   Prob (F-statistic):                     0.00344
Time:                        09:10:39   Log-Likelihood:                         -60.767
No. Observations:                  47   AIC:                                      125.5
Df Residuals:                      45   BIC:                                      129.2
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                           coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------

# Classification Model's outline
    Transform target variable into a binary outcome
    Split data into a training and a testing set
    Train model and evaluate score

Transform data for binary classification

In [14]:
# Convert the GEI Score into a binary output
final = final.sort_values(["GEI Score"], ascending=False)
final = final.reset_index(drop=True)

# Any country with GEI score above 42.2 is considered a potential country
final['GEI Score'] = final['GEI Score'].apply(lambda x: 1 if x >= 42.2 else 0)
final['GEI Score'] = final["GEI Score"].astype(bool)
final['GEI Score'] = final["GEI Score"].astype(int)

In [15]:
final.head()

Unnamed: 0,Country,Code,ATMs per 1000 Adults,Starbucks Locations,GEI Score
0,Switzerland,CH,97.611221,61.0,1
1,Canada,CA,221.126457,1468.0,1
2,Denmark,DK,54.014077,21.0,1
3,Australia,AU,160.13778,22.0,1
4,Netherlands,NL,53.492292,59.0,1


# Split data into testing and training sets

In [16]:
from sklearn.model_selection import train_test_split
X = final[["ATMs per 1000 Adults","Starbucks Locations"]]
y = final["GEI Score"]

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.33, 
                                                    random_state=42)

# Logistic Regression


In [17]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

print("Logistic Regression's accuracy is", round(logreg.score(X_test,y_test)*100),"%")

Logistic Regression's accuracy is 62.0 %


# What does that mean?

The logistic regression model was able to classify the test data correctly 44% of the time. Although this score will have       to be compared with the other models, its still considered a low score for a supervised learning algorithm.

More adjustments would need to be made to the model, for example, removing outliers, feature scaling or normalization.

# Naive Bayes

In [18]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)

print("Naive Baye's Accuracy is", round(gnb.score(X_test, y_test) * 100),"%")

Naive Baye's Accuracy is 69.0 %


# Random Forest 

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs = 2)
rf.fit(X_train, y_train)

print("Random Forests's Accuracy", round(rf.score(X_test,y_test) * 100), "%")

Random Forests's Accuracy 62.0 %


# Support Vector Machines

In [20]:
from sklearn.svm import SVC, LinearSVC                           
linear_svc = LinearSVC()
linear_svc.fit(X_train, y_train)

print("SVM's Accuracy", round(linear_svc.score(X_test,y_test) * 100, 2), "%")

SVM's Accuracy 43.75 %




In [22]:
total = X_train.isnull().sum().sort_values(ascending=False)
percent_1 = X_train.isnull().sum()/X_train.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head()

Unnamed: 0,Total,%
Starbucks Locations,0,0.0
ATMs per 1000 Adults,0,0.0


This indicates there is no missing data.

In [23]:
total = X_test.isnull().sum().sort_values(ascending=False)
percent_1 = X_test.isnull().sum()/X_test.isnull().count()*100
percent_2 = (round(percent_1, 1)).sort_values(ascending=False)
missing_data = pd.concat([total, percent_2], axis=1, keys=['Total', '%'])
missing_data.head()

Unnamed: 0,Total,%
Starbucks Locations,0,0.0
ATMs per 1000 Adults,0,0.0


In [25]:
from sklearn.model_selection import cross_val_score
gnb = GaussianNB()
scores = cross_val_score(gnb, X_train, y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.75       0.66666667 0.33333333 0.66666667 0.         0.66666667
 0.66666667 1.         0.33333333 0.33333333]
Mean: 0.5416666666666666
Standard Deviation: 0.2719528145346787


In [26]:
from sklearn.model_selection import cross_val_score
rf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(rf, X_train, y_train, cv=10, scoring = "accuracy")
print("Scores:", scores)
print("Mean:", scores.mean())
print("Standard Deviation:", scores.std())

Scores: [0.5        0.33333333 1.         1.         0.66666667 1.
 0.66666667 0.66666667 0.33333333 0.33333333]
Mean: 0.6499999999999999
Standard Deviation: 0.26299556396765833


From both Naive Bayes and Random Forest, the average accuracy of Random Forest is more than naive bayes with accuracy if 64% with a standard deviation of 3%.

# Evaluation

# Confusion Matrix

In [28]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(rf, X_train, y_train, cv=3)
confusion_matrix(y_train, predictions)

array([[ 6,  8],
       [ 6, 11]], dtype=int64)

**Precision and Recall:**

In [29]:
from sklearn.metrics import precision_score, recall_score

print("Precision:", precision_score(y_train, predictions))
print("Recall:",recall_score(y_train, predictions))

Precision: 0.5789473684210527
Recall: 0.6470588235294118


In [30]:
from sklearn.metrics import f1_score
f1_score(y_train, predictions)

0.6111111111111113