# Credit - $R^2$ adjusted

In [4]:
import pandas as pd 
import statsmodels.formula.api as smf
import numpy as np

# Load the credit data and discard the column pandas labelled "Unnamed: 0"
# (presumably a row index that was written into the CSV -- verify against the file).
df = pd.read_csv("Credit.csv").drop(columns="Unnamed: 0")

# Preview the first rows.
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


Wir beginnen mit dem leeren Modell, fügen je eine Variable hinzu, berechnen jeweils den adjusted-$R^2$-Wert und wählen diejenige Variable aus, die den grössten adjusted-$R^2$-Wert liefert.

In [5]:
# Candidate predictors: every column except the response "Balance", kept in
# column order. A list (instead of a set) keeps the printed order identical
# after every kernel restart; the iteration order of a set of strings can
# change between sessions because of hash randomization. Later cells only
# call .remove(...) and iterate, which a list supports as well.
predictors = [column for column in df.columns if column != "Balance"]
selected = []

# Round 1: fit Balance ~ <candidate> for each candidate and print its
# adjusted R^2 (selected is still empty, so each model has one predictor).
for candidate in predictors:
    formula = "{} ~ {}".format("Balance", ' + '.join(selected+[candidate]))
    score = smf.ols(formula, data=df).fit().rsquared_adj
    print("{:<10}{}".format(candidate,score))

Rating    0.7452098462445804
Cards     0.004980917848186994
Student   0.06474609513806151
Ethnicity -0.0048178736277662
Income    0.2130048913136423
Limit     0.741875250785776
Gender    -0.0020502712240353382
Education -0.0024474105101712773
Age       -0.002509186691383336
Married   -0.002480293447217896


Dies ist hier Rating (die negativen Werte müssen hier nicht interessieren).

Wir fügen Rating zum Modell hinzu. Dann fügen wir wieder je eine der verbleibenden Variablen zum Modell hinzu und berechnen wieder den adjusted-$R^2$-Wert.

In [6]:
# Move "Rating" from the candidate pool into the model.
predictors.remove("Rating")
selected.append("Rating")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Cards     0.7462194430599269
Student   0.812911210164568
Ethnicity 0.7445334939427893
Income    0.8744888189724805
Limit     0.7446629109432299
Gender    0.7447595011109145
Education 0.7448926944465679
Age       0.7523031839384496
Married   0.7459766976776195


Wir wählen wieder die Variable mit dem grössten adjusted-$R^2$-Wert aus. Das ist hier Income.

In [7]:
# Move "Income" from the candidate pool into the model.
predictors.remove("Income")
selected.append("Income")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Cards     0.8741968933803232
Student   0.9494990734177157
Ethnicity 0.8746562503488495
Limit     0.8753013618810309
Gender    0.8741831949149261
Education 0.8744205890403014
Age       0.8752504838117935
Married   0.8753076105349855


Usw.

In [8]:
# Move "Student" from the candidate pool into the model.
predictors.remove("Student")
selected.append("Student")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Cards     0.9494982778794964
Ethnicity 0.9495107538886861
Limit     0.9517033272701592
Gender    0.9495169605811011
Education 0.9493879846987021
Age       0.9499056260936986
Married   0.949527912161671


In [9]:
# Move "Limit" from the candidate pool into the model.
predictors.remove("Limit")
selected.append("Limit")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Cards     0.9535788787498292
Ethnicity 0.9516706637673418
Gender    0.9517409816291695
Education 0.9516503264995197
Age       0.9520364506030811
Married   0.9516607130805078


In [10]:
# Move "Cards" from the candidate pool into the model.
predictors.remove("Cards")
selected.append("Cards")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Ethnicity 0.9535201161299285
Gender    0.9535973858351448
Education 0.953529040191287
Age       0.9539960984035273
Married   0.9534983309007002


In [11]:
# Move "Age" from the candidate pool into the model.
predictors.remove("Age")
selected.append("Age")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Ethnicity 0.9539001369612248
Gender    0.9540098163629884
Education 0.9539420006322936
Married   0.9539445218509882


Wenn wir Gender hinzufügen, ist der adjusted-$R^2$-Wert 0.95401. Fügen wir noch eine Variable dazu, dann wird der adjusted-$R^2$-Wert _kleiner_. Also brechen wir das Verfahren ab. Die restlichen Variablen werden nicht mehr berücksichtigt.

In [12]:
# Move "Gender" from the candidate pool into the model.
predictors.remove("Gender")
selected.append("Gender")

# Score each remaining candidate when added to the current model.
for candidate in predictors:
    terms = selected + [candidate]
    formula = "{} ~ {}".format("Balance", ' + '.join(terms))
    fit = smf.ols(formula, data=df).fit()
    score = fit.rsquared_adj
    print("{:<10}{}".format(candidate, score))

Ethnicity 0.953919880388569
Education 0.9539575227550161
Married   0.953954956694095


Hier noch eine Variante, die das ganze Verfahren automatisiert.

In [12]:
import statsmodels.formula.api as smf

def forward_selected(data, response):
    """Linear model designed by greedy forward selection.

    Starting with no predictors, each round fits one OLS model per remaining
    candidate column (the predictors chosen so far plus that candidate) and
    keeps the candidate with the highest adjusted R-squared. The search ends
    when the best candidate no longer raises the adjusted R-squared above the
    current value, or when no candidates are left.

    Parameters:
    -----------
    data : pandas DataFrame with all possible predictors and response

    response: string, name of response column in data

    Returns:
    --------
    model: an "optimal" fitted statsmodels linear model
           with an intercept
           selected by forward selection
           evaluated by adjusted R-squared;
           the intercept-only model ``response ~ 1`` when no predictor
           achieves a positive adjusted R-squared or ``data`` holds no
           column besides the response

    Raises:
    -------
    KeyError: if ``response`` is not a column of ``data``
    """
    if response not in data.columns:
        raise KeyError(response)

    # Unique candidate names in column order, so every run evaluates the
    # candidates in the same sequence.
    remaining = list(dict.fromkeys(
        column for column in data.columns if column != response))
    selected = []
    # Baseline to beat: 0.0 is the adjusted R-squared of the intercept-only model.
    current_score = 0.0

    while remaining:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {}".format(response,
                                       ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().rsquared_adj
            scores_with_candidates.append((score, candidate))

        # Highest score wins; ties fall to the lexicographically largest name.
        best_new_score, best_candidate = max(scores_with_candidates)
        if best_new_score <= current_score:
            # No candidate improves the model any further: stop searching.
            break

        remaining.remove(best_candidate)
        selected.append(best_candidate)
        current_score = best_new_score

    # An empty right-hand side is not a valid formula, so fall back to the
    # intercept-only model when nothing was selected.
    right_hand_side = ' + '.join(selected) if selected else "1"
    formula = "{} ~ {}".format(response, right_hand_side)
    model = smf.ols(formula, data).fit()
    return model

# Run the automated forward selection on the credit data with "Balance" as
# the response, then show the formula of the resulting model.
model = forward_selected(data=df, response="Balance")
print(model.model.formula)

Balance ~ Rating + Income + Student + Limit + Cards + Age + Gender
