In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

### Wrangle

#### Acquire

In [None]:
# Load the semicolon-delimited CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='student/student-mat.csv', sep=';')

#### Summarize

In [4]:
# Print each column's name, non-null count and dtype for the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

#### Numeric Columns

In [8]:
# Summary statistics of the numeric columns, transposed so each column is a row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,395.0,16.696203,1.276043,15.0,16.0,17.0,18.0,22.0
Medu,395.0,2.749367,1.094735,0.0,2.0,3.0,4.0,4.0
Fedu,395.0,2.521519,1.088201,0.0,2.0,2.0,3.0,4.0
traveltime,395.0,1.448101,0.697505,1.0,1.0,1.0,2.0,4.0
studytime,395.0,2.035443,0.83924,1.0,1.0,2.0,2.0,4.0
failures,395.0,0.334177,0.743651,0.0,0.0,0.0,0.0,3.0
famrel,395.0,3.944304,0.896659,1.0,4.0,4.0,5.0,5.0
freetime,395.0,3.235443,0.998862,1.0,3.0,3.0,4.0,5.0
goout,395.0,3.108861,1.113278,1.0,2.0,3.0,4.0,5.0
Dalc,395.0,1.481013,0.890741,1.0,1.0,1.0,2.0,5.0


#### Object Columns

In [6]:
# Boolean array with one entry per column: True where the dtype is object
mask = (df.dtypes == 'object').to_numpy()
mask

array([ True,  True, False,  True,  True,  True, False, False,  True,
        True,  True,  True, False, False, False,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False, False, False, False, False, False])

In [10]:
# Keep only the columns flagged True in the boolean mask (the object columns)
obj_df = df.loc[:, mask]

In [12]:
# Print the frequency of every distinct value, one object column at a time
for name, series in obj_df.items():
    print(series.value_counts(), '\n')

GP    349
MS     46
Name: school, dtype: int64 

F    208
M    187
Name: sex, dtype: int64 

U    307
R     88
Name: address, dtype: int64 

GT3    281
LE3    114
Name: famsize, dtype: int64 

T    354
A     41
Name: Pstatus, dtype: int64 

other       141
services    103
at_home      59
teacher      58
health       34
Name: Mjob, dtype: int64 

other       217
services    111
teacher      29
at_home      20
health       18
Name: Fjob, dtype: int64 

course        145
home          109
reputation    105
other          36
Name: reason, dtype: int64 

mother    273
father     90
other      32
Name: guardian, dtype: int64 

no     344
yes     51
Name: schoolsup, dtype: int64 

yes    242
no     153
Name: famsup, dtype: int64 

no     214
yes    181
Name: paid, dtype: int64 

yes    201
no     194
Name: activities, dtype: int64 

yes    314
no      81
Name: nursery, dtype: int64 

yes    375
no      20
Name: higher, dtype: int64 

yes    329
no      66
Name: internet, dtype: int64 

no    

#### Dummy Variables

In [13]:
# create df with new dummy vars
# Encode the object columns as dummy variables: drop_first drops the first
# level of each column, dummy_na=False adds no indicator for missing values
dummy_df = pd.get_dummies(data=obj_df, drop_first=True, dummy_na=False)

In [14]:
# concatenate the df with dummies to our original df
# via column (axis=1)
# Append the dummy columns next to the existing columns (column-wise concat)
df = pd.concat(objs=[df, dummy_df], axis='columns')

In [15]:
# Remove the original object columns from df.
# Reassigning (instead of inplace=True) with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns=obj_df.columns, errors='ignore')

#### Split
Split data into train, validate, test

In [18]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set
train_validate, test = train_test_split(df, random_state=123, test_size=.2)

# Of the remaining rows, 30% become validate and the rest train
train, validate = train_test_split(
    train_validate, random_state=123, test_size=.3
)

#### Split into X and y dataframes
- y = G3

In [21]:
# Features: every column except the target G3, for each of the three splits
X_train, X_validate, X_test = (
    part.drop(columns=['G3']) for part in (train, validate, test)
)

# Target: G3 kept as a single-column DataFrame, for each of the three splits
y_train, y_validate, y_test = (
    part[['G3']] for part in (train, validate, test)
)

#### Explore

#### Scale

In [22]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training features only
scaler = MinMaxScaler(copy=True)
scaler.fit(X_train)

# Apply the train-fitted scaler to all three splits
X_train_scaled, X_validate_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_validate, X_test)
)

#### Feature Selection
1. SelectKBest
2. RFE: Recursive Feature Elimination

In [26]:
# Wrap the scaled values in a DataFrame that reuses X_train's column names
# and row labels
X_train_scaled = pd.DataFrame(
    data=X_train_scaled,
    index=X_train.index.values,
    columns=X_train.columns.values,
)
X_train_scaled

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,...,guardian_mother,guardian_other,schoolsup_yes,famsup_yes,paid_yes,activities_yes,nursery_yes,higher_yes,internet_yes,romantic_yes
142,0.000000,1.00,1.00,0.000000,0.666667,0.000000,0.75,0.25,0.25,0.00,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
326,0.333333,0.75,0.75,0.000000,0.000000,0.000000,0.75,0.50,1.00,0.50,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
88,0.166667,0.50,0.50,0.333333,0.333333,0.333333,0.75,0.75,0.25,0.00,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0
118,0.333333,0.25,0.75,0.666667,0.333333,0.333333,1.00,0.25,0.75,0.00,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
312,0.666667,0.25,0.50,0.000000,0.333333,0.333333,0.75,1.00,0.25,0.25,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.333333,0.50,0.25,0.333333,0.666667,0.000000,0.50,0.25,0.50,0.00,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
61,0.166667,0.25,0.25,1.000000,0.000000,0.000000,1.00,1.00,1.00,1.00,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
38,0.000000,0.75,1.00,0.000000,0.666667,0.000000,0.75,0.50,0.25,0.00,...,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
243,0.166667,1.00,1.00,0.000000,0.000000,0.000000,1.00,0.50,0.25,0.00,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0


In [27]:
from sklearn.feature_selection import SelectKBest, f_regression

Initialize the f_selector object, defining the scoring method. 

In [33]:
# Selector that scores features with f_regression and keeps the best 13
f_selector = SelectKBest(score_func=f_regression, k=13)

Fit the object to our X and y data (train). This will score, rank, and identify the top k features.

In [32]:
# Score every feature against the target G3 using the training split only
f_selector = f_selector.fit(X_train_scaled, y_train.G3)

# Boolean mask over the columns of X_train_scaled: True for each of the k
# selected features. Later cells index the columns with this mask.
f_support = f_selector.get_support()

f_support

In [None]:
# Keep only the columns of the scaled training features picked out by f_support
# (positional selection via iloc).
# NOTE(review): presumably f_support is the selector's support mask
# (f_selector.get_support()) — verify it is defined before this cell runs.
X_reduced_scaled = X_train_scaled.iloc[:, f_support]

In [None]:
# Names of the columns picked out by f_support, as a plain Python list
f_feature = X_train_scaled.columns[f_support].to_list()