In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
data = pd.read_csv("data1.csv")

In [4]:
X = data.drop(columns=['MonthlyCharges','customerID'],axis=1)
y = data['MonthlyCharges']

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 28 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   customerID                               7032 non-null   object 
 1   gender                                   7032 non-null   int64  
 2   SeniorCitizen                            7032 non-null   int64  
 3   Partner                                  7032 non-null   int64  
 4   Dependents                               7032 non-null   int64  
 5   tenure                                   7032 non-null   float64
 6   PhoneService                             7032 non-null   int64  
 7   MultipleLines                            7032 non-null   int64  
 8   OnlineSecurity                           7032 non-null   int64  
 9   OnlineBackup                             7032 non-null   int64  
 10  DeviceProtection                         7032 no

In [6]:
#############################################################################

# Select K-best features#

####################################################3

import sklearn.feature_selection as fs
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import sklearn.metrics as metrics



In [8]:
f1_list = []
for k in range(1, 26):
    bk = fs.SelectKBest(fs.f_regression, k=k)
    bk.fit(X, y)
    X_trans = bk.transform(X)
    train_x, test_x, train_y, test_y = train_test_split(X_trans,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)
    lr = LinearRegression()
    lr.fit(train_x, train_y)
    y_pred = lr.predict(test_x)
    f1 = metrics.r2_score(test_y, y_pred)
    f1_list.append(f1)


In [9]:
fig, axe = plt.subplots(dpi = 300)
axe.plot(range(1, 26), f1_list)
axe.set_xlabel("best k features")
axe.set_ylabel("F1-score")
fig.savefig("img.png")
plt.close(fig)


In [16]:
# choose the f_classif as the metric and K is 3
bk = fs.SelectKBest(fs.f_classif, k=13)
bk.fit(X, y)
X_trans = bk.transform(X)

print(bk.get_feature_names_out)

print(bk.get_params)

print (bk.scores_)
print (bk.pvalues_)


<bound method SelectorMixin.get_feature_names_out of SelectKBest(k=13)>


In [20]:
#########################################################################
#SequentialFeatureSelector - farward
#########################################################################

# pip install mlxtend

#importing the necessary libraries
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Sequential Forward Selection(sfs)
sfs = SFS(LinearRegression(),
          k_features=10,
          forward=True,
          floating=False,
          scoring = 'r2',
          cv = 0)

sfs.fit(X, y)

print(sfs.k_feature_names_)     # to get the final set of features



('PhoneService', 'MultipleLines', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'TotalCharges', 'InternetService_DSL', 'InternetService_Fiber optic')


In [21]:
#########################################################################
#SequentialFeatureSelector - farward
#########################################################################

# pip install mlxtend

#importing the necessary libraries
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Sequential Forward Selection(sfs)
sfs = SFS(LinearRegression(),
          k_features=10,
          forward=True,
          floating=False,
          scoring = 'neg_mean_absolute_error',
          cv = 0)

sfs.fit(X, y)

print(sfs.k_feature_names_)     # to get the final set of features



('PhoneService', 'MultipleLines', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'TotalCharges', 'InternetService_DSL', 'InternetService_No')


In [22]:
#########################################################################
#SequentialFeatureSelector - backward
#########################################################################

# pip install mlxtend

#importing the necessary libraries
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
# Sequential Forward Selection(sfs)
sfs = SFS(LinearRegression(),
          k_features=10,
          forward=False,
          floating=False,
          scoring = 'neg_mean_absolute_error',
          cv = 0)

sfs.fit(X, y)

print(sfs.k_feature_names_)     # to get the final set of features



('PhoneService', 'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'InternetService_DSL', 'InternetService_Fiber optic')


In [32]:
############## Variable selection using RFE method
## RFE - Recursive Feature Elimination
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, RFE, SelectFromModel
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

lr = LinearRegression()
rfe = RFE(lr, n_features_to_select=5)

rfe = rfe.fit(X, y.values.ravel())

rfe_support=rfe.get_support()
print(rfe.get_support())
print(rfe.support_)
print(rfe.ranking_)


print("Selected Features",X.columns[rfe.support_])



[False False False False False  True False False False False False  True
  True False False False False  True  True False False False False False
 False False]
[False False False False False  True False False False False False  True
  True False False False False  True  True False False False False False
 False False]
[12 16 10 15  8  1  5  2  6  3  4  1  1 13  7 19  9  1  1 22 14 20 17 18
 21 11]
Selected Features Index(['PhoneService', 'StreamingTV', 'StreamingMovies',
       'InternetService_Fiber optic', 'InternetService_No'],
      dtype='object')
