# Seleção de atributos com variância

In [67]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy import stats

from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, KFold

In [2]:
np.random.rand(50)

array([0.15858308, 0.88710602, 0.55250525, 0.16008173, 0.15619666,
       0.24105063, 0.67750926, 0.46387645, 0.56098526, 0.20732289,
       0.08222612, 0.64411437, 0.98289725, 0.48943197, 0.49763972,
       0.33575612, 0.78224156, 0.85135865, 0.38087687, 0.00198717,
       0.68466441, 0.66116771, 0.89681747, 0.11974718, 0.90856998,
       0.83650088, 0.41329253, 0.74268509, 0.78928361, 0.66943268,
       0.95497885, 0.23907706, 0.25405895, 0.72760565, 0.98315058,
       0.55410508, 0.77008593, 0.45711358, 0.0287057 , 0.45335005,
       0.73846002, 0.03551622, 0.25420501, 0.80030372, 0.43127187,
       0.10450712, 0.47539656, 0.21795748, 0.86027569, 0.21021122])

In [3]:
np.random.randint(0, 2)

1

In [4]:
base_selecao = {'a': np.random.rand(20),
                'b': np.array([0.5] * 20),
               'classe':np.random.randint(0, 2, size = 20)}

In [5]:
base_selecao

{'a': array([0.83225578, 0.73328027, 0.84357636, 0.30766032, 0.45960728,
        0.32529375, 0.01290526, 0.02813623, 0.05017483, 0.85429502,
        0.14688701, 0.87560768, 0.92241977, 0.9816894 , 0.95555835,
        0.40940048, 0.72335307, 0.7890246 , 0.91245783, 0.49586941]),
 'b': array([0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
        0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]),
 'classe': array([1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0])}

In [6]:
df = pd.DataFrame(base_selecao)
df.head()

Unnamed: 0,a,b,classe
0,0.832256,0.5,1
1,0.73328,0.5,0
2,0.843576,0.5,1
3,0.30766,0.5,0
4,0.459607,0.5,0


In [7]:
df.describe()

Unnamed: 0,a,b,classe
count,20.0,20.0,20.0
mean,0.582973,0.5,0.6
std,0.340096,0.0,0.502625
min,0.012905,0.5,0.0
25%,0.320885,0.5,0.0
50%,0.728317,0.5,1.0
75%,0.859623,0.5,1.0
max,0.981689,0.5,1.0


In [8]:
from Models.Stats import Stats
cls_stats = Stats()

In [9]:
cls_stats.variance(df['a']), cls_stats.variance(df['b'])

(0.10988184844301166, 0.0)

In [10]:
round(cls_stats.std_deviation(df['a']), 6), round(cls_stats.std_deviation(df['b']), 6)

(0.331484, 0.0)

In [11]:
X = df.iloc[:, 0:2].values
X

array([[0.83225578, 0.5       ],
       [0.73328027, 0.5       ],
       [0.84357636, 0.5       ],
       [0.30766032, 0.5       ],
       [0.45960728, 0.5       ],
       [0.32529375, 0.5       ],
       [0.01290526, 0.5       ],
       [0.02813623, 0.5       ],
       [0.05017483, 0.5       ],
       [0.85429502, 0.5       ],
       [0.14688701, 0.5       ],
       [0.87560768, 0.5       ],
       [0.92241977, 0.5       ],
       [0.9816894 , 0.5       ],
       [0.95555835, 0.5       ],
       [0.40940048, 0.5       ],
       [0.72335307, 0.5       ],
       [0.7890246 , 0.5       ],
       [0.91245783, 0.5       ],
       [0.49586941, 0.5       ]])

In [12]:
y = df.iloc[:,2].values
y

array([1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0])

In [13]:
selecao = VarianceThreshold(threshold=0.06)
X_novo = selecao.fit_transform(X)

In [14]:
X_novo, X_novo.shape

(array([[0.83225578],
        [0.73328027],
        [0.84357636],
        [0.30766032],
        [0.45960728],
        [0.32529375],
        [0.01290526],
        [0.02813623],
        [0.05017483],
        [0.85429502],
        [0.14688701],
        [0.87560768],
        [0.92241977],
        [0.9816894 ],
        [0.95555835],
        [0.40940048],
        [0.72335307],
        [0.7890246 ],
        [0.91245783],
        [0.49586941]]),
 (20, 1))

In [15]:
selecao.variances_

array([0.10988185, 0.        ])

In [16]:
idx = np.where(selecao.variances_ > 0.065)
idx

(array([0], dtype=int64),)

# EXERCÍCIO

O objetivo deste exercício é utilizar a base de dados do crédito e aplicar a técnica de seleção de atributos utilizando variância

* Carregue o arquivo `credit_data.csv`

* Calcule a variância para os atributos <b>income</b>, <b>age</b> e <b>loan</b> e aplique o método de seleção Low Variance

* Faça um teste do accuracy utilizando o algoritmo Naïve Bayes, sem seleção de atributos e com seleção de atributos

In [42]:
df = pd.read_csv('CSVs/credit_data.csv')
df.head()

Unnamed: 0,i#clientid,income,age,loan,c#default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [43]:
df.isnull().sum()

i#clientid    0
income        0
age           3
loan          0
c#default     0
dtype: int64

In [44]:
df.describe()

Unnamed: 0,i#clientid,income,age,loan,c#default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [45]:
df.fillna(df['age'].mean(), inplace = True)
df.isnull().sum()

i#clientid    0
income        0
age           0
loan          0
c#default     0
dtype: int64

In [49]:
for idx in df.query('age < 18').index:
    df.loc[idx, 'age'] = df['age'].mean()
df['age'].value_counts()

40.807559    4
49.499961    1
40.405455    1
45.188194    1
48.917214    1
            ..
50.418527    1
58.348455    1
32.300207    1
23.915705    1
52.374629    1
Name: age, Length: 1997, dtype: int64

In [50]:
df.query('age < 18')

Unnamed: 0,i#clientid,income,age,loan,c#default


In [51]:
inc_var = cls_stats.variance(df['income'])
age_var = cls_stats.variance(df['age'])
loan_var = cls_stats.variance(df['loan'])
inc_var, age_var, loan_var

(205141026.89193964, 175.52427911817242, 9269884.955556015)

In [52]:
cls_stats.std_deviation(df['income']), cls_stats.std_deviation(df['age']), cls_stats.std_deviation(df['loan'])

(14322.745089260636, 13.248557624065059, 3044.648576692557)

In [53]:
X = df.iloc[:,1:4].values
X

array([[66155.92509508,    59.01701507,  8106.53213129],
       [34415.15396582,    48.1171531 ,  6564.74501768],
       [57317.17006303,    63.10804949,  8020.95329639],
       ...,
       [44311.44926231,    28.0171669 ,  5522.78669326],
       [43756.05660491,    63.97179584,  1622.72259832],
       [69436.57955155,    56.15261703,  7378.83359873]])

In [54]:
y = df.iloc[:,4].values
y

array([0, 0, 0, ..., 1, 0, 0], dtype=int64)

In [55]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[0.9231759 , 0.89209175, 0.58883739],
       [0.28812165, 0.65470788, 0.47682695],
       [0.74633429, 0.9811888 , 0.58262011],
       ...,
       [0.48612202, 0.21695807, 0.40112895],
       [0.47500998, 1.        , 0.1177903 ],
       [0.98881367, 0.82970913, 0.53597028]])

In [56]:
X_scaled.var(axis=0)

array([0.08211816, 0.08325256, 0.04892632])

In [57]:
selecao = VarianceThreshold(threshold=0.049)
X_low = selecao.fit_transform(X_scaled)
X_low.shape

(2000, 2)

In [58]:
np.set_printoptions(suppress=True)
selecao.variances_

array([0.08211816, 0.08325256, 0.04892632])

In [59]:
X_low.var(axis=0)

array([0.08211816, 0.08325256])

In [60]:
nb = GaussianNB()
nb.fit(X_scaled, y)
predicted = nb.predict(X_scaled)
score = accuracy_score(y , predicted)
score

0.926

In [61]:
nb = GaussianNB()
nb.fit(X_low, y)
predicted = nb.predict(X_low)
score = accuracy_score(y , predicted)
score

0.8305

## Valores Faltantes

In [63]:
df = pd.read_csv('CSVs/autos.csv', encoding='ISO-8859-1')
df.head()

Unnamed: 0,dateCrawled,name,seller,offerType,price,abtest,vehicleType,yearOfRegistration,gearbox,powerPS,model,kilometer,monthOfRegistration,fuelType,brand,notRepairedDamage,dateCreated,nrOfPictures,postalCode,lastSeen
0,2016-03-24 11:52:17,Golf_3_1.6,privat,Angebot,480,test,,1993,manuell,0,golf,150000,0,benzin,volkswagen,,2016-03-24 00:00:00,0,70435,2016-04-07 03:16:57
1,2016-03-24 10:58:45,A5_Sportback_2.7_Tdi,privat,Angebot,18300,test,coupe,2011,manuell,190,,125000,5,diesel,audi,ja,2016-03-24 00:00:00,0,66954,2016-04-07 01:46:50
2,2016-03-14 12:52:21,"Jeep_Grand_Cherokee_""Overland""",privat,Angebot,9800,test,suv,2004,automatik,163,grand,125000,8,diesel,jeep,,2016-03-14 00:00:00,0,90480,2016-04-05 12:47:46
3,2016-03-17 16:54:04,GOLF_4_1_4__3TÜRER,privat,Angebot,1500,test,kleinwagen,2001,manuell,75,golf,150000,6,benzin,volkswagen,nein,2016-03-17 00:00:00,0,91074,2016-03-17 17:40:17
4,2016-03-31 17:25:20,Skoda_Fabia_1.4_TDI_PD_Classic,privat,Angebot,3600,test,kleinwagen,2008,manuell,69,fabia,90000,7,diesel,skoda,nein,2016-03-31 00:00:00,0,60437,2016-04-06 10:17:21


In [64]:
df.isnull().sum()

dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            37869
yearOfRegistration         0
gearbox                20209
powerPS                    0
model                  20484
kilometer                  0
monthOfRegistration        0
fuelType               33386
brand                      0
notRepairedDamage      72060
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371528 entries, 0 to 371527
Data columns (total 20 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   dateCrawled          371528 non-null  object
 1   name                 371528 non-null  object
 2   seller               371528 non-null  object
 3   offerType            371528 non-null  object
 4   price                371528 non-null  int64 
 5   abtest               371528 non-null  object
 6   vehicleType          333659 non-null  object
 7   yearOfRegistration   371528 non-null  int64 
 8   gearbox              351319 non-null  object
 9   powerPS              371528 non-null  int64 
 10  model                351044 non-null  object
 11  kilometer            371528 non-null  int64 
 12  monthOfRegistration  371528 non-null  int64 
 13  fuelType             338142 non-null  object
 14  brand                371528 non-null  object
 15  notRepairedDamage    299468 non-nu

In [66]:
df['fuelType'].unique()

array(['benzin', 'diesel', nan, 'lpg', 'andere', 'hybrid', 'cng',
       'elektro'], dtype=object)

In [68]:
stats.mode(df['fuelType'])

ModeResult(mode=array(['benzin'], dtype=object), count=array([223857]))

In [69]:
from Models.Stats import Stats
cls_stats = Stats()

In [70]:
cls_stats.mode(df['fuelType'])

(array(['benzin'], dtype=object), 223857)

In [77]:
df['fuelType'].mode()[0]

'benzin'

In [79]:
df['fuelType'].fillna(df['fuelType'].mode()[0], inplace = True)

In [80]:
df.isnull().sum()

dateCrawled                0
name                       0
seller                     0
offerType                  0
price                      0
abtest                     0
vehicleType            37869
yearOfRegistration         0
gearbox                20209
powerPS                    0
model                  20484
kilometer                  0
monthOfRegistration        0
fuelType                   0
brand                      0
notRepairedDamage      72060
dateCreated                0
nrOfPictures               0
postalCode                 0
lastSeen                   0
dtype: int64