In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold 
from sklearn.model_selection import GridSearchCV

In [None]:
# Allow up to 64 columns and 64 rows to be shown when a DataFrame is displayed.
pd.options.display.max_columns = 64
pd.options.display.max_rows = 64

In [None]:
# Read the CSV file stored in the mounted Drive folder.
data_path = '/content/drive/MyDrive/SistemasInteligentes/dataset/'
data_name = 'Data_train_reduced.csv'

# The directory already ends with a slash, so the two parts are simply joined.
data = pd.read_csv(f'{data_path}{data_name}')

In [None]:
# Preview the dataset; `n` controls how many rows are shown (5 is the default).
data.head(n = 5)

Unnamed: 0,Respondent.ID,Product.ID,Product,Instant.Liking,q1_1.personal.opinion.of.this.Deodorant,q2_all.words,q3_1.strength.of.the.Deodorant,q4_1.artificial.chemical,q4_2.attractive,q4_3.bold,q4_4.boring,q4_5.casual,q4_6.cheap,q4_7.clean,q4_8.easy.to.wear,q4_9.elegant,q4_10.feminine,q4_11.for.someone.like.me,q4_12.heavy,q4_13.high.quality,q4_14.long.lasting,q4_15.masculine,q4_16.memorable,q4_17.natural,q4_18.old.fashioned,q4_19.ordinary,q4_20.overpowering,q4_21.sharp,q4_22.sophisticated,q4_23.upscale,q4_24.well.rounded,q5_1.Deodorant.is.addictive,q7,q8.1,q8.2,q8.5,q8.6,q8.7,q8.8,q8.9,q8.10,q8.11,q8.12,q8.13,q8.17,q8.18,q8.19,q8.20,q9.how.likely.would.you.be.to.purchase.this.Deodorant,q10.prefer.this.Deodorant.or.your.usual.Deodorant,q11.time.of.day.would.this.Deodorant.be.appropriate,q12.which.occasions.would.this.Deodorant.be.appropriate,Q13_Liking.after.30.minutes,q14.Deodorant.overall.on.a.scale.from.1.to.10,ValSegb,s7.involved.in.the.selection.of.the.cosmetic.products,s8.ethnic.background,s9.education,s10.income,s11.marital.status,s12.working.status,s13.2,s13a.b.most.often,s13b.bottles.of.Deodorant.do.you.currently.own
0,3800,121,Deodorant B,1,4,1,4,2,5,4,2,3,5,5,5,4,5,3,1,5,1,2,4,5,4,5,1,1,4,1,4,1,1,0,0.0,0,0,,1.0,,,0,0.0,0,,,0,0.0,2,1,1,2,1,7,7,4,1,4,3,1,1,0.0,0,3
1,3801,121,Deodorant B,0,5,1,4,4,2,2,1,3,2,4,4,4,3,1,1,3,4,4,5,3,3,4,2,2,5,4,4,4,4,0,0.0,0,1,,0.0,,,0,0.0,0,,,0,0.0,3,5,3,3,3,8,6,4,1,4,3,1,1,0.0,0,4
2,3802,121,Deodorant B,0,6,1,3,2,5,2,4,2,4,3,5,4,4,4,3,1,2,1,4,2,4,3,2,5,4,4,3,4,3,0,0.0,0,0,,0.0,,,0,0.0,0,,,0,0.0,5,1,3,3,2,5,7,4,1,3,5,1,1,0.0,0,2
3,3803,121,Deodorant B,1,4,0,4,5,5,4,3,5,2,5,3,5,5,5,1,4,3,3,5,5,4,2,5,3,3,5,4,4,3,0,0.0,0,0,,0.0,,,0,0.0,0,,,0,0.0,5,4,1,3,6,8,1,4,1,4,9,1,3,0.0,0,3
4,3804,121,Deodorant B,1,4,1,2,1,3,1,1,3,3,5,3,5,5,5,1,4,4,2,3,5,1,2,4,2,3,1,5,3,2,0,0.0,0,0,,0.0,,,0,0.0,0,,,0,0.0,5,3,3,2,5,4,4,4,1,3,5,1,2,0.0,0,3


In [None]:
# Show the total number of rows and columns in the dataset.
data.shape

(2500, 64)

In [None]:
# Show the data type of each column.
data.dtypes

Respondent.ID                                                int64
Product.ID                                                   int64
Product                                                     object
Instant.Liking                                               int64
q1_1.personal.opinion.of.this.Deodorant                      int64
q2_all.words                                                 int64
q3_1.strength.of.the.Deodorant                               int64
q4_1.artificial.chemical                                     int64
q4_2.attractive                                              int64
q4_3.bold                                                    int64
q4_4.boring                                                  int64
q4_5.casual                                                  int64
q4_6.cheap                                                   int64
q4_7.clean                                                   int64
q4_8.easy.to.wear                                            i

In [None]:
# Keep only the rows that have no NaN in any column, then report how many
# rows survive.
data_without_nan = data.dropna(axis = 0, how = 'any')
print(data_without_nan.shape)

(0, 64)


In [None]:
# Percentage of missing values per column: NaN count divided by the number of
# rows in the dataset, scaled to 0-100.
missing = data.isna().sum()
missing_percent = missing.div(len(data.index)).mul(100)
print(missing_percent)

Respondent.ID                                               0.0
Product.ID                                                  0.0
Product                                                     0.0
Instant.Liking                                              0.0
q1_1.personal.opinion.of.this.Deodorant                     0.0
q2_all.words                                                0.0
q3_1.strength.of.the.Deodorant                              0.0
q4_1.artificial.chemical                                    0.0
q4_2.attractive                                             0.0
q4_3.bold                                                   0.0
q4_4.boring                                                 0.0
q4_5.casual                                                 0.0
q4_6.cheap                                                  0.0
q4_7.clean                                                  0.0
q4_8.easy.to.wear                                           0.0
q4_9.elegant                            

In [None]:
# Remove the columns that are not used for modelling.
# A single `drop` call validates every name up front, so a misspelled column
# raises KeyError before anything has been removed from `data` (ten separate
# calls would leave the frame partially modified).
columns_to_drop = [
    # q8.* columns -- presumably the ones with too many missing values
    # according to the percentages printed above; TODO confirm the threshold.
    'q8.2', 'q8.8', 'q8.9', 'q8.10', 'q8.17', 'q8.18', 'q8.20',
    # Respondent identifier.
    'Respondent.ID',
    # Text (object dtype) product name; the numeric 'Product.ID' is kept.
    'Product',
    # NOTE(review): reason for removing this answer is not visible here --
    # confirm whether it is dropped to avoid leaking the target.
    'q1_1.personal.opinion.of.this.Deodorant',
]

# `inplace = True` is kept on purpose: later cells keep reading `data`.
data.drop(columns = columns_to_drop, inplace = True)

In [None]:
# Replace the missing values of each listed column with that column's mean.
#
# The result is assigned back to `data[column]` instead of calling
# `fillna(..., inplace = True)` on the selected column: the chained in-place
# form acts on an intermediate Series, is deprecated in pandas 2.x, and leaves
# `data` unchanged when Copy-on-Write is enabled.
#
# NOTE(review): the mean is computed over the whole dataset before the
# cross-validation split -- confirm this is acceptable for the evaluation.
for column in ('q8.7', 'q8.12'):
    data[column] = data[column].fillna(data[column].mean())

In [None]:
# Re-compute the percentage of missing values per column at this point of the
# notebook: NaN count over the number of rows, scaled to 0-100.
missing = data.isna().sum()
missing_percent = missing.div(len(data.index)).mul(100)
print(missing_percent)

Product.ID                                                 0.0
Instant.Liking                                             0.0
q2_all.words                                               0.0
q3_1.strength.of.the.Deodorant                             0.0
q4_1.artificial.chemical                                   0.0
q4_2.attractive                                            0.0
q4_3.bold                                                  0.0
q4_4.boring                                                0.0
q4_5.casual                                                0.0
q4_6.cheap                                                 0.0
q4_7.clean                                                 0.0
q4_8.easy.to.wear                                          0.0
q4_9.elegant                                               0.0
q4_10.feminine                                             0.0
q4_11.for.someone.like.me                                  0.0
q4_12.heavy                                            

In [None]:
# Separate the feature matrix (x) from the target vector (y).
target_column = 'Instant.Liking'
x = data.drop(columns = target_column)
y = data[target_column]

In [None]:
# Baseline: L2-regularised logistic regression (liblinear, dual formulation)
# scored with a shuffled, seeded 5-fold stratified cross-validation.
# NOTE(review): the estimator itself has no `random_state`; confirm whether
# liblinear's internal shuffling can make this score vary between runs.
model = LogisticRegression(
    solver = 'liblinear',
    penalty = 'l2',
    dual = True,
    C = 0.01,
    max_iter = 1500,
)

kfold = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 5)

result = cross_val_score(estimator = model, X = x, y = y, cv = kfold)

# Mean score across the five folds (displayed as the cell output).
result.mean()



0.7528

In [None]:
# Hyper-parameter search for the logistic regression.

# Candidate values for the inverse regularisation strength.
constant_C = np.array([0.01, 0.1, 0.5, 1, 2, 3, 5, 10, 20, 50, 100, 500, 750, 1000])

# The grid is a list of sub-grids so that only combinations accepted by
# LogisticRegression are fitted:
#   * the solver name is 'liblinear' (it was misspelled, so every fit that
#     used it failed and the solver was never actually evaluated);
#   * the dual formulation exists only for liblinear with the l2 penalty;
#   * lbfgs and newton-cg do not support the l1 penalty.
values_grid = [
    {'solver': ['liblinear'], 'penalty': ['l2'], 'dual': [True, False], 'C': constant_C},
    {'solver': ['liblinear'], 'penalty': ['l1'], 'dual': [False], 'C': constant_C},
    {'solver': ['lbfgs', 'newton-cg'], 'penalty': ['l2'], 'dual': [False], 'C': constant_C},
]

model = LogisticRegression(max_iter = 1500)

# Reuse the shuffled, seeded StratifiedKFold defined for the baseline model so
# that the grid-search scores are computed on the same folds as the baseline.
grid = GridSearchCV(estimator = model, param_grid = values_grid, cv = kfold)
grid.fit(x, y)

# Report the best mean cross-validated score and the parameters that produced it.
best_model = grid.best_estimator_
print("Best Acuracy", grid.best_score_)
print("Param Penalty", best_model.penalty)
print("Param Dual", best_model.dual)
print("Param C", best_model.C)
print("Param Solver", best_model.solver)