##Classificando o preço de smartphones

#### 1 - Importando os dados

In [0]:
#importando bibliotecas
import pandas as pd
import numpy as np

In [0]:
#lendo os arquivos com Spark
df1 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/danielzinhosoares206@gmail.com/test-1.csv")
df2 = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/danielzinhosoares206@gmail.com/train-1.csv")

In [0]:
#colocando os dados em um DataFrame Pandas
to_predict = df1.toPandas()
train_test = df2.toPandas()

###2 - Ajustando os dataframes

In [0]:
#obtendo uma visão geral do dataframe treino
train_test.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [0]:
#obtendo uma visão geral do dataframe para predição
to_predict.head()

Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,16,226,1412,3476,12,7,2,0,1,0
1,2,841,1,0.5,1,4,1,61,0.8,191,...,12,746,857,3895,6,0,7,1,0,0
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,4,1270,1366,2396,17,10,10,0,1,1
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,20,295,1752,3893,10,0,7,1,1,0
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,18,749,810,1773,15,8,7,1,0,1


In [0]:
id_predict = to_predict['id'] 

#dropando a coluna 'id'
to_predict.drop('id', axis = 1, inplace = True)

###3 - Machine Learning (Testando os modelos para seleção)

####3.1 - Importando as bibliotecas

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

####3.1.2 - Dividindo o dataset para treino e teste

In [0]:
#separando e transformando em array as variáveis dependente (y) das variáveis independentes (x)
x = train_test.iloc[:, :-1].values
y = train_test.iloc[:, -1].values

In [0]:
x_train,x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

####3.2 - KNN

In [0]:
#importando o modelo
from sklearn.neighbors import KNeighborsClassifier

In [0]:
#definindo os parâmetros a serem testados
parametros_knn = {'n_neighbors':np.arange(1, 15)}
#iniciando o modelo
knn = KNeighborsClassifier()

In [0]:
#criando um hiperparametrizador 
knn_CV = GridSearchCV(knn, parametros_knn, cv = 5)

In [0]:
#dando fit nos conjuntos de treino com o modelo hiperparametrizado
knn_CV.fit(x_train, y_train)

Out[116]: GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])})

In [0]:
#predizendo o x_test
y_pred_knn = knn_CV.predict(x_test)

In [0]:
print('O melhor parâmetro foi: {}'.format(knn_CV.best_params_))
print(classification_report(y_pred_knn, y_test))
print('A acurácia foi de: {}'.format(knn_CV.score(x_test, y_test)))

O melhor parâmetro foi: {'n_neighbors': 11}
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       106
           1       0.96      0.92      0.94        95
           2       0.93      0.91      0.92        95
           3       0.93      1.00      0.96       104

    accuracy                           0.95       400
   macro avg       0.95      0.95      0.95       400
weighted avg       0.95      0.95      0.95       400

A acurácia foi de: 0.95


#### 3.3 Random Forest Classifier

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
parametros_RF = {'n_estimators':np.arange(100, 150),
                'bootstrap':[True, False]}
RFClassifier = RandomForestClassifier()

In [0]:
RFClassifier_CV = GridSearchCV(RFClassifier, parametros_RF, cv = 5)

In [0]:
RFClassifier_CV.fit(x_train, y_train)

Out[122]: GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'bootstrap': [True, False],
                         'n_estimators': array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149])})

In [0]:
y_pred_RF = RFClassifier_CV.predict(x_test)

In [0]:
print('O melhor parâmetro foi: {}'.format(RFClassifier_CV.best_params_))
print(classification_report(y_pred_RF, y_test))
print('A acurácia foi de: {}'.format(RFClassifier_CV.score(x_test, y_test)))

O melhor parâmetro foi: {'bootstrap': True, 'n_estimators': 120}
              precision    recall  f1-score   support

           0       0.94      0.95      0.95       104
           1       0.89      0.85      0.87        95
           2       0.83      0.80      0.81        95
           3       0.88      0.92      0.90       106

    accuracy                           0.89       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.89      0.88       400

A acurácia foi de: 0.885


####3.4 - Support Vector Classification (SVC)

In [0]:
#importando o modelo
from sklearn.svm import SVC

In [0]:
#definindo os passos a serem seguidos pela Pipeline
passos = [('scaler', StandardScaler()),
         ('SVM', SVC())]

In [0]:
#criando a Pipeline
pipeline = Pipeline(passos)

In [0]:
#definindo os parâmetros a serem testados
parametros_svc = {'SVM__C': np.arange(1, 10, 100),
                 'SVM__gamma':[0.1, 0.01]}

In [0]:
#criando um hiperparametrizador 
svm_CV = GridSearchCV(pipeline, parametros_svc, cv = 5)

In [0]:
#dando fit nos conjuntos de treino com o modelo hiperparametrizado
svm_CV.fit(x_train, y_train)

Out[130]: GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('SVM', SVC())]),
             param_grid={'SVM__C': array([1]), 'SVM__gamma': [0.1, 0.01]})

In [0]:
#predizendo o x_test
y_pred_svm = svm_CV.predict(x_test)

In [0]:
print('O melhor parâmetro foi: {}'.format(svm_CV.best_params_))
print(classification_report(y_pred_svm, y_test))
print('A acurácia foi de: {}'.format(svm_CV.score(x_test, y_test)))

O melhor parâmetro foi: {'SVM__C': 1, 'SVM__gamma': 0.01}
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       103
           1       0.97      0.91      0.94        97
           2       0.90      0.86      0.88        96
           3       0.90      0.97      0.94       104

    accuracy                           0.94       400
   macro avg       0.94      0.93      0.93       400
weighted avg       0.94      0.94      0.93       400

A acurácia foi de: 0.935


###4.0 - Machine Learning com o modelo selecionado

In [0]:
#dividindo os datasets de treino e teste
x_train = train_test.iloc[:, :-1].values
y_train = train_test.iloc[:, -1].values
x_test = to_predict.iloc[:].values

In [0]:
#criando um hiperparametrizador com o modelo de melhor acurácia
knn_CV2 = GridSearchCV(knn, parametros_knn, cv = 5)

In [0]:
#dando fit nos conjuntos de treino com o modelo hiperparametrizado
knn_CV2.fit(x_train, y_train)

Out[135]: GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])})

In [0]:
#predizendo o x_test
y_predict = knn_CV2.predict(x_test)

In [0]:
#mostrando o resultado
print(y_predict)

['3' '3' '3' '3' '1' '3' '3' '1' '3' '0' '3' '3' '0' '0' '2' '0' '2' '1'
 '3' '2' '1' '3' '1' '1' '3' '0' '2' '0' '3' '0' '2' '0' '3' '0' '0' '1'
 '3' '1' '2' '1' '1' '2' '0' '0' '0' '1' '0' '3' '1' '2' '1' '0' '3' '0'
 '3' '0' '3' '1' '1' '3' '3' '2' '0' '2' '1' '1' '1' '3' '1' '2' '1' '2'
 '2' '3' '3' '0' '2' '0' '2' '3' '0' '3' '3' '0' '3' '0' '3' '1' '3' '0'
 '1' '2' '2' '1' '2' '2' '0' '2' '1' '2' '1' '0' '0' '3' '0' '2' '0' '1'
 '2' '3' '3' '3' '1' '3' '3' '3' '3' '1' '3' '0' '0' '3' '2' '1' '2' '0'
 '3' '2' '3' '1' '0' '2' '1' '1' '3' '1' '1' '0' '3' '2' '1' '3' '1' '3'
 '2' '3' '3' '2' '2' '3' '2' '3' '1' '0' '3' '2' '3' '3' '3' '3' '2' '2'
 '3' '3' '3' '3' '1' '0' '3' '0' '0' '0' '1' '1' '0' '1' '0' '0' '1' '2'
 '1' '0' '0' '1' '1' '2' '2' '1' '0' '0' '0' '1' '0' '3' '1' '0' '2' '2'
 '3' '3' '1' '2' '2' '3' '3' '2' '2' '1' '0' '0' '1' '2' '0' '2' '3' '3'
 '0' '2' '0' '3' '2' '3' '3' '1' '0' '1' '0' '3' '0' '1' '0' '2' '2' '1'
 '2' '0' '3' '0' '3' '1' '2' '0' '0' '2' '1' '3' '3

In [0]:
#adicionando o price_range e id de cada registro no dataset
to_predict['price_range'] = y_predict
to_predict.insert(0, 'Id', id_predict)

In [0]:
#visão geral do dataset já rotulado
to_predict.head()

Unnamed: 0,Id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,226,1412,3476,12,7,2,0,1,0,3
1,2,841,1,0.5,1,4,1,61,0.8,191,...,746,857,3895,6,0,7,1,0,0,3
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,1270,1366,2396,17,10,10,0,1,1,3
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,295,1752,3893,10,0,7,1,1,0,3
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,749,810,1773,15,8,7,1,0,1,1
