# Sistema RBC para análise de casos similares com o dataset student

### Preparando bibliotecas

* Instalando pycbr

In [1]:
# One-time setup: uncomment the line below to install pycbr.
# (In a notebook, `%pip install pycbr` targets the running kernel's environment.)
#!pip install pycbr

In [2]:
import pycbr
import pandas as pd
import sklearn
import tempfile

Unable to load a logging configuration file. Using the default settings.


------------------------------------------------------------------
### Carregando base de dados

In [3]:
# Load the raw student dataset; the path is relative to the notebook's working directory.
df_mat = pd.read_csv("student-mat.csv")
# Preview the first rows to sanity-check the load.
df_mat.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


In [4]:
# Summarize the columns: names, non-null counts and dtypes.
df_mat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 33 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   school      395 non-null    object
 1   sex         395 non-null    object
 2   age         395 non-null    int64 
 3   address     395 non-null    object
 4   famsize     395 non-null    object
 5   Pstatus     395 non-null    object
 6   Medu        395 non-null    int64 
 7   Fedu        395 non-null    int64 
 8   Mjob        395 non-null    object
 9   Fjob        395 non-null    object
 10  reason      395 non-null    object
 11  guardian    395 non-null    object
 12  traveltime  395 non-null    int64 
 13  studytime   395 non-null    int64 
 14  failures    395 non-null    int64 
 15  schoolsup   395 non-null    object
 16  famsup      395 non-null    object
 17  paid        395 non-null    object
 18  activities  395 non-null    object
 19  nursery     395 non-null    object
 20  higher    

------------------------------------------------------------------
### Tratando dados

Para obter casos similares vamos trabalhar com os seguintes atributos:
- absences: number of school absences (numeric: 0 to 93)
- internet: Internet access at home (binary: yes or no)
- failures: number of past class failures (numeric: 0 to 3, where 3 means three or more)
- sex: student's sex (binary: female or male)
- studytime: weekly study time (numeric: 1 – < 2 hours, 2 – 2 to 5 hours, 3 – 5 to 10 hours or 4 – > 10 hours)
- age: student's age (numeric: from 15 to 22)
- G1: first period grade (numeric: from 0 to 20)
- G2: second period grade (numeric: from 0 to 20)

Iremos afirmar se o aluno vai ou não passar baseado na nota final G3, classificando a nota da seguinte forma:

                   |    pass    |    fail    |
                   |  20 - 10   |   09 - 0   |

In [5]:
# Classify the final grade G3 as 'pass' or 'fail'

PASSING_GRADE = 10  # G3 >= 10 is labelled 'pass', anything lower 'fail'

# Vectorized comparison instead of a Python-level loop over the rows.
# A NaN grade compares False and is therefore labelled 'fail', exactly as the
# explicit if/else did.
result_mat = (
    df_mat['G3']
    .ge(PASSING_GRADE)
    .map({True: 'pass', False: 'fail'})
    .tolist()
)
df_mat['binaryClass'] = result_mat

In [6]:
# Build the working dataset with only the attributes used by the CBR system.
# Explicit column selection raises KeyError on a misspelled/missing column
# (the DataFrame constructor with `columns=` would silently create an all-NaN
# column instead), and `.copy()` makes df_student independent of df_mat so the
# later column assignments never touch the raw frame.
selected_columns = ['sex', 'age', 'absences', 'studytime', 'failures', 'internet', 'G1', 'G2', 'binaryClass']
df_student = df_mat[selected_columns].copy()

# Every selected column except the target is a similarity feature.
features_names = df_student.columns.drop('binaryClass')
df_student.head()

Unnamed: 0,sex,age,absences,studytime,failures,internet,G1,G2,binaryClass
0,F,18,6,2,0,no,5,6,fail
1,F,17,4,2,0,yes,5,5,fail
2,F,15,10,2,3,yes,7,8,pass
3,F,15,2,3,0,yes,15,14,pass
4,F,16,4,2,0,no,6,10,pass


In [7]:
# Convert the categorical values to numeric codes.
# One explicit lookup table per column instead of reusing a single throwaway dict.
CATEGORY_ENCODINGS = {
    'internet': {'yes': 1, 'no': 0},         # yes / no values
    'sex': {'F': 1, 'M': 0},                 # sex data
    'binaryClass': {'pass': 1, 'fail': 0},   # grade outcome
}

for column, encoding in CATEGORY_ENCODINGS.items():
    # Keep the cell idempotent: mapping an already-encoded column a second time
    # would turn every value into NaN, because 0/1 are not keys of the table.
    already_encoded = df_student[column].isin(list(encoding.values())).all()
    if not already_encoded:
        df_student[column] = df_student[column].map(encoding)

df_student.head()

Unnamed: 0,sex,age,absences,studytime,failures,internet,G1,G2,binaryClass
0,1,18,6,2,0,0,5,6,0
1,1,17,4,2,0,1,5,5,0
2,1,15,10,2,3,1,7,8,1
3,1,15,2,3,0,1,15,14,1
4,1,16,4,2,0,0,6,10,1


In [8]:
# Store the dataframe in a CSV file (without the index column);
# the case base in the next section is built from this same file.
df_student.to_csv("file_student.csv", index = False)

### Construindo sistema RBC

In [9]:
# 1 - Define the case base from the CSV file
case_base = pycbr.casebase.SimpleCSVCaseBase("file_student.csv")

In [10]:
# 2 - Define the similarity function (linear similarity):
#     every feature column gets its own QuantileLinearAttribute model.
attribute_models = [
    (feature, pycbr.models.QuantileLinearAttribute())
    for feature in features_names
]
recovery = pycbr.recovery.Recovery(attribute_models)

In [11]:
# 3 - Define the attribute used to aggregate the answers
#     (MajorityAggregate over the "binaryClass" target column)
aggregation = pycbr.aggregate.MajorityAggregate("binaryClass")

In [12]:
# 4 - Create the CBR object instance from the case base, the similarity
#     (recovery) definition and the aggregation method
cbr = pycbr.CBR(case_base, recovery, aggregation, server_name = 'Student-demo')

[32m2021-06-02 15:50:43[0m [35mLAPTOP-SGQRE9VS[0m [34mnumexpr.utils[12180][0m [1;30mINFO[0m NumExpr defaulting to 8 threads.


### Aplicação WSGI (Web Server Gateway Interface)

In [None]:
# Expose the CBR system as a web application and start serving it.
# NOTE: app.run() starts the Flask development server and blocks this cell
# until it is interrupted (the log below shows it listening on
# http://127.0.0.1:5000/), so no later cell runs while it is active.
app = cbr.app
app.run()

 * Serving Flask app "Student-demo" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


[32m2021-06-02 15:50:43[0m [35mLAPTOP-SGQRE9VS[0m [34mwerkzeug[12180][0m [1;30mINFO[0m  * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
[32m2021-06-02 15:51:01[0m [35mLAPTOP-SGQRE9VS[0m [34mwerkzeug[12180][0m [1;30mINFO[0m 127.0.0.1 - - [02/Jun/2021 15:51:01] "GET / HTTP/1.1" 200 -
[32m2021-06-02 15:51:01[0m [35mLAPTOP-SGQRE9VS[0m [34mwerkzeug[12180][0m [1;30mINFO[0m 127.0.0.1 - - [02/Jun/2021 15:51:01] "GET /swaggerui/droid-sans.css HTTP/1.1" 200 -
[32m2021-06-02 15:51:01[0m [35mLAPTOP-SGQRE9VS[0m [34mwerkzeug[12180][0m [1;30mINFO[0m 127.0.0.1 - - [02/Jun/2021 15:51:01] "GET /swaggerui/swagger-ui.css HTTP/1.1" 200 -
[32m2021-06-02 15:51:01[0m [35mLAPTOP-SGQRE9VS[0m [34mwerkzeug[12180][0m [1;30mINFO[0m 127.0.0.1 - - [02/Jun/2021 15:51:01] "GET /swaggerui/swagger-ui-bundle.js HTTP/1.1" 200 -
[32m2021-06-02 15:51:01[0m [35mLAPTOP-SGQRE9VS[0m [34mwerkzeug[12180][0m [1;30mINFO[0m 127.0.0.1 - - [02/Jun/2021 15:51:01] "GET /swaggerui