In [None]:
import pandas as pd
import json
import seaborn as sns
import matplotlib.pylab as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans


In [None]:
# Apply seaborn's theme and make 18x9 inches the default figure size.
sns.set(rc={"figure.figsize": (18, 9)})

In [None]:
# Load the raw records from the JSON export.
# The encoding is pinned to UTF-8 so the accented values this notebook filters
# on later (e.g. 'À Distância', 'Câmpus Goiânia') decode the same way on every
# platform instead of depending on the locale's default encoding.
with open('data/dados_discentes.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preview only the first few records rather than dumping the whole payload.
data[:3]

In [None]:
# Column names, in the same order as the entries of each record's 'itens' list.
colunas = [
    'campus', 'ano', 'semestre', 'curso', 'tipo_curso', 'modalidade',
    'sexo', 'escolaridade', 'algo', 'cor', 'nascimento', 'status',
]

# Column-oriented accumulator: one (initially empty) list per column.
df = {coluna: [] for coluna in colunas}

for registro in data:
    campos = registro['itens']
    # The i-th entry of 'itens' holds the 'valor' of the i-th column.
    for posicao, coluna in enumerate(colunas):
        df[coluna].append(campos[posicao]['valor'])

df.keys()

In [None]:
# Build the DataFrame from the column lists (the name `df` is rebound here).
df = pd.DataFrame(data=df)
df

In [None]:
# Distinct values of 'algo' in order of first appearance; the position of a
# value in this list is used as its integer code.
classes = list(dict.fromkeys(df['algo']))


def rename_renda(x):
    """Return the integer code of `x`, i.e. its position in `classes`."""
    return classes.index(x)


# Store the codes as 'renda' and drop the original 'algo' column.
df['renda'] = df['algo'].apply(rename_renda)
df = df.drop(columns=['algo'])


In [None]:
# Columns whose values are cast to integers.
numericos = ['ano', 'semestre', 'nascimento']

df = df.astype({coluna: int for coluna in numericos})

df.info()


In [None]:
# Every column that is not listed in `numericos` is treated as categorical.
categoricos = [col for col in df.columns if col not in numericos]

# One-hot encode the categorical columns. By default get_dummies only expands
# object/string/category columns, so a column that is already numeric passes
# through unchanged.
ohc = pd.get_dummies(df[categoricos])

# Assemble the processed frame with a single concat. Assigning the dummy
# columns through `frame[ohc.columns] = ohc` inserts them one by one, which
# fragments the frame and can emit a PerformanceWarning when there are many
# dummy columns; concat produces the same columns in the same order.
df_posprocessing = pd.concat([df.drop(columns=categoricos), ohc], axis=1)
df_posprocessing

In [None]:
# Summary statistics of 'semestre' before scaling.
df_posprocessing.loc[:, ['semestre']].describe()

In [None]:
# Min-max scaling: (value - min) / (max - min) for each numeric column.
for col in numericos:
    valores = df_posprocessing[col]
    minimo = valores.min()
    maximo = valores.max()
    df_posprocessing[col] = (valores - minimo) / (maximo - minimo)

In [None]:
# Inspect the numeric columns after scaling.
df_posprocessing.loc[:, numericos]

In [None]:
# Correlation of every column with the 'status_Não Concluído' indicator.
df_posprocessing.corr().loc[:, 'status_Não Concluído']

## Análise dos Dados

In [None]:

def plot_count(df, column):
    """Draw a count plot of ``df[column]`` with the x tick labels rotated 90 degrees.

    Parameters
    ----------
    df : object indexable by ``column`` (a DataFrame in this notebook)
        Source of the values to count.
    column : hashable
        Key of the column whose distinct values are counted.
    """
    # Pass the values as `x=` explicitly: since seaborn 0.12 the only
    # positional parameter of countplot is `data`, so a positional Series is
    # no longer interpreted as the x variable.
    sns.countplot(x=df[column])
    plt.xticks(rotation=90)

In [None]:
# Record count for each distinct 'campus' value.
plot_count(df, column='campus')

In [None]:
# Record count for each distinct 'semestre' value.
plot_count(df, column='semestre')

In [None]:
# Record count for each distinct 'tipo_curso' value.
plot_count(df, column='tipo_curso')

In [None]:
# Record count for each distinct 'modalidade' value.
plot_count(df, column='modalidade')

In [None]:
# Record count for each distinct 'sexo' value.
plot_count(df, column='sexo')

In [None]:
# Record count for each distinct 'escolaridade' value.
plot_count(df, column='escolaridade')

In [None]:
# Record count for each distinct 'cor' value.
plot_count(df, column='cor')


In [None]:
# Record count for each distinct 'status' value.
plot_count(df, column='status')

In [None]:
# Record count for each distinct 'ano' value.
plot_count(df, column='ano')

In [None]:
# Record count for each distinct 'ano' value.
# NOTE(review): this repeats the 'ano' count plot of the preceding cell — possibly a leftover; confirm whether another column was intended.
plot_count(df, column='ano')

In [None]:
# Histogram of 'campus' with a KDE overlay, drawn on a wide figure
# (height 8, aspect 2.5) with vertical tick labels.
sns.displot(data=df['campus'], kde=True, height=8, aspect=2.5)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Box plot of 'nascimento' for each 'status', with the y axis limited to 1940-2022.
ax = sns.boxplot(data=df, x='status', y='nascimento')
plt.xticks(rotation=90)
plt.ylim(1940, 2022)

In [None]:
# Box plot of 'ano' for each 'status'.
# x and y are passed by keyword: since seaborn 0.12 only `data` may be given
# positionally, so the two-positional-argument form raises a TypeError there.
ax = sns.boxplot(x=df['status'], y=df['ano'])
plt.xticks(rotation=90);

In [None]:
# Box plot of 'ano' for each 'modalidade'.
ax = sns.boxplot(data=df, x='modalidade', y='ano')
plt.xticks(rotation=90);

In [None]:
# Processed columns whose name contains 'status' / 'campus' (substring match).
status = [nome for nome in df_posprocessing.columns if 'status' in nome]
campus = [nome for nome in df_posprocessing.columns if 'campus' in nome]

# From this cell on, figures default to 14x7 inches.
sns.set(rc={"figure.figsize": (14, 7)})

# Correlation heatmap restricted to the status and campus columns.
sns.heatmap(df_posprocessing.loc[:, status + campus].corr());

In [None]:
# Processed columns whose name contains 'status' / 'modalidade' (substring match).
status = [nome for nome in df_posprocessing.columns if 'status' in nome]
modalidade = [nome for nome in df_posprocessing.columns if 'modalidade' in nome]

# Correlation heatmap restricted to the status and modalidade columns.
sns.heatmap(df_posprocessing.loc[:, status + modalidade].corr());

In [None]:
# Bar plot of 'ano' per 'status', with the y axis limited to 2005-2025.
ax = sns.barplot(data=df, x="status", y="ano")
plt.xticks(rotation=90)
plt.ylim(2005, 2025)

In [None]:
# Processed columns whose name contains 'status' (substring match).
status = [nome for nome in df_posprocessing.columns if 'status' in nome]

# Correlation heatmap of 'ano' and 'nascimento' against the status columns.
sns.heatmap(df_posprocessing.loc[:, ['ano', 'nascimento'] + status].corr());

In [None]:
# Split the records by modality: 'À Distância' versus everything else.
eh_distancia = df['modalidade'] == 'À Distância'
distancia = df[eh_distancia]
presencial = df[~eh_distancia]

In [None]:
# Record count per 'ano' within the `distancia` subset.
plot_count(distancia, column='ano')

In [None]:
# Record count per 'ano' within the `presencial` subset.
plot_count(presencial, column='ano')

In [None]:
# Record count per 'status' within the `distancia` subset.
plot_count(distancia, column='status')

In [None]:
# Record count per 'status' within the `presencial` subset.
plot_count(presencial, column='status')

In [None]:
# Box plot of 'ano' for each 'campus'.
# x and y are passed by keyword: since seaborn 0.12 only `data` may be given
# positionally, so the two-positional-argument form raises a TypeError there.
ax = sns.boxplot(x=df['campus'], y=df['ano'])
plt.xticks(rotation=90);

In [None]:
# Count of records per campus among the rows whose 'ano' equals 2015.
# NOTE(review): the original comment referred to 2013 while the filter selects 2015 — confirm which year is intended.
plot_count(df[df['ano'] == 2015], 'campus')

In [None]:
def entrada_campus(df, campus):
    """Plot the distribution of 'ano' for the rows of a single campus.

    Parameters
    ----------
    df : DataFrame
        Frame with at least the 'campus' and 'ano' columns.
    campus : value compared against the 'campus' column
        Only rows whose 'campus' equals this value are plotted.
    """
    do_campus = df['campus'] == campus
    anos = df.loc[do_campus, 'ano']
    # Histogram with a KDE overlay on a wide figure, tick labels rotated.
    sns.displot(anos, kde=True, height=8, aspect=2.5)
    plt.xticks(rotation=90)
    plt.show()

In [None]:
# Distribution of 'ano' for the 'Câmpus Goiânia' rows.
entrada_campus(df, campus='Câmpus Goiânia')

In [None]:
# Distribution of 'ano' for the 'Câmpus Anápolis' rows.
entrada_campus(df, campus='Câmpus Anápolis')

In [None]:
# Keep only the rows whose 'status' is 'Evasão'.
evasao = df.loc[df['status'].eq('Evasão')]

In [None]:
# Number of non-missing values in each column of the `evasao` subset.
evasao.count()

In [None]:
# Record count per 'renda' code within the `evasao` subset.
plot_count(evasao, column='renda')

In [None]:
# Record count per 'renda' code over the whole frame, for comparison.
plot_count(df, column='renda')

In [None]:
# Show the lookup list: the position of each entry is the integer code stored in 'renda'.
classes

In [None]:
# Fit a TF-IDF model on the 'curso' strings and list the learned vocabulary.
corpus = df['curso']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=corpus)
vectorizer.get_feature_names_out()

In [None]:
# Cluster the TF-IDF vectors of the course names.
# NOTE(review): the model uses one cluster fewer than the number of distinct
# 'curso' values; the rationale is not stated — confirm.
n_clusters = len(df['curso'].unique())
kmeans = KMeans(n_clusters=n_clusters - 1, random_state=0)
kmeans = kmeans.fit(X)
kmeans.labels_