# Eksplorasi dan Visualisasi Data

Notebook ini melakukan eksplorasi dasar pada hasil survei mahasiswa.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [None]:
# Load the survey output file into the notebook-wide DataFrame `df`.
df = pd.read_csv('cluster_sentiment_output.csv')
# Report (rows, columns) so the size of the dataset is visible up front.
print(df.shape)
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

In [None]:
# Print the column names, non-null counts and dtypes of the loaded data.
df.info()

In [None]:
# Respondent counts per study programme.
print('Program Studi')
print(df['Program Studi'].value_counts())
# Respondent counts per cohort year; the leading '\n' escape prints a blank
# line so the two tables are visually separated.
print('\nAngkatan')
print(df['Angkatan'].value_counts())

## Menghitung rata-rata tiap mata kuliah

In [None]:

# Question columns are recognised by the course tag embedded in their header.
fisika_cols = [name for name in df.columns if '[Fisika' in name]
kimia_cols = [name for name in df.columns if '[Kimia' in name]
biologi_cols = [name for name in df.columns if '[Biologi' in name]

# Target average column -> the question columns that feed it.
course_columns = {
    'avg_fisika': fisika_cols,
    'avg_kimia': kimia_cols,
    'avg_biologi': biologi_cols,
}
for avg_name, cols in course_columns.items():
    # Values that cannot be parsed as numbers become NaN (errors='coerce').
    df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')
    # Row-wise mean over that course's question columns.
    df[avg_name] = df[cols].mean(axis=1)

# Preview the three derived average columns.
df[list(course_columns)].head()

In [None]:

# Mean of the per-course averages within each study programme, as grouped bars.
score_columns = ['avg_fisika','avg_kimia','avg_biologi']
mean_by_study = df.groupby('Program Studi')[score_columns].mean()
ax = mean_by_study.plot(kind='bar')
ax.set_ylabel('Rata-rata Skor')
ax.set_title('Rata-rata Skor Likert per Program Studi')
plt.show()

In [None]:

# Number of rows per cluster label, sorted by label, drawn as a bar chart.
cluster_counts = df['cluster'].value_counts().sort_index()
ax = cluster_counts.plot(kind='bar')
ax.set_xlabel('Cluster')
ax.set_ylabel('Jumlah Responden')
ax.set_title('Distribusi Cluster')
plt.show()

In [None]:

# Histogram of the `sentiment` column using 20 bins.
ax = df['sentiment'].hist(bins=20)
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Frekuensi')
ax.set_title('Distribusi Sentiment')
plt.show()

## Korelasi Antar Pertanyaan Likert

In [None]:

# Correlation between Likert questions.
# A Likert column is any header containing '[' that is not a 'Pernah' column.
likert_cols = [name for name in df.columns if '[' in name and 'Pernah' not in name]
likert_df = df[likert_cols].apply(pd.to_numeric, errors='coerce')
likert_corr = likert_df.corr()

plt.figure(figsize=(12,10))
# center=0 anchors the diverging colormap at zero correlation.
sns.heatmap(likert_corr, cmap='coolwarm', center=0)
plt.title('Korelasi Pertanyaan Likert')
plt.show()


## Profil Tiap Cluster

In [None]:

# Average per-course score within each cluster, shown as grouped bars.
profile_columns = ['avg_fisika','avg_kimia','avg_biologi']
cluster_profile = df.groupby('cluster')[profile_columns].mean()
ax = cluster_profile.plot(kind='bar')
ax.set_ylabel('Rata-rata Skor')
ax.set_title('Profil Cluster berdasarkan Mata Kuliah')
plt.show()


## Distribusi Cluster menurut Status Pernah Mengambil Mata Kuliah

In [None]:

# Cluster distribution split by the value of each `Pernah_<course>` column.
for course in ('Fisika','Kimia','Biologi'):
    col = f'Pernah_{course}'
    # Rows: values of the Pernah_* column; columns: cluster labels.
    ctab = pd.crosstab(df[col], df['cluster'])
    ax = ctab.plot(kind='bar', stacked=True)
    ax.set_title(f'Distribusi Cluster vs {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Jumlah')
    plt.show()


## Boxplot Sentimen

In [None]:

# Sentiment box plots, one figure per grouping column.
# Each entry: (grouping column, figure title, x-tick rotation or None).
boxplot_specs = (
    ('Program Studi', 'Sebaran Sentimen per Program Studi', 45),
    ('Angkatan', 'Sebaran Sentimen per Angkatan', None),
)
for group_col, title, tick_rotation in boxplot_specs:
    plt.figure(figsize=(12,6))
    sns.boxplot(x=group_col, y='sentiment', data=df)
    plt.title(title)
    # Only the study-programme plot has its tick labels rotated.
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()
