In [29]:
# Import the required modules
import pandas as pd
from pathlib import Path
import hvplot.pandas


In [30]:
# Read in the CSV file as a Pandas DataFrame.
# The path is relative to the notebook's working directory.
file_path = 'DATA_SET/Cleaned_data.csv'
df = pd.read_csv(file_path)

# Show the column labels to confirm what was loaded.
# (A bare `df.head()` used to sit above this line; only the last expression
# of a cell is rendered, so it displayed nothing and has been removed.)
df.columns

Index(['Marital status', 'Daytime/evening attendance',
       'Previous qualification', 'Nationality', 'Mother qualification',
       'Father qualification', 'Tuition fees up to date', 'Gender',
       'Scholarship holder', 'International', 'Age at enrollment',
       'Curricular units 1st sem (grade)', 'Curricular units 2nd sem (grade)',
       'Target'],
      dtype='object')

In [31]:
# Preview the first five rows of the loaded DataFrame
df.head(n=5)

Unnamed: 0,Marital status,Daytime/evening attendance,Previous qualification,Nationality,Mother qualification,Father qualification,Tuition fees up to date,Gender,Scholarship holder,International,Age at enrollment,Curricular units 1st sem (grade),Curricular units 2nd sem (grade),Target
0,1,1,1,1,13,10,1,1,0,0,20,0.0,0.0,Dropout
1,1,1,1,1,1,3,0,1,0,0,19,14.0,13.666667,Graduate
2,1,1,1,1,22,27,0,1,0,0,19,0.0,0.0,Dropout
3,1,1,1,1,23,27,1,0,0,0,20,13.428571,12.4,Graduate
4,2,0,1,1,22,28,1,0,0,0,45,12.333333,13.0,Graduate


In [32]:
# Review the info: prints a summary of the DataFrame that lists, for each
# column, its non-null count and dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Marital status                    4424 non-null   int64  
 1   Daytime/evening attendance        4424 non-null   int64  
 2   Previous qualification            4424 non-null   int64  
 3   Nationality                       4424 non-null   int64  
 4   Mother qualification              4424 non-null   int64  
 5   Father qualification              4424 non-null   int64  
 6   Tuition fees up to date           4424 non-null   int64  
 7   Gender                            4424 non-null   int64  
 8   Scholarship holder                4424 non-null   int64  
 9   International                     4424 non-null   int64  
 10  Age at enrollment                 4424 non-null   int64  
 11  Curricular units 1st sem (grade)  4424 non-null   float64
 12  Curric

In [33]:
# Count how many rows fall into each category of the "Target" column
df.loc[:, "Target"].value_counts()

Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

In [34]:
# One-hot encode the "Target" column as integer 0/1 indicator columns
Target_dummies = pd.get_dummies(data=df["Target"], dtype=int)

# Preview the encoded columns
Target_dummies.head(n=5)

Unnamed: 0,Dropout,Enrolled,Graduate
0,1,0,0
1,0,0,1
2,1,0,0
3,0,0,1
4,0,0,1


In [35]:
# Swap the categorical "Target" column for its one-hot encoded columns.
# The guard keeps this cell safe to re-run: once "Target" is gone the dummies
# have already been attached, so a second run must neither concatenate them
# again (duplicate columns) nor drop "Target" again (KeyError).
if "Target" in df.columns:
    df = pd.concat([df.drop(columns=["Target"]), Target_dummies], axis=1)

# Display the DataFrame
df.head()

Unnamed: 0,Marital status,Daytime/evening attendance,Previous qualification,Nationality,Mother qualification,Father qualification,Tuition fees up to date,Gender,Scholarship holder,International,Age at enrollment,Curricular units 1st sem (grade),Curricular units 2nd sem (grade),Dropout,Enrolled,Graduate
0,1,1,1,1,13,10,1,1,0,0,20,0.0,0.0,1,0,0
1,1,1,1,1,1,3,0,1,0,0,19,14.0,13.666667,0,0,1
2,1,1,1,1,22,27,0,1,0,0,19,0.0,0.0,1,0,0
3,1,1,1,1,23,27,1,0,0,0,20,13.428571,12.4,0,0,1
4,2,0,1,1,22,28,1,0,0,0,45,12.333333,13.0,0,0,1


In [36]:
# Import the module
from sklearn.preprocessing import StandardScaler

In [37]:
# Columns that get standardized with StandardScaler
columns_to_scale = [
    "Mother qualification",
    "Father qualification",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (grade)",
]

# Fit the scaler on those columns and transform them in a single step
scaler = StandardScaler()
info_data_scaled = scaler.fit_transform(df[columns_to_scale])

# Review the scaled data (an array with one column per entry listed above)
info_data_scaled

array([[ 0.07511091, -0.58452612, -2.19710239, -1.96348862],
       [-1.25449497, -1.2183802 ,  0.6935986 ,  0.65956171],
       [ 1.07231532,  0.9548338 , -2.19710239, -1.96348862],
       ...,
       [ 1.07231532,  0.9548338 ,  0.88201036,  0.62757329],
       [ 1.07231532,  0.9548338 ,  0.65230287,  0.33967752],
       [ 1.18311581,  0.9548338 ,  0.2118151 ,  0.53160803]])

In [38]:
# Columns that were standardized, in the same order as the scaled data's columns
scaled_columns = [
    "Mother qualification",
    "Father qualification",
    "Curricular units 1st sem (grade)",
    "Curricular units 2nd sem (grade)",
]

# Create a DataFrame of the scaled data.
# Reusing df's index makes the row alignment of the assignments below
# explicit, instead of relying on df carrying a default RangeIndex
# (a mismatched index would silently fill the columns with NaN).
info_data_scaled = pd.DataFrame(
    info_data_scaled,
    columns=scaled_columns,
    index=df.index,
)

# Replace the original data with the corresponding scaled columns
for column in scaled_columns:
    df[column] = info_data_scaled[column]

# Review the DataFrame
df.head()

Unnamed: 0,Marital status,Daytime/evening attendance,Previous qualification,Nationality,Mother qualification,Father qualification,Tuition fees up to date,Gender,Scholarship holder,International,Age at enrollment,Curricular units 1st sem (grade),Curricular units 2nd sem (grade),Dropout,Enrolled,Graduate
0,1,1,1,1,0.075111,-0.584526,1,1,0,0,20,-2.197102,-1.963489,1,0,0
1,1,1,1,1,-1.254495,-1.21838,0,1,0,0,19,0.693599,0.659562,0,0,1
2,1,1,1,1,1.072315,0.954834,0,1,0,0,19,-2.197102,-1.963489,1,0,0
3,1,1,1,1,1.183116,0.954834,1,0,0,0,20,0.575611,0.41645,0,0,1
4,2,0,1,1,1.072315,1.045384,1,0,0,0,45,0.349468,0.531608,0,0,1


In [39]:
# Import the KMeans module from SKLearn
from sklearn.cluster import KMeans

In [40]:
# Candidate cluster counts to evaluate: 1 through 10 inclusive
k = [*range(1, 11)]
# Empty list that will collect one inertia value per candidate k
inertia = []

In [41]:
# Evaluate the K-means algorithm for each candidate value of k.
# Each model is fitted on `df`, the preprocessed DataFrame built above, and
# the value of its `inertia_` attribute is appended to `inertia`.
# NOTE(review): `df` appears to still hold the one-hot target columns and the
# unscaled "Age at enrollment" at this point — confirm they are meant to be
# clustering features.

# Start from an empty list so that re-running this cell does not append a
# second set of values (which would no longer match the length of `k`).
inertia = []
for i in k:
    # n_init is spelled out (10, the default reported in the recorded warning
    # output) so the result does not depend on the library's default and the
    # warning is no longer emitted.
    k_model = KMeans(n_clusters=i, n_init=10, random_state=0)
    k_model.fit(df)
    inertia.append(k_model.inertia_)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [42]:
# Pair every candidate k with the inertia recorded for it
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(data=elbow_data)

# Preview the first rows of the elbow data
df_elbow.head(n=5)

Unnamed: 0,k,inertia
0,1,362543.253617
1,2,167703.355168
2,3,113366.354013
3,4,82180.09002
4,5,71265.327575


In [43]:
# Draw the elbow curve: inertia as a function of k, with one x tick per tested k
elbow_plot_options = {
    "x": "k",
    "y": "inertia",
    "title": "Elbow Curve",
    "xticks": k,
}
df_elbow.hvplot.line(**elbow_plot_options)

In [46]:
# Define the model with 4 clusters.
# random_state=0 matches the elbow loop and makes the cluster labels
# reproducible between runs; n_init=10 is set explicitly (the default reported
# in the recorded warning output) so the result does not depend on the
# library's default.
model = KMeans(n_clusters=4, n_init=10, random_state=0)

# Fit the model and get the cluster label assigned to every row
k_4 = model.fit_predict(df)

# Create a copy of the preprocessed data
info_predictions_df = df.copy()

# Store the labels in their own column. Writing them into "Dropout" would
# overwrite the one-hot encoded Dropout indicator with cluster ids.
info_predictions_df["Cluster"] = k_4

# Plot the clusters: age at enrollment for each cluster, colored by cluster
info_predictions_df.hvplot.scatter(
    x="Cluster",
    y="Age at enrollment",
    by="Cluster"
)