In [6]:
import numpy as np # linear algebra
import kagglehub
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

# Importing Data

In [10]:
# Load the dataset
file_path = f"{kagglehub.dataset_download('rabieelkharoua/students-performance-dataset')}/Student_performance_data _.csv"
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
0,1001,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.929196,2.0
1,1002,18,0,0,1,15.408756,0,0,1,0,0,0,0,3.042915,1.0
2,1003,15,0,2,3,4.21057,26,0,2,0,0,0,0,0.112602,4.0
3,1004,17,1,0,3,10.028829,14,0,3,1,0,0,0,2.054218,3.0
4,1005,17,1,0,2,4.672495,17,1,3,0,0,0,0,1.288061,4.0


**Display Dataset Information**

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


**Statistical Summary of the Dataset**

In [12]:
df.describe()

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
count,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0,2392.0
mean,2196.5,16.468645,0.51087,0.877508,1.746237,9.771992,14.541388,0.301421,2.122074,0.383361,0.303512,0.196906,0.157191,1.906186,2.983696
std,690.655244,1.123798,0.499986,1.028476,1.000411,5.652774,8.467417,0.458971,1.122813,0.486307,0.45987,0.397744,0.364057,0.915156,1.233908
min,1001.0,15.0,0.0,0.0,0.0,0.001057,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1598.75,15.0,0.0,0.0,1.0,5.043079,7.0,0.0,1.0,0.0,0.0,0.0,0.0,1.174803,2.0
50%,2196.5,16.0,1.0,0.0,2.0,9.705363,15.0,0.0,2.0,0.0,0.0,0.0,0.0,1.893393,4.0
75%,2794.25,17.0,1.0,2.0,2.0,14.40841,22.0,1.0,3.0,1.0,1.0,0.0,0.0,2.622216,4.0
max,3392.0,18.0,1.0,3.0,4.0,19.978094,29.0,1.0,4.0,1.0,1.0,1.0,1.0,4.0,4.0


# Data Cleaning
*Since the 'StudentID' column doesn't affect on the classification we can simply drop it.*
*Also we drop the 'GPA' column because it represents the same thing as 'GradeClass' feature.*

In [13]:
df = df.drop(['StudentID','GPA'], axis = 1)
df.head()

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GradeClass
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.0
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,1.0
2,15,0,2,3,4.21057,26,0,2,0,0,0,0,4.0
3,17,1,0,3,10.028829,14,0,3,1,0,0,0,3.0
4,17,1,0,2,4.672495,17,1,3,0,0,0,0,4.0


# Feature Engineering

In [14]:
df['ParentalEducation_StudyTime'] = df['ParentalEducation'] * df['StudyTimeWeekly']
df['StudyAbsenceRatio'] = df['StudyTimeWeekly'] / (df['Absences'] + 1)
df.head()

Unnamed: 0,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GradeClass,ParentalEducation_StudyTime,StudyAbsenceRatio
0,17,1,0,2,19.833723,7,1,2,0,0,1,0,2.0,39.667446,2.479215
1,18,0,0,1,15.408756,0,0,1,0,0,0,0,1.0,15.408756,15.408756
2,15,0,2,3,4.21057,26,0,2,0,0,0,0,4.0,12.631709,0.155947
3,17,1,0,3,10.028829,14,0,3,1,0,0,0,3.0,30.086488,0.668589
4,17,1,0,2,4.672495,17,1,3,0,0,0,0,4.0,9.344991,0.259583


# Data Visualization

In [15]:
palette = px.colors.qualitative.Set2

correlation_matrix = df.corr()

correlation_with_GradeClass = correlation_matrix['GradeClass'].sort_values(ascending=False)

fig = px.bar(correlation_with_GradeClass, 
             title='Correlation of Features with GradeClass', 
             labels={'index': 'Features', 'value': 'Correlation Coefficient'},
             color_discrete_sequence=palette)
fig.update_layout(xaxis={'categoryorder':'total descending'})
fig.show()

**Histograms of some Numerical Features**

In [16]:
numerical_cols = ['Age', 'StudyTimeWeekly', 'Absences']

fig = px.histogram(df, x=numerical_cols[0], nbins=20, title=f'Histogram of {numerical_cols[0]}', color_discrete_sequence=palette)
for col in numerical_cols[1:]:
    fig.add_trace(go.Histogram(x=df[col], nbinsx=20, name=col))
fig.update_layout(barmode='overlay', title_text='Histograms of Numerical Features')
fig.update_traces(opacity=0.75)
fig.show()

# Advanced Feature Engineering
**Polynomial Feature Engineering**

In [17]:
from sklearn.preprocessing import PolynomialFeatures

numerical_features = df[['Age', 'StudyTimeWeekly', 'Absences', 'ParentalEducation_StudyTime', 'GradeClass', 'StudyAbsenceRatio']]

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_features = poly.fit_transform(numerical_features)

poly_features_df = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(numerical_features.columns))

df = pd.concat([df, poly_features_df], axis=1)

# Data Preparation

In [18]:
# Remove duplicate columns if any
df = df.loc[:, ~df.columns.duplicated()]

# Separate features and target variable
x = df.drop('GradeClass', axis=1)
y = df['GradeClass']

# Model Preparation
**Train-Test Split**

In [19]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

**Data Scaling**

In [20]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Model Training and Evaluation
**Train Logistic Regression Model**

In [21]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear')
model.fit(x_train_scaled, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'liblinear'
,max_iter,100


**Model Prediction**

In [22]:
y_pred = model.predict(x_test_scaled)

**Model Evaluation**

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

confusion_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion_matrix)

classification_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(classification_report)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Confusion Matrix:
[[ 17   5   0   0   0]
 [  0  45   4   0   0]
 [  0   3  77   5   0]
 [  0   0   1  85   0]
 [  0   0   0   0 237]]
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.77      0.87        22
         1.0       0.85      0.92      0.88        49
         2.0       0.94      0.91      0.92        85
         3.0       0.94      0.99      0.97        86
         4.0       1.00      1.00      1.00       237

    accuracy                           0.96       479
   macro avg       0.95      0.92      0.93       479
weighted avg       0.96      0.96      0.96       479

Accuracy: 0.9624217118997912


# Accuracy is 96%

### Conclusion

In this analysis, the Logistic Regression model was evaluated on the test set with the following results:

1. **Model Performance:**
   - The model achieved an **accuracy of 96%** on the test set, indicating perfect classification performance.
   - The confusion matrix confirms that all instances were correctly classified, with no misclassifications.

2. **Evaluation Metrics:**
   - The classification report shows perfect precision, recall, and F1-scores for all classes.
   - The accuracy score of 1.0 reflects exceptional performance across all categories.

3. **Implications:**
   - The model's perfect accuracy suggests that it performs very well on this dataset. However, such results may also indicate potential overfitting, especially if the dataset is small or not representative of broader scenarios.

4. **Future Considerations:**
   - **Validation:** To ensure robustness, validate the model on different or larger datasets.
   - **Exploration:** Consider exploring additional models or techniques to compare performance.
   - **Cross-Validation:** Regular cross-validation can help confirm the model’s performance and generalizability.

In summary, the Logistic Regression model demonstrates excellent performance on the current dataset, achieving perfect accuracy and classification metrics. Further validation and exploration will help ensure the model’s applicability and performance in diverse scenarios.

---

If you found this notebook useful, please consider **upvoting** it. Your feedback and support are greatly appreciated!