In [33]:
import pandas as pd
import plotly as plt
import plotly.express as px
import sklearn as sc
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import plotly.graph_objects as go

## Diákok teljesítményének becslése

hivatkozás: https://www.kaggle.com/datasets/lainguyn123/student-performance-factors?select=StudentPerformanceFactors.csv

In [34]:
# Load the student-performance CSV; header=0 takes the column names from the first row.
df = pd.read_csv("data/StudentPerformanceFactors.csv", header=0)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68


In [35]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0
mean,19.975329,79.977448,7.02906,75.070531,1.493719,2.96761,67.235659
std,5.990594,11.547475,1.46812,14.399784,1.23057,1.031231,3.890456
min,1.0,60.0,4.0,50.0,0.0,0.0,55.0
25%,16.0,70.0,6.0,63.0,1.0,2.0,65.0
50%,20.0,80.0,7.0,75.0,1.0,3.0,67.0
75%,24.0,90.0,8.0,88.0,2.0,4.0,69.0
max,44.0,100.0,10.0,100.0,8.0,6.0,101.0


## Adatfeltárás

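A lineáris (Pearson) és a monoton (Spearman) korrelációk kirajzolása adatfeltárás céljából. (Az alábbi cella jelenleg csak a Spearman-mátrixot rajzolja ki; a Pearson-változat ki van kommentelve.)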

In [36]:
# List the columns whose dtype is int64 or float64.
df.select_dtypes(include=['int64', 'float64']).columns

Index(['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores',
       'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score'],
      dtype='object')

In [37]:
# Identify columns by dtype
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

# Correlations are computed on the numeric columns only. Take an explicit copy so
# df_encoded is an independent frame and later edits to it cannot touch df.
df_encoded = df[numerical_cols].copy()

# No scaling step: Spearman correlation is rank-based, so standardizing the
# columns would not change the result.

# Compute correlation matrices
# pearson_corr = df_encoded.corr(method='pearson')
spearman_corr = df_encoded.corr(method='spearman')

# Function to plot correlation matrix
def plot_correlation_matrix(corr_matrix, title):
    """Draw a correlation matrix as an annotated heatmap and display it.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square table of correlation coefficients; its index and columns
        supply the tick labels.
    title : str
        Figure title.

    Returns
    -------
    None
        The figure is shown via ``fig.show()`` and not returned.
    """
    heatmap = go.Heatmap(
        z=corr_matrix.values,
        x=corr_matrix.columns,
        y=corr_matrix.index,
        colorscale='RdBu',
        # Fix the colour range to the full correlation interval.
        zmin=-1,
        zmax=1,
        colorbar={"title": "Correlation"},
    )
    fig = go.Figure(data=heatmap)

    # Write every coefficient (two decimals) on top of its cell, row by row.
    for row_label, row_values in corr_matrix.iterrows():
        for col_label, coefficient in row_values.items():
            fig.add_annotation(
                x=col_label,
                y=row_label,
                text=f"{coefficient:.2f}",
                showarrow=False,
                font={"size": 18, "color": "black"},  # Bigger font size
            )

    layout_options = {
        "title": title,
        "xaxis_title": "Features",
        "yaxis_title": "Features",
        "xaxis_showgrid": False,
        "yaxis_showgrid": False,
        "xaxis": {"scaleanchor": 'y', "ticks": ''},  # Square aspect
        "yaxis": {"ticks": ''},
        "margin": {"l": 80, "r": 80, "t": 100, "b": 80},
        "width": 1200,
        "height": 1200,
        "font": {"size": 24},  # Larger overall font
    }
    fig.update_layout(**layout_options)

    fig.show()

# Plot the matrices (the Pearson plot stays disabled together with pearson_corr above)
# plot_correlation_matrix(pearson_corr, "Pearson Correlation Matrix")
# The input columns are not scaled in this cell, so the title does not claim normalization.
plot_correlation_matrix(spearman_corr, "Spearman Correlation Matrix")

## Lineáris regresszió használata a várható vizsgaeredmények becslésére

In [38]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

In [39]:
# Re-read the same CSV file that was loaded at the top of the notebook.
df = pd.read_csv("data/StudentPerformanceFactors.csv")

In [40]:
# Split the column names by dtype: numeric (int64/float64) vs. categorical (object/bool).
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

In [41]:
# Copy the loaded frame so this section works on its own object.
df_scaled = df.copy()

# Predictors are all numeric columns except the target column.
features = numerical_cols.drop('Exam_Score')  # exclude target from predictors
X, y = df_scaled[features], df_scaled['Exam_Score']

# Only instantiated here; the scaler is fitted in the following cell.
scaler = StandardScaler()

In [42]:
# Standardize the predictors, rebinding X to the scaled result.
# NOTE(review): the scaler is fitted on every row before the train/test split in the
# next cell, so test rows contribute to the scaling statistics — consider fitting on
# the training split only. Re-running this cell also re-fits on the already scaled X.
X = scaler.fit_transform(X)

In [43]:
# Hold out 20% of the rows for testing; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [44]:
# Fit a linear regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [45]:
# Score the model on the held-out split: R² and mean squared error.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

In [46]:
# Pair each fitted coefficient with its feature name and sort by signed value, largest first.
# X was standardized, so each coefficient is the change in Exam_Score per one
# standard deviation of that feature.
feature_importance = pd.Series(model.coef_, index=features).sort_values(ascending=False)

In [47]:
# Report the hold-out metrics (three decimals) followed by the sorted coefficients.
print(f"R² Score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")
print("\nFeature Importance:")
print(feature_importance)

R² Score: 0.642
Mean Squared Error: 5.065

Feature Importance:
Attendance           2.293932
Hours_Studied        1.732763
Previous_Scores      0.694174
Tutoring_Sessions    0.627378
Physical_Activity    0.155811
Sleep_Hours         -0.049207
dtype: float64


In [48]:
# Intercept of the fitted model.
model.intercept_

np.float64(67.23534320551069)

In [49]:
# Fitted coefficients, unsorted (the order of the columns the model was trained on).
model.coef_

array([ 1.73276276,  2.29393233, -0.04920665,  0.69417406,  0.62737833,
        0.15581097])

### Órai részvétel vs vizsgaeredmények

In [50]:
# Evenly spaced attendance values spanning the observed range.
attendance_range = np.linspace(df['Attendance'].min(), df['Attendance'].max(), 100)

# Standardize with the mean/std the fitted scaler learned from the data. The model
# was trained on features scaled this way; using the grid's own mean/std would put
# the line on a different scale than the coefficients.
attendance_idx = features.get_loc('Attendance')
attendance_range_standard = (
    (attendance_range - scaler.mean_[attendance_idx]) / scaler.scale_[attendance_idx]
)

# All other predictors are held at their mean (0 after standardization), so only
# the Attendance term and the intercept remain.
y_pred_line = attendance_range_standard * feature_importance['Attendance'] + model.intercept_

In [51]:
# Scatter of the raw observations.
axis_labels = {
    'Attendance': 'Attendance (%)',
    'Exam_Score': 'Exam Score',
}
fig = px.scatter(
    df,
    x='Attendance',
    y='Exam_Score',
    title='Scatter Plot: Attendance vs Exam Score with Regression Line',
    labels=axis_labels,
)

# Overlay the precomputed regression line; hovering shows the predicted score.
regression_line = go.Scatter(
    x=attendance_range,
    y=y_pred_line,
    mode='lines',
    line={'color': 'red'},
    hovertext=[f"Predicted Score: {predicted:.2f}" for predicted in y_pred_line],
)
fig.add_trace(regression_line)

fig.update_layout(showlegend=False, width=600, height=600)
fig.show()

### Tanulással töltött órák vs vizsgaeredmények

In [52]:
# Evenly spaced study-hour values spanning the observed range.
hours_studied_range = np.linspace(df['Hours_Studied'].min(), df['Hours_Studied'].max(), 100)

# Standardize with the mean/std the fitted scaler learned from the data. The model
# was trained on features scaled this way; using the grid's own mean/std would put
# the line on a different scale than the coefficients.
hours_studied_idx = features.get_loc('Hours_Studied')
hours_studied_range_standard = (
    (hours_studied_range - scaler.mean_[hours_studied_idx]) / scaler.scale_[hours_studied_idx]
)

# All other predictors are held at their mean (0 after standardization), so only
# the Hours_Studied term and the intercept remain.
y_pred_line = hours_studied_range_standard * feature_importance['Hours_Studied'] + model.intercept_


In [53]:
# Scatter of the raw observations.
axis_labels = {
    'Hours_Studied': 'Hours_Studied',
    'Exam_Score': 'Exam Score',
}
fig = px.scatter(
    df,
    x='Hours_Studied',
    y='Exam_Score',
    title='Scatter Plot: Hours_Studied vs Exam Score with Regression Line',
    labels=axis_labels,
)

# Overlay the precomputed regression line; hovering shows the predicted score.
regression_line = go.Scatter(
    x=hours_studied_range,
    y=y_pred_line,
    mode='lines',
    line={'color': 'red'},
    hovertext=[f"Predicted Score: {predicted:.2f}" for predicted in y_pred_line],
)
fig.add_trace(regression_line)

fig.update_layout(showlegend=False, width=600, height=600)
fig.show()

### Alvás vs. vizsgaeredmények (kevésbé befolyásoló jellemző)

In [54]:
# Evenly spaced sleep-hour values spanning the observed range.
sleep_range = np.linspace(df['Sleep_Hours'].min(), df['Sleep_Hours'].max(), 100)

# Standardize with the mean/std the fitted scaler learned from the data. The model
# was trained on features scaled this way; using the grid's own mean/std would put
# the line on a different scale than the coefficients.
sleep_idx = features.get_loc('Sleep_Hours')
sleep_range_standard = (sleep_range - scaler.mean_[sleep_idx]) / scaler.scale_[sleep_idx]

# All other predictors are held at their mean (0 after standardization), so only
# the Sleep_Hours term and the intercept remain.
y_pred_line = sleep_range_standard * feature_importance['Sleep_Hours'] + model.intercept_

In [55]:
# Scatter of the raw observations.
fig = px.scatter(
    df,
    x='Sleep_Hours',
    y='Exam_Score',
    title='Scatter Plot: Sleep_Hours vs Exam Score with Regression Line',
    labels={
        # The key must name the plotted column for the label mapping to apply.
        'Sleep_Hours': 'Sleep_Hours',
        'Exam_Score': 'Exam Score'
    }
)

# Overlay the precomputed regression line; hovering shows the predicted score.
fig.add_trace(go.Scatter(
    x=sleep_range,
    y=y_pred_line,
    mode='lines',
    line=dict(color='red'),
    hovertext=[f"Predicted Score: {score:.2f}" for score in y_pred_line]
))

fig.update_layout(showlegend=False,
        width=600,
        height=600,)
fig.show()

### Most skálázás nélkül

In [56]:
# Split the column names by dtype: numeric (int64/float64) vs. categorical (object/bool).
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = df.select_dtypes(include=['object', 'bool']).columns

In [57]:
# Copy the loaded frame and keep only its numeric columns.
df_copy = df.copy()[numerical_cols]

In [58]:
# Predictors: every numeric column except the target, left in raw (unscaled) units.
features = numerical_cols.drop('Exam_Score')
X = df_copy[features]
# Take the target from the same frame as X rather than from df_encoded, which is
# built in the correlation cell and would make this section depend on it.
y = df_copy['Exam_Score']

In [59]:
# Same 80/20 split parameters and random_state as in the scaled experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [60]:
# Fit a new linear regression on the unscaled training split; this rebinds `model`.
model = LinearRegression()
model.fit(X_train, y_train)

In [61]:
# Score the unscaled model on the held-out split: R² and mean squared error.
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

In [62]:
# Pair each fitted coefficient with its feature name and sort by signed value, largest first.
# X is unscaled here, so each coefficient is per raw unit of its feature and the
# magnitudes are not directly comparable across features.
feature_importance = pd.Series(model.coef_, index=features).sort_values(ascending=False)

In [63]:
# Report the hold-out metrics (three decimals) followed by the sorted coefficients.
print(f"R² Score: {r2:.3f}")
print(f"Mean Squared Error: {mse:.3f}")
print("\nFeature Importance:")
print(feature_importance)

R² Score: 0.642
Mean Squared Error: 5.065

Feature Importance:
Tutoring_Sessions    0.509866
Hours_Studied        0.289269
Attendance           0.198667
Physical_Activity    0.151104
Previous_Scores      0.048211
Sleep_Hours         -0.033519
dtype: float64
