In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [2]:
# Importamos la base de datos
df = pd.read_csv('student_performance.csv')
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [3]:
# Normalizamos los datos
df.loc[df['Extracurricular Activities'] == 'Yes', 'Extracurricular Activities'] = 1
df.loc[df['Extracurricular Activities'] == 'No', 'Extracurricular Activities'] = 0
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [4]:
# Información de los datos
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [5]:
# Descripción de los datos
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,6.5306,4.5833,55.2248
std,2.589309,17.343152,1.695863,2.867348,19.212558
min,1.0,40.0,4.0,0.0,10.0
25%,3.0,54.0,5.0,2.0,40.0
50%,5.0,69.0,7.0,5.0,55.0
75%,7.0,85.0,8.0,7.0,71.0
max,9.0,99.0,9.0,9.0,100.0


## Entrenanamiento - Todos las variables

In [6]:
# Separamos la información
X = df.drop('Performance Index', axis=1)
y = df['Performance Index']

In [7]:
# Entrenamos el modelo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)


## Evaluación de resultados

In [9]:
# Evaluación de predicción
y_pred = regr.predict(X_test)

In [10]:
# Valores relaes vs. predicción
pd.DataFrame({
    'Actual Performance': y_test,
    'Predicted Performance': y_pred
})

Unnamed: 0,Actual Performance,Predicted Performance
6252,51.0,54.711854
4684,20.0,22.615513
1731,46.0,47.903145
4742,28.0,31.289767
4521,41.0,43.004570
...,...,...
6412,45.0,46.886280
8285,66.0,62.698025
7853,16.0,16.793420
1095,65.0,63.343274


In [11]:
print('Coeficientes: \n', [regr.intercept_, regr.coef_])

# Calculamos el error cuadrado medio
print('Error cuadrado medio: %.2f' % mean_squared_error(y_test, y_pred))

# Calculamos el coeficiente de determinación: 1 es predicción perfecta
print('Coeficiente de determinación: %.2f' % r2_score(y_test, y_pred))


Coeficientes: 
 [-33.92194621555638, array([2.85248393, 1.0169882 , 0.60861668, 0.47694148, 0.19183144])]
Error cuadrado medio: 4.08
Coeficiente de determinación: 0.99


## Entrenanamiento - Rendimiento previo & Horas de estudio

In [12]:
# Separamos la información
X = df.drop(columns=['Performance Index', 'Sample Question Papers Practiced', 'Sleep Hours'])
y = df['Performance Index']

In [13]:
# Entrenamos el modelo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)

## Evaluación de resultados

In [15]:
# Evaluación de predicción
y_pred = regr.predict(X_test)

In [16]:
# Valores relaes vs. predicción
pd.DataFrame({
    'Actual Performance': y_test,
    'Predicted Performance': y_pred
})

Unnamed: 0,Actual Performance,Predicted Performance
6252,51.0,54.534875
4684,20.0,23.137766
1731,46.0,47.599961
4742,28.0,30.499601
4521,41.0,43.973217
...,...,...
6412,45.0,46.978845
8285,66.0,61.850357
7853,16.0,17.033805
1095,65.0,63.061879


In [17]:
print('Coeficientes: \n', [regr.intercept_, regr.coef_])

# Calculamos el error cuadrado medio
print('Error cuadrado medio: %.2f' % mean_squared_error(y_test, y_pred))

# Calculamos el coeficiente de determinación: 1 es predicción perfecta
print('Coeficiente de determinación: %.2f' % r2_score(y_test, y_pred))


Coeficientes: 
 [-29.949601677254073, array([2.85778552, 1.0173268 , 0.5747638 ])]
Error cuadrado medio: 5.14
Coeficiente de determinación: 0.99
