In [44]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [45]:
np.random.seed(42)

num_samples = 500

hours_studied = np.random.uniform(1, 10, num_samples)  
practice_tests_taken = np.random.randint(0, 10, num_samples)
attendance_rate = np.random.uniform(0, 1, num_samples)  
sleep_hours = np.random.uniform(4, 10, num_samples)  


exam_score = (0.4 * hours_studied +
              2 * practice_tests_taken +
              30 * attendance_rate +
              5 * sleep_hours +
            np.random.normal(0, 5, num_samples))


data = pd.DataFrame({
    'Hours_Studied': hours_studied,
    'Practice_Tests_Taken': practice_tests_taken,
    'Attendance_Rate': attendance_rate,
    'Sleep_Hours': sleep_hours,
    'Exam_Score': exam_score
})


print(data.head())

data.to_csv('exam_scores_dataset.csv', index=False)


   Hours_Studied  Practice_Tests_Taken  Attendance_Rate  Sleep_Hours  \
0       4.370861                     5         0.799916     7.940420   
1       9.556429                     0         0.178545     5.951140   
2       7.587945                     8         0.652746     8.640839   
3       6.387926                     0         0.238183     4.785242   
4       2.404168                     4         0.099441     9.818926   

   Exam_Score  
0   74.755647  
1   32.813122  
2   80.776639  
3   29.374262  
4   58.136923  


In [46]:
data = pd.read_csv("exam_scores_dataset.csv")
df = pd.DataFrame(data)
df

Unnamed: 0,Hours_Studied,Practice_Tests_Taken,Attendance_Rate,Sleep_Hours,Exam_Score
0,4.370861,5,0.799916,7.940420,74.755647
1,9.556429,0,0.178545,5.951140,32.813122
2,7.587945,8,0.652746,8.640839,80.776639
3,6.387926,0,0.238183,4.785242,29.374262
4,2.404168,4,0.099441,9.818926,58.136923
...,...,...,...,...,...
495,4.180170,7,0.592951,7.390132,77.283444
496,6.252905,6,0.163524,8.133312,49.395066
497,1.699612,7,0.391082,9.239937,88.300721
498,9.769553,5,0.969412,7.817748,87.359216


In [47]:
df.isnull().sum()

Hours_Studied           0
Practice_Tests_Taken    0
Attendance_Rate         0
Sleep_Hours             0
Exam_Score              0
dtype: int64

In [48]:
X = df.iloc[:,0:4]
y = df.iloc[:,-1]
X

Unnamed: 0,Hours_Studied,Practice_Tests_Taken,Attendance_Rate,Sleep_Hours
0,4.370861,5,0.799916,7.940420
1,9.556429,0,0.178545,5.951140
2,7.587945,8,0.652746,8.640839
3,6.387926,0,0.238183,4.785242
4,2.404168,4,0.099441,9.818926
...,...,...,...,...
495,4.180170,7,0.592951,7.390132
496,6.252905,6,0.163524,8.133312
497,1.699612,7,0.391082,9.239937
498,9.769553,5,0.969412,7.817748


In [49]:
x_train,x_test, y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=8)

In [50]:
lr = LinearRegression()
lr.fit(x_train,y_train)

In [51]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [52]:
y_pred = lr.predict(x_test)
y_pred

array([80.67742464, 57.71127913, 51.21126441, 55.27373958, 57.12186437,
       31.91308009, 54.96429302, 78.09920659, 80.4569892 , 64.35023232,
       77.63453769, 72.98387472, 46.13753986, 78.20422638, 52.87592601,
       51.06156751, 69.75176757, 62.35683477, 46.99476333, 72.37238706,
       82.01166884, 59.25354971, 75.71700991, 62.05454972, 43.74427964,
       55.37458086, 54.19282734, 66.73775615, 43.48170421, 57.12697705,
       65.98844195, 62.56064966, 33.10981968, 80.48479749, 41.13029411,
       50.85901716, 57.33609323, 52.67511292, 71.69451563, 47.47268038,
       56.66550098, 67.90671952, 59.46472895, 37.23239767, 64.04419523,
       61.9580252 , 89.48896581, 70.88851754, 41.16160878, 54.7274882 ,
       73.30545022, 65.22581014, 48.94154557, 50.86170307, 77.60352956,
       55.30799494, 64.25211352, 60.10857069, 65.30248217, 69.5744378 ,
       48.62149641, 58.05477905, 54.16774541, 71.91598371, 51.37050452,
       78.20468839, 53.68424341, 66.57401529, 60.62949437, 53.69

In [53]:
y_test.values

array([84.14704419, 65.60483973, 49.93708636, 64.6827848 , 52.65689069,
       32.39919153, 48.58393182, 76.10545927, 81.0666768 , 65.67790293,
       81.03236283, 74.36920173, 37.99171484, 79.73957065, 44.56055625,
       47.35704215, 75.30851685, 68.45676701, 46.44776529, 77.31394354,
       79.49181104, 63.8368552 , 74.75564701, 67.70930566, 34.84669773,
       55.32170021, 59.50409964, 64.28734494, 54.3790717 , 54.324372  ,
       72.70922411, 72.50043556, 27.52197531, 78.1013492 , 45.20490381,
       54.70525436, 59.45000337, 52.7965827 , 76.44541237, 41.50279567,
       62.80706086, 52.57717147, 67.34897164, 32.57104753, 73.26100747,
       63.08048831, 89.74753092, 71.06807577, 43.68088986, 60.43995291,
       73.91233354, 62.67055118, 45.93466201, 53.08767816, 83.91708322,
       53.05434844, 55.56940249, 59.54618616, 56.57618054, 69.9598959 ,
       38.86101993, 59.25291635, 58.87864737, 76.08811335, 47.39745173,
       72.92471777, 53.44977623, 67.12280272, 64.30529349, 57.92

In [54]:
print("MAE",mean_absolute_error(y_test,y_pred))

MAE 3.894098960506995


In [55]:
print("MAE",mean_absolute_error(y_test,y_pred))

MAE 3.894098960506995


In [56]:
print('MSE', r2_score(y_test,y_pred))
r2 = r2_score(y_test,y_pred)
r2

MSE 0.8668571851527561


0.8668571851527561

In [57]:
import pickle
with open('dataset.pkl', 'wb') as file:
    pickle.dump(lr, file)

print("Dataset saved to 'dataset.pkl'")

Dataset saved to 'dataset.pkl'
