In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [30]:
from sklearn.preprocessing import LabelEncoder

In [31]:
df = pd.read_csv('/content/Student_Performance.csv')
df.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [32]:
df['Extracurricular Activities'].replace(('Yes', 'No'), (1, 0), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Extracurricular Activities'].replace(('Yes', 'No'), (1, 0), inplace=True)
  df['Extracurricular Activities'].replace(('Yes', 'No'), (1, 0), inplace=True)


In [33]:
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,1,4,2,23.0
9996,7,64,1,8,5,58.0
9997,6,83,1,8,5,74.0
9998,9,97,1,7,0,95.0


In [34]:
df.shape

(10000, 6)

In [35]:
df.describe()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4.9929,69.4457,0.4948,6.5306,4.5833,55.2248
std,2.589309,17.343152,0.499998,1.695863,2.867348,19.212558
min,1.0,40.0,0.0,4.0,0.0,10.0
25%,3.0,54.0,0.0,5.0,2.0,40.0
50%,5.0,69.0,0.0,7.0,5.0,55.0
75%,7.0,85.0,1.0,8.0,7.0,71.0
max,9.0,99.0,1.0,9.0,9.0,100.0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  int64  
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(5)
memory usage: 468.9 KB


In [37]:
df.columns

Index(['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced', 'Performance Index'],
      dtype='object')

In [38]:
X = df.drop('Performance Index', axis=1)
y = df['Performance Index']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [40]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7000, 5)
(3000, 5)
(7000,)
(3000,)


In [44]:
import numpy as np

class MultipleLinearRegression:
    def __init__(self, learning_rate=0.01, iterations=1000):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.weights = None  # Coefficients (m1, m2, ..., mn)
        self.bias = 0  # Intercept (b)

    def fit(self, X, y):
        """
        Train the model using Gradient Descent
        """
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)

        for _ in range(self.iterations):
            y_pred = np.dot(X, self.weights) + self.bias
            error = y_pred - y

            # Compute gradients
            dw = (2 / n_samples) * np.dot(X.T, error)
            db = (2 / n_samples) * np.sum(error)

            # Update parameters
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """
        Make predictions for given input X
        """
        return np.dot(X, self.weights) + self.bias

    def mean_squared_error(self, y_true, y_pred):
        """
        Compute Mean Squared Error (MSE)
        """
        return np.mean((y_true - y_pred) ** 2)

# Example Usage:
if __name__ == "__main__":
    model = MultipleLinearRegression(learning_rate=0.01, iterations=1000)

In [45]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [46]:
y_pred = model.predict(X_test)
mse = model.mean_squared_error(y_test, y_pred)

print("Predictions:", y_pred)
print("Mean Squared Error:", mse)

Predictions: [1315.08732461  888.13995428 1102.14019511 ...  945.06861886 1602.32504531
  864.45002374]
Mean Squared Error: 1675531.076674396


In [47]:
pd.DataFrame({'pred': y_pred})

Unnamed: 0,pred
0,1315.087325
1,888.139954
2,1102.140195
3,849.015473
4,1047.128948
...,...
2995,849.829343
2996,1136.260602
2997,945.068619
2998,1602.325045
