In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [2]:
# Step 2: Build a small in-memory sample dataset
# A real project would read this from a CSV file; here the 15 student records
# are written out by hand so the notebook has no external data dependency.
data = dict(
    study_hours=[2.5, 5.1, 3.2, 8.5, 6.5, 9.2, 5.5, 8.3, 2.7, 7.7, 5.9, 6.1, 4.5, 3.3, 1.1],
    previous_score=[60, 82, 70, 95, 88, 98, 85, 94, 68, 91, 84, 86, 78, 72, 50],
    final_mark=[65, 85, 72, 96, 90, 99, 88, 95, 69, 94, 87, 88, 81, 75, 52],
)
df = pd.DataFrame(data)

# Show a heading, the first five rows, and trailing blank lines as a separator.
print("--- Original Data ---", df.head(), "\n", sep="\n")

--- Original Data ---
   study_hours  previous_score  final_mark
0          2.5              60          65
1          5.1              82          85
2          3.2              70          72
3          8.5              95          96
4          6.5              88          90




In [3]:
# Step 3: Separate the inputs from the value we want to predict
# X holds the two predictor columns; y holds the final mark to be predicted.
feature_columns = ['study_hours', 'previous_score']
target_column = 'final_mark'

X = df[feature_columns]
y = df[target_column]

In [4]:
# Step 4: Hold out part of the data for evaluation
# 20% of the rows are reserved as a test set and the remaining 80% are used
# for training. The fixed random_state makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [5]:
# Step 5: Scale the features
# Only the (unfitted) scaler object is created here. It is created after the
# train/test split so that it can be fitted on the training rows alone, which
# keeps test-set statistics from leaking into the scaling.
scaler = StandardScaler()

In [6]:
# Learn the scaling statistics from the training rows only...
scaler.fit(X_train)

# ...then apply that same fitted scaling to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [9]:
# Step 6: Create the Linear Regression model and fit it in one expression
# (`fit` returns the estimator itself, so `model` is the trained regressor).
# Training uses the scaled training features and the matching final marks.
model = LinearRegression().fit(X_train_scaled, y_train)

print("--- Model Training Complete ---")
print("The model has learned the relationship between the features and the final mark.\n")


--- Model Training Complete ---
The model has learned the relationship between the features and the final mark.



In [10]:
# Step 7: Make predictions on the test set
# The model was fitted on the training split only, so these are predictions for
# rows it has never seen. The scaled test features are used because the model
# was trained on scaled inputs.
y_pred = model.predict(X_test_scaled)


In [14]:
# Step 8: Evaluate the model's performance on the held-out test set
print("--- Model Evaluation ---")
mae = mean_absolute_error(y_test, y_pred)
# mean_squared_error is called without `squared=False`, so it returns the MSE
# (squared units), not the RMSE. The result is therefore named `mse`; take its
# square root if an RMSE in the units of `final_mark` is needed.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²) Score: {r2:.2f}")
print("\n")

--- Model Evaluation ---
Mean Absolute Error (MAE): 1.68
Mean Squared Error (MSE): 4.44
R-squared (R²) Score: 0.97




In [15]:
# Step 9: Use the trained model to predict on new, unseen data
print("--- Predicting on a New Student ---")
# Let's say a new student studied for 7 hours and had a previous score of 92.
new_study_hours = 7.0
new_previous_score = 92

# Build the row as a DataFrame with the same column names (and order) as the
# training features. The scaler was fitted on a DataFrame, and passing a bare
# NumPy array makes scikit-learn warn that the input "does not have valid
# feature names".
new_student_data = pd.DataFrame(
    {'study_hours': [new_study_hours], 'previous_score': [new_previous_score]}
)

# We MUST apply the same (already fitted) scaling to this new data
new_student_data_scaled = scaler.transform(new_student_data)

# Predict the final mark
predicted_mark = model.predict(new_student_data_scaled)

# Report the inputs from the same variables that fed the model, so the printed
# values cannot drift from the values actually used for the prediction.
print(f"Data for new student: Study Hours={new_study_hours}, Previous Score={new_previous_score}")
print(f"Predicted Final Mark: {predicted_mark[0]:.2f}")

--- Predicting on a New Student ---
Data for new student: Study Hours=7.0, Previous Score=92
Predicted Final Mark: 94.25


