<style>
  h3 {
    color: green;
    font-weight: bold;
  }
</style>

### Author: Chetand777

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

#### 1) Data Understanding & Preprocessing

In [2]:
# Reading the data from url
url = 'https://raw.githubusercontent.com/AdiPersonalWorks/Random/master/student_scores%20-%20student_scores.csv' 
data = pd.read_csv(url)
data.head()

Unnamed: 0,Hours,Scores
0,2.5,21
1,5.1,47
2,3.2,27
3,8.5,75
4,3.5,30


In [3]:
# Inspecting the data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Hours   25 non-null     float64
 1   Scores  25 non-null     int64  
dtypes: float64(1), int64(1)
memory usage: 532.0 bytes


In [4]:
# Checking for null values present
data.isnull().sum()

Hours     0
Scores    0
dtype: int64

In [5]:
# Checking the statistical distribution of data
data.describe()

Unnamed: 0,Hours,Scores
count,25.0,25.0
mean,5.012,51.48
std,2.525094,25.286887
min,1.1,17.0
25%,2.7,30.0
50%,4.8,47.0
75%,7.4,75.0
max,9.2,95.0


* Data available for 25 students
* Avg hrs studied by students are 5
* Avg marks scored by students is 51
* Min marks scored by students is 17
* Max marks scored by students is 95

In [6]:
# Checking the shape of data
data.shape

(25, 2)

#### 2) EDA

In [7]:
fig = px.scatter(data, x='Hours', y='Scores', opacity=1, size=data['Hours'], color=data['Hours'])  # Adjust size as needed

fig.update_layout(
    title='Scatter plot of Hours vs Scores',
    xaxis_title='Hours Studied',
    yaxis_title='Scores',
    width=800,
    height=500
)

fig.show()

* Positive correlation between predictor and target variable

#### 3) Model Building & Training

In [8]:
# Splitting the data into train-test 
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Splitting the data into train-test
X = data[['Hours']]
y = data[['Scores']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Training the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predicting the results
y_pred = model.predict(X_test)

X_test_flat = X_test.values.flatten()
y_test_flat = y_test.values.flatten()
y_pred_flat = y_pred.reshape(-1, 1).flatten()

# Plotting the scatter plot with regression line
fig = px.scatter(x=X_test_flat, y=y_test_flat, size=X_test_flat, color=X_test_flat, labels={'x':'Independent Var (X)', 'y':'Target Var (y)'})

fig.add_trace(
    go.Scatter(
        x=X_test_flat,
        y=y_pred_flat,
        mode='lines',
        name='Regression Line (Prediction)',
        line=dict(color='red')
    )
)

fig.update_layout(
    title='Scatter plot of with regression line',
    xaxis_title='Independent Var (X_test)',
    yaxis_title='Target Var (y_test)',
    width=800,
    height=500
)

fig.show()

In [9]:
# Create a df to stotre actual and predicted values
df = pd.DataFrame({'Actual': y_test_flat, 'Predicted': y_pred_flat})
df

Unnamed: 0,Actual,Predicted
0,81,83.188141
1,30,27.032088
2,21,27.032088
3,76,69.633232
4,62,59.951153


In [10]:
# Plotting bar chart to visualize
fig = go.Figure()

fig.add_trace(go.Bar(
    x=df.index,
    y=df['Actual'],
    name='Actual',
    marker_color='blue'
))

fig.add_trace(go.Bar(
    x=df.index,
    y=df['Predicted'],
    name='Predicted',
    marker_color='red'
))

fig.update_layout(
    title='Actual vs Predicted Values',
    xaxis_title='Index',
    yaxis_title='Scores',
    barmode='group',
    width=800,
    height=500,
    xaxis=dict(
        gridcolor='red',
        gridwidth=0.5
    ),
    yaxis=dict(
        gridcolor='red',
        gridwidth=0.5,
        minor=dict(
            gridcolor='blue',
            gridwidth=0.5
        )
    )
)

fig.show()

#### 4) Model Evaluation

In [11]:
# Evaluating model's performance 
from sklearn.metrics import *
print(f'Mean Squared Error: {mean_squared_error(y_test_flat, y_pred_flat)}')
print(f'Mean Absolute Error: {mean_absolute_error(y_test_flat, y_pred_flat)}')
print(f'R2 Score: {r2_score(y_test_flat, y_pred_flat)}')

Mean Squared Error: 18.943211722315272
Mean Absolute Error: 3.9207511902099244
R2 Score: 0.9678055545167994


In [12]:
# Testing model's performance on own data
hours = 9.25
test = np.array([hours]).reshape(-1,1)
own_pred = model.predict(test)
print(f'No of hours studied: {hours}')
print(f'Marks Scored: {own_pred}')

No of hours studied: 9.25
Marks Scored: [[92.38611528]]



X does not have valid feature names, but LinearRegression was fitted with feature names



<style>
  h3 {
    color: green;
    font-weight: bold;
  }
</style>

### The low MSE and MAE values, combined with a high R2 score, suggest that the linear regression model performs very well. It accurately predicts the scores based on the hours studied, with minimal error and a high proportion of explained variance.