<a href="https://colab.research.google.com/github/Addychauhan/Data-Analysis/blob/main/Student_Placement_Package_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing the required Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

# Load the Dataset

In [None]:
# Read the placement dataset from the Colab working directory into a DataFrame.
csv_path = "/content/updated_placement.csv"
data = pd.read_csv(csv_path)

In [None]:
# Render the whole DataFrame as this cell's output.
data

In [None]:
# Peek at the first three rows.
data.head(n=3)

In [None]:
# Peek at the last three rows.
data.tail(n=3)

In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
data.isna().sum()

# Basic Exploration

In [None]:
# Column names, dtypes, non-null counts and memory usage.
info_heading = "Dataset Information\n"
print(info_heading)
data.info()

In [None]:
# Summary statistics of the dataset, rendered as the cell output.
describe_heading = "Dataset Description\n"
print(describe_heading)
data.describe()

# Data Visualization

In [None]:
# Scatter plot of CGPA (x-axis) against package (y-axis).
cgpa_values = data['cgpa']
package_values = data['package']

plt.figure(figsize=(12, 10))
plt.scatter(cgpa_values, package_values)
plt.ylabel('Package')
plt.xlabel('CGPA')
plt.show()

In [None]:
# Draw histograms of the DataFrame's columns on a 10x8 inch figure, then display them.
data.hist(figsize=(10,8))
plt.show()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# numeric_only=True keeps this cell working if the CSV contains any
# non-numeric column (names, IDs, ...): pandas >= 2.0 raises in that case
# instead of silently dropping such columns. With an all-numeric frame the
# result is identical to plain data.corr().
data.corr(numeric_only=True)

# Splitting Features and Target

In [None]:
# Feature (CGPA) and target (package) as pandas Series.
x, y = data['cgpa'], data['package']

In [None]:
# Inspect the feature Series (the 'cgpa' column).
x

In [None]:
# Inspect the target Series (the 'package' column).
y

# Splitting Data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Inspect the training features.
x_train

In [None]:
# Inspect the training targets.
y_train

# Model Building

In [None]:
# Create a scikit-learn LinearRegression estimator with its default settings.
model=LinearRegression()

In [None]:
# Display the estimator as the cell output.
model

In [None]:
# Train the model.
# The single CGPA feature is a 1-D Series, so it is turned into an
# (n_samples, 1) column array before being passed to fit().
train_features = np.array(x_train).reshape(-1, 1)
model.fit(train_features, y_train)

# Prediction and Evaluation of the Model

**Prediction of the model**

In [None]:
# Predict packages for the held-out CGPA values (reshaped to one column, as in training).
test_features = np.array(x_test).reshape(-1, 1)
y_pred = model.predict(test_features)

In [None]:
# Inspect the held-out CGPA values.
x_test

In [None]:
# Inspect the packages predicted for the test set.
y_pred

In [None]:
# Inspect the actual packages of the test set for comparison with y_pred.
y_test

**Evaluate Model**

In [None]:
# Mean squared error between the actual and predicted test packages.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

In [None]:
# Mean absolute error between the actual and predicted test packages.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

In [None]:
# Root mean squared error between the actual and predicted test packages.
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse

In [None]:
# Coefficient of determination (R²) of the predictions on the test set.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
r2

In [None]:
# Print the test-set metrics together, one per line.
# Lower MSE/RMSE → predictions close to actual salaries
metric_rows = (
    ("Mean Squared Error (MSE):", mse),
    ("Root MSE (RMSE):", rmse),
    ("Mean Absolute Error (MAE):", mae),
    ("R-squared (R²):", r2),
)
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)

**Cross-Validation**

In [None]:
# 5-fold cross-validation of the model on the training split, scored with R².
# NOTE: cross_val_score expects a 2-dimensional feature array. Passing the
# 1-dimensional x_train Series directly raises an error, which is why it is
# converted with np.array(...).reshape(-1, 1) below.

from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    model,
    np.array(x_train).reshape(-1,1),
    y_train,
    cv=5,
    scoring='r2'
)

# Per-fold R² scores followed by their average.
print(scores)
print("Mean CV R2:", scores.mean())

The cross-validation R2 scores for each fold are [0.91941856, 0.98374798, 0.9860206, 0.95617596, 0.97276137]. The mean R2 score from the cross-validation is 0.9636248920341934. This indicates that the model is performing consistently well across different subsets of the training data, with a high average R2 score. This is a good sign for the model's generalization ability.

# Visualize Predictions and Actual Package

In [None]:
#Plot CGPA vs Package
plt.figure(figsize=(12,10))
plt.scatter(x_train, y_train, label="Training Dataset", color='blue')
plt.scatter(x_test, y_test, label="Actual Test Data", color='green')
plt.scatter(x_test, y_pred, label="Predicted Data", color='red', marker='x', s=100)
plt.xlabel('CGPA')
plt.ylabel("Package")
plt.title('Student Placement Package Record: Actual vs Predicted')
plt.legend()
plt.show()

In [None]:
#Plot CGPA vs Package
plt.figure(figsize=(12,10))
plt.scatter(x_train, y_train, label="Training Dataset", color='blue')
plt.scatter(x_test, y_test, label="Actual Test Data", color='green')
plt.plot(x_test, y_pred, color='red', label="Regression Line")
plt.xlabel('CGPA')
plt.ylabel("Package")
plt.title('Student Placement Package Record: Actual vs Predicted')
plt.legend()
plt.show()

In [None]:
#Visualize Actual vs Predicted Packages
plt.figure(figsize=(10,8))
plt.scatter(y_test, y_pred, color='blue')
plt.plot([y_test.min(), y_test.max()],[y_test.min(), y_test.max()], linestyle='--', color='red')
plt.xlabel('Actual Package')
plt.ylabel('Predicted Package')
plt.title('Actual vs Predicted Packages')
plt.show()

In [None]:
# Visualize Actual vs Predicted Packages by Sample Index
plt.figure(figsize=(12, 8))
plt.plot(range(len(y_test)), y_test, 'o-', label='Actual Package', color='green')
plt.plot(range(len(y_pred)), y_pred, 'x-', label='Predicted Package', color='red')
plt.xlabel('Sample Index')
plt.ylabel('Package')
plt.title('Actual vs Predicted Packages by Sample')
plt.legend()
plt.grid(True)
plt.show()

# Predicted Package of New Student

In [None]:
# Predict the package for a CGPA of 8.5; the input is a 1x1 array (one sample, one feature).
student1 = np.array([[8.5]])
package1 = model.predict(student1)
print(package1)

In [None]:
# Predict the package for a CGPA of 5.6; a nested list (one row, one column) is passed as the 2-D input.
student2=[[5.6]]
package2=model.predict(student2)
print(package2)

In [None]:
# Predict the package for a CGPA of 8.9; the 1-D array gets a feature axis only for the predict call.
student3 = np.array([8.9])
student3_column = student3[:, np.newaxis]
package3 = model.predict(student3_column)
print(package3)

In [None]:
# Predict the package for a CGPA of 9.4 (1x1 array: one sample, one feature).
student4=np.array([[9.4]])
package4=model.predict(student4)
print(package4)

In [None]:
# Predict the package for a CGPA of 6.7 (1x1 array: one sample, one feature).
student5=np.array([[6.7]])
package5=model.predict(student5)
print(package5)

# Saving the model

In [None]:
import joblib

# Persist the fitted estimator to disk in the current working directory.
model_file = 'package_predictor.joblib'
joblib.dump(model, model_file)

In [None]:
# Reload the saved estimator and display it to confirm the file can be read back.
# NOTE: joblib files are pickle-based; only load files you created or trust.
restored_model = joblib.load('package_predictor.joblib')
restored_model

✅ Conclusion
This project demonstrates how Linear Regression can be used to predict student placement packages based on CGPA. The model performs well and can be used for future predictions.