In [None]:
import pandas as pd

# Source CSV, served raw from the project's GitHub repository.
url = "https://raw.githubusercontent.com/Aniket-bit7/learning-analytics-ml-system/main/data/students_data.csv"

# Download and parse the dataset, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=url)
df.head(n=5)

In [None]:
# Show the distinct labels that occur in the target column.
df["Final_Result"].unique()

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
print("Shape:", df.shape)

In [None]:
# Print a per-column summary: dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Student_ID is only a row identifier: it has no predictive meaning and may introduce noise,
# so it is excluded from modelling.
# errors="ignore" keeps this cell safe to re-run: without it, a second execution raises
# KeyError because the column has already been removed from df.
df = df.drop(columns=["Student_ID"], errors="ignore")

In [None]:
# Number of missing entries in each column -->
df.isna().sum()

In [None]:
# checking duplicates -->
# Counts rows that exactly repeat an earlier row (the first occurrence is not flagged).
df.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# visualization part -->
# Plotting libraries used by the chart cells from here onward.
import matplotlib.pyplot as plt
import seaborn as sns

# Draw one histogram per numeric column on a 12x8 inch figure.
df.hist(figsize=(12, 8))
# Re-space the subplots on the current figure so titles and tick labels do not overlap.
plt.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows fall into each Final_Result class.
sns.countplot(data=df, x="Final_Result")
plt.title("Final Result Distribution")
plt.show()

# Exact per-class counts behind the chart (displayed as the cell output).
df["Final_Result"].value_counts()

In [None]:
# Attendance distribution, one box per Final_Result class.
sns.boxplot(data=df, y="Attendance", x="Final_Result")
plt.show()

# Handling Missing Values


In [None]:
# before handling missing values -->
print(df.isnull().sum())

# Mean-impute each column that has gaps.
# The filled Series is assigned back to the frame instead of calling
# df[col].fillna(..., inplace=True): that form operates on a column selection (chained
# assignment), which raises a FutureWarning on pandas 2.x and leaves df unchanged once
# Copy-on-Write is the default behaviour.
df["Quiz2"] = df["Quiz2"].fillna(df["Quiz2"].mean())
df["Time_Spent"] = df["Time_Spent"].fillna(df["Time_Spent"].mean())
df["Attendance"] = df["Attendance"].fillna(df["Attendance"].mean())

# after handling missing values -->
print(df.isnull().sum())

# Encoding and Visualization

In [None]:
# Numeric version of the target: 'Pass' -> 1, 'Fail' -> 0.
# Series.map leaves NaN for any label that is not a key of the dict.
df['Final_Result_Encoded'] = df['Final_Result'].map({'Pass': 1, 'Fail': 0})

# correlation heatmap -->
# Pairwise correlations between the numeric columns, annotated with their values.
corr_matrix = df.corr(numeric_only=True)
plt.figure(figsize=(8, 6))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=True)
plt.show()

# Feature Engineering

In [35]:
# Total quiz score: element-wise sum of the three quiz columns -->
df["Total_Quiz_Score"] = df["Quiz1"].add(df["Quiz2"]).add(df["Quiz3"])

In [36]:
# Average quiz score: the total computed above divided by the number of quizzes (3) -->
df["Average_Quiz_Score"] = df["Total_Quiz_Score"].div(3)

In [38]:
# Quiz consistency: row-wise standard deviation across the three quiz scores -->
# (pandas' default ddof=1, i.e. the sample standard deviation).
df["Quiz_Std"] = df[["Quiz1", "Quiz2", "Quiz3"]].std(axis="columns")