In [3]:
# Importing Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
# Download the UCI "student" archive, unpack it into the working directory,
# and read the maths CSV into a DataFrame named `data`.
import urllib.request
import zipfile

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip"
archive_path = "student.zip"
dataset_path = "student-mat.csv"

# Download the archive next to the script/notebook.
urllib.request.urlretrieve(url, archive_path)

# Extract every member of the archive into the current directory.
# The method is ZipFile.extractall(); `extract.all` does not exist and
# raised AttributeError before any data could be loaded.
with zipfile.ZipFile(archive_path, "r") as zip_ref:
    zip_ref.extractall(".")

# Load the data into a DataFrame; the file is read with ';' as the delimiter.
data = pd.read_csv(dataset_path, sep=";")
print("Data loaded successfully!")

# Step 2: Data Exploration
print(data.head())       # Display the first few rows
print("\nDataset Info:")
# DataFrame.info() writes its report (dtypes, non-null counts) to stdout
# itself and returns None, so wrapping it in print() emitted a stray "None".
data.info()

# Step 3: Data Cleaning
# Count missing values per column
print("\nMissing Values:")
print(data.isnull().sum())

# Remove fully duplicated rows (the original index labels are kept)
data = data.drop_duplicates()

# Step 4: Data Analysis
# Every question below is about the final grade column, so pull it out once.
final_grades = data['G3']

# Question 1: mean of the final grade (G3)
average_score = final_grades.mean()
print(f"\nAverage Math Score (G3): {average_score:.2f}")

# Question 2: count of students whose final grade is strictly above 15
students_above_15 = int((final_grades > 15).sum())
print(f"Number of students scoring above 15: {students_above_15}")

# Question 3: correlation between study time and final grade
# (Series.corr with its default method)
correlation = data['studytime'].corr(final_grades)
print(f"Correlation between study time and final grade: {correlation:.2f}")

# Question 4: mean final grade per value of the 'sex' column
average_grade_by_gender = final_grades.groupby(data['sex']).mean()
print("\nAverage Final Grade by Gender:")
print(average_grade_by_gender)

# Step 5: Data Visualization
# Histogram showing how the final grades (G3) are distributed, in 10 bins
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(data['G3'], bins=10, color='skyblue', edgecolor='black')
ax.set_title("Distribution of Final Grades (G3)")
ax.set_xlabel("Final Grade")
ax.set_ylabel("Frequency")
plt.show()

# Scatter plot: study time on x, final grade on y, points coloured by 'sex'
# NOTE(review): the x-axis label says "hours" — confirm that 'studytime' is
# really measured in hours and is not a coded category in this dataset.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=data, x='studytime', y='G3', hue='sex', ax=ax)
ax.set_title("Study Time vs Final Grade")
ax.set_xlabel("Study Time (hours)")
ax.set_ylabel("Final Grade")
ax.legend(title="Gender")
plt.show()

# Bar chart comparing the mean final grade of each gender group
fig, ax = plt.subplots(figsize=(8, 5))
# The two colours are applied to the bars in the order of the grouped index.
average_grade_by_gender.plot(kind='bar', color=['blue', 'pink'], ax=ax)
ax.set_title("Average Final Grade by Gender")
ax.set_ylabel("Average Final Grade")
ax.set_xlabel("Gender")
# Keep the category labels horizontal so they are easy to read.
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(0)
plt.show()



ModuleNotFoundError: No module named 'pandas'