### 📓 Python for ML (Core) – Day 2
### Focus: Core Python Tools for Data Handling


### 1. NumPy Arrays and Vector Operations

In [1]:
import numpy as np

# Build a one-dimensional array from a plain Python list
arr = np.array([1, 2, 3, 4, 5])
print("Array:", arr)

# Arithmetic with a scalar is applied element by element
print("Addition:", arr + 2)
print("Multiplication:", 3 * arr)

# Inner (dot) product of two length-2 vectors
a = np.array([1, 2])
b = np.array([3, 4])
print("Dot product:", a @ b)

# Broadcasting: the first three elements of `arr` are added to every
# row of a 3x3 matrix of ones
matrix = np.ones(shape=(3, 3))
print("Broadcasting example:\n", arr[:3] + matrix)

Array: [1 2 3 4 5]
Addition: [3 4 5 6 7]
Multiplication: [ 3  6  9 12 15]
Dot product: 11
Broadcasting example:
 [[2. 3. 4.]
 [2. 3. 4.]
 [2. 3. 4.]]


### 2. Pandas: DataFrames and Series

In [5]:
import pandas as pd

# A Series is a one-dimensional labelled array; here the labels are 'a'..'c'
series = pd.Series(data=[10, 20, 30], index=['a', 'b', 'c'])
print("Series:", series, sep="\n")

# A DataFrame is built from a dict that maps column name -> column values
data = {
    'Name': ['Alice', 'Bob', 'Charlie'],
    'Age': [25, 30, 35],
}
df = pd.DataFrame(data=data)
print("\nDataFrame:", df, sep="\n")

# Selecting one column yields a Series
name_column = df['Name']
print("\nSingle Column (Name):", name_column, sep="\n")

# Label-based row lookup; the default index labels are 0, 1, 2
first_row = df.loc[0]
print("\nSingle Row (index 0):", first_row, sep="\n")

# Arithmetic mean of the Age column
average_age = df['Age'].mean()
print("\nAverage Age:", average_age, sep="\n")

# Summary statistics for the numeric columns
summary = df.describe()
print("\nData Description:", summary, sep="\n")


Series:
a    10
b    20
c    30
dtype: int64

DataFrame:
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35

Single Column (Name):
0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

Single Row (index 0):
Name    Alice
Age        25
Name: 0, dtype: object

Average Age:
30.0

Data Description:
        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0


### 3. Data Loading from CSV Files

In [8]:
import pandas as pd

# Load the dataset; the path is resolved relative to the notebook's
# working directory.
df = pd.read_csv("Student_Performance.csv")

print("\nFirst 5 rows:")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called bare: wrapping it in print() would add a stray "None"
# line after the report.
print("\nDataset info:")
df.info()

print("\nDataset shape (rows, columns):")
print(df.shape)

# Fill missing values in ColumnX with its mean.
# The membership guard makes this step a no-op when the frame has no
# column named 'ColumnX'.
# NOTE(review): 'ColumnX' looks like a placeholder name - confirm which
# column (if any) should be mean-imputed for this dataset.
if 'ColumnX' in df.columns:
    df['ColumnX'] = df['ColumnX'].fillna(df['ColumnX'].mean())

# Drop any remaining rows that still contain NaN
df = df.dropna()

print("\nAfter handling missing values:")
df.info()



First 5 rows:
   student_id  age  gender school_type parent_education  study_hours  \
0           1   14    male      public    post graduate          3.1   
1           2   18  female      public         graduate          3.7   
2           3   17  female     private    post graduate          7.9   
3           4   16   other      public      high school          1.1   
4           5   16  female      public      high school          1.3   

   attendance_percentage internet_access travel_time extra_activities  \
0                   84.3             yes     <15 min              yes   
1                   87.8             yes     >60 min               no   
2                   65.5              no     <15 min               no   
3                   58.1              no   15-30 min               no   
4                   61.0             yes   30-60 min              yes   

  study_method  math_score  science_score  english_score  overall_score  \
0        notes        42.7           5

### 4. Basic Statistical Analysis

In [12]:
import pandas as pd

# Work with the `age` column through a single handle
age_col = df['age']

# Central tendency
print("Mean of age:", age_col.mean())          # arithmetic average
print("Median of age:", age_col.median())      # middle value
print("Mode of age:", age_col.mode().values)   # most frequent value(s), as an array

# Dispersion
print("Standard deviation of age:", age_col.std())  # spread around the mean
print("Variance of age:", age_col.var())            # square of the standard deviation

# Pairwise correlation, restricted to the numeric columns
numeric_only = df.select_dtypes(include='number')
print("\nCorrelation matrix (numeric only):")
print(numeric_only.corr())


Mean of age: 16.48276
Median of age: 16.0
Mode of age: [17]
Standard deviation of age: 1.7038952176576208
Variance of age: 2.9032589127565105

Correlation matrix (numeric only):
                       student_id       age  study_hours  \
student_id               1.000000  0.003583     0.003560   
age                      0.003583  1.000000    -0.003722   
study_hours              0.003560 -0.003722     1.000000   
attendance_percentage    0.005273  0.006050    -0.005046   
math_score              -0.004801 -0.005234     0.802321   
science_score            0.005630  0.001638     0.805073   
english_score            0.001506 -0.008716     0.803718   
overall_score           -0.001538 -0.003847     0.905771   

                       attendance_percentage  math_score  science_score  \
student_id                          0.005273   -0.004801       0.005630   
age                                 0.006050   -0.005234       0.001638   
study_hours                        -0.005046    0.802321

### 5. Data Cleaning Workflow

In [14]:
import pandas as pd

# Load data (re-read from disk so the workflow starts from the raw file)
df = pd.read_csv("Student_Performance.csv")

# 1. Remove duplicates (rows that are identical in every column)
df = df.drop_duplicates()

# 2. Handle missing values: per-column fill values, keyed by column name
# NOTE(review): 'Salary' (here) and 'Name' (step 3) do not appear among the
# columns shown in this notebook's printed previews - they look like
# template placeholders; confirm or remove.
df = df.fillna({
    'age': df['age'].mean(),
    'Salary': 0
})

# 3. Rename columns
# Reassign instead of using inplace=True so every step follows the same
# "df = df.<step>()" pattern.
df = df.rename(columns={'Name': 'Full_Name'})

# 4. Convert data types
# Round before casting: a mean-imputed age is a float, and a bare
# astype(int) would truncate it toward zero (e.g. 16.9 -> 16).
df['age'] = df['age'].round().astype(int)

# Quick check
print(df.head())
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (that would emit an extra "None" line).
df.info()


   student_id  age  gender school_type parent_education  study_hours  \
0           1   14    male      public    post graduate          3.1   
1           2   18  female      public         graduate          3.7   
2           3   17  female     private    post graduate          7.9   
3           4   16   other      public      high school          1.1   
4           5   16  female      public      high school          1.3   

   attendance_percentage internet_access travel_time extra_activities  \
0                   84.3             yes     <15 min              yes   
1                   87.8             yes     >60 min               no   
2                   65.5              no     <15 min               no   
3                   58.1              no   15-30 min               no   
4                   61.0             yes   30-60 min              yes   

  study_method  math_score  science_score  english_score  overall_score  \
0        notes        42.7           55.4           5