# Analysis (Option B: sqlite3)
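Load the transformed dataset (`outputs/transformed.csv`), compute descriptive statistics for its numeric columns, plot Marks against Effort_Hours, and fit a simple per-student linear regression to predict marks at 10 hours of effort.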

In [None]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Read the transformed dataset from disk into a DataFrame.
data_path = Path('outputs/transformed.csv')
df = pd.read_csv(data_path)

# Quick sanity check: preview the first rows, then report the row count.
preview = df.head()
print(preview)
row_count = len(df)
print("\nRows:", row_count)

# Basic descriptive stats
# num_cols holds every column that pandas classifies as numeric.
num_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]
if num_cols:
    display(df[num_cols].describe())
else:
    # describe() raises ValueError on a frame with no columns, so report
    # the situation instead of aborting the rest of the analysis.
    print("No numeric columns to describe.")

# Scatter plot of Marks against Effort_Hours; skipped when either column is absent.
if {'Marks', 'Effort_Hours'}.issubset(df.columns):
    fig, ax = plt.subplots()
    ax.scatter(df['Effort_Hours'], df['Marks'])
    ax.set_xlabel('Effort_Hours')
    ax.set_ylabel('Marks')
    ax.set_title('Marks vs Effort_Hours')
    plt.show()

# Simple predictions for three students at effort = 10 hours
# For each candidate, fit a one-feature linear regression (Marks ~ Effort_Hours)
# on that student's own rows and evaluate it at TARGET_EFFORT.
TARGET_EFFORT = np.array([[10.0]])  # 2-D (1 sample, 1 feature), matching the 2-D X used for fitting
candidates = ['SID20131151','SID20149500','SID20182516']

# The column check does not depend on the student, so evaluate it once.
required_cols = ('Student_ID', 'Marks', 'Effort_Hours')
has_required_cols = all(col in df.columns for col in required_cols)

pred_rows = []
for sid in candidates:
    if not has_required_cols:
        pred_rows.append({'Student_ID': sid, 'Predicted_Marks_at_10h': 'required columns missing'})
        continue

    # Keep only this student's rows that have both an effort and a marks value.
    hist = df[(df['Student_ID'] == sid) & df['Effort_Hours'].notna() & df['Marks'].notna()]
    if len(hist) < 2:
        # Fewer than two observations: report instead of fitting a line.
        pred_rows.append({'Student_ID': sid, 'Predicted_Marks_at_10h': 'insufficient history'})
        continue

    X = hist[['Effort_Hours']].astype(float).values
    y = hist['Marks'].astype(float).values
    model = LinearRegression().fit(X, y)
    # predict() returns a one-element 1-D array here. Index it explicitly:
    # calling float() on an array with ndim > 0 is deprecated since NumPy 1.25.
    pred = float(model.predict(TARGET_EFFORT)[0])
    pred_rows.append({'Student_ID': sid, 'Predicted_Marks_at_10h': round(pred,2)})

pred_df = pd.DataFrame(pred_rows)
display(pred_df)