In [5]:
from google.colab import files
uploaded = files.upload()

Saving StudentPerformance.csv to StudentPerformance (1).csv


In [23]:
!pip install -q langchain_community replicate pandas matplotlib seaborn scikit-learn

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from langchain_community.llms import Replicate
from google.colab import userdata
import textwrap
import os

try:
    api_token = userdata.get('api_project_token')
    if not api_token:
        raise ValueError("Secret 'api_project_token' is empty!")
    if not api_token.startswith('r8_'):
        raise ValueError("Token does not start with 'r8_' — invalid Replicate token.")
    print("SUCCESS: Token loaded from 'api_project_token'")
    print(f"   Length: {len(api_token)} | Preview: {api_token[:10]}...")
except Exception as e:
    print("ERROR: Failed to load 'api_project_token'")
    print("   → Go to: Left Panel → Key Icon (Secrets) → Add secret:")
    print("     Name:  api_project_token")
    print("     Value: r8_your_actual_token_here")
    raise e

# Set environment variable (backup)
os.environ["REPLICATE_API_TOKEN"] = api_token
GRANITE_MODEL = "ibm-granite/granite-3.3-8b-instruct"

llm = Replicate(
    model=GRANITE_MODEL,
    replicate_api_token=api_token,
    temperature=0.2,
    max_new_tokens=512
)
print("SUCCESS: IBM Granite model initialized.")
df = pd.read_csv('StudentPerformance.csv')

print("\n--- DATA HEAD ---")
print(df.head())

print("\n--- DATA INFO ---")
print(df.info())

print("\n--- DATA DESCRIPTION ---")
print(df.describe())

print("\n--- MISSING VALUES ---")
print(df.isnull().sum())

print("\nGenerating EDA plots...")
sns.set_theme(style="whitegrid")

# Plot 1: Performance Index Distribution
plt.figure(figsize=(10, 6))
sns.histplot(df['Performance Index'], kde=True, bins=30, color='skyblue')
plt.title('Distribution of Performance Index')
plt.xlabel('Performance Index')
plt.ylabel('Frequency')
plt.tight_layout()
plt.savefig("performance_index_distribution.png")
plt.close()
print("   → Saved: performance_index_distribution.png")

# Plot 2: Correlation Heatmap
df_corr = df.copy()
df_corr['Extracurricular Activities'] = df_corr['Extracurricular Activities'].map({'Yes': 1, 'No': 0})
corr_matrix = df_corr.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of All Features')
plt.tight_layout()
plt.savefig("correlation_heatmap.png")
plt.close()
print("   → Saved: correlation_heatmap.png")

# Plot 3: Hours Studied vs Performance
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Hours Studied', y='Performance Index', alpha=0.7, color='teal')
sns.regplot(data=df, x='Hours Studied', y='Performance Index', scatter=False, color='red')
plt.title('Hours Studied vs. Performance Index')
plt.xlabel('Hours Studied')
plt.ylabel('Performance Index')
plt.tight_layout()
plt.savefig("hours_studied_vs_performance.png")
plt.close()
print("   → Saved: hours_studied_vs_performance.png")

# Plot 4: Previous Scores vs Performance
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Previous Scores', y='Performance Index', alpha=0.7, color='purple')
sns.regplot(data=df, x='Previous Scores', y='Performance Index', scatter=False, color='red')
plt.title('Previous Scores vs. Performance Index')
plt.xlabel('Previous Scores')
plt.ylabel('Performance Index')
plt.tight_layout()
plt.savefig("previous_scores_vs_performance.png")
plt.close()
print("   → Saved: previous_scores_vs_performance.png")

# Plot 5: Extracurricular Activities vs Performance
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='Extracurricular Activities', y='Performance Index', palette='Set2')
plt.title('Performance by Extracurricular Activities')
plt.xlabel('Extracurricular Activities')
plt.ylabel('Performance Index')
plt.tight_layout()
plt.savefig("extracurricular_vs_performance.png")
plt.close()
print("   → Saved: extracurricular_vs_performance.png")

print("All 5 EDA plots saved successfully!\n")

eda_prompt = """
You are a friendly data science teaching assistant.
Five EDA plots were created from a student performance dataset:

1. Histogram + KDE of Performance Index
2. Correlation heatmap (all features, including Extracurricular Activities as 0/1)
3. Scatter plot + trend line: Hours Studied vs Performance Index
4. Scatter plot + trend line: Previous Scores vs Performance Index
5. Boxplot: Extracurricular Activities (Yes/No) vs Performance Index

Write 3-4 short, simple sentences that a high school student can understand.
Highlight the two strongest relationships you expect from the plots.
"""

print("Generating EDA summary with IBM Granite...")
try:
    eda_summary = llm.invoke(eda_prompt)
    print("\n=== IBM Granite – EDA Summary ===")
    print(textwrap.fill(eda_summary.strip(), width=80))
    print("=== End of Summary ===\n")
except Exception as e:
    print("Failed to generate EDA summary:", str(e))

print("--- Building Predictive Model ---")

# Preprocess: Convert categorical to numeric
df_processed = pd.get_dummies(df, columns=['Extracurricular Activities'], drop_first=True)

# Features and target
X = df_processed.drop('Performance Index', axis=1)
y = df_processed['Performance Index']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training on {len(X_train)} samples, testing on {len(X_test)} samples.")

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\n--- Model Evaluation ---")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R²) Score: {r2:.2f}")

model_prompt = f"""
You are a teaching assistant explaining results to a school principal.
A linear regression model predicts student Performance Index.

- MSE = {mse:.2f} (lower = better predictions)
- R² = {r2:.2f} (closer to 1 = better fit)
- Features used: {', '.join(X.columns)}

Write exactly 2 short, clear sentences:
1. How accurate and reliable is this model?
2. Which single factor matters the most for student performance?
"""

print("Generating model explanation with IBM Granite...")
try:
    model_explain = llm.invoke(model_prompt)
    print("\n=== IBM Granite – Model Explanation ===")
    print(textwrap.fill(model_explain.strip(), width=80))
    print("=== End of Explanation ===\n")
except Exception as e:
    print("Failed to generate model explanation:", str(e))

print("PROJECT COMPLETE!")
print("   • 5 EDA plots saved")
print("   • Linear model: MSE =", round(mse, 2), "| R² =", round(r2, 3))
print("   • IBM Granite summaries generated (if token valid)")
print("\nCheck the Files panel (left) to download your plots!")

                    Please confirm that temperature is what you intended.
                    Please confirm that max_new_tokens is what you intended.


SUCCESS: Token loaded from 'api_project_token'
   Length: 40 | Preview: r8_O8wrU5o...
SUCCESS: IBM Granite model initialized.

--- DATA HEAD ---
   Hours Studied  Previous Scores Extracurricular Activities  Sleep Hours  \
0              7               99                        Yes            9   
1              4               82                         No            4   
2              8               51                        Yes            7   
3              5               52                        Yes            5   
4              7               75                         No            8   

   Sample Question Papers Practiced  Performance Index  
0                                 1               91.0  
1                                 2               65.0  
2                                 2               45.0  
3                                 2               36.0  
4                                 5               66.0  

--- DATA INFO ---
<class 'pandas.core.frame.DataF


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(data=df, x='Extracurricular Activities', y='Performance Index', palette='Set2')


   → Saved: extracurricular_vs_performance.png
All 5 EDA plots saved successfully!

Generating EDA summary with IBM Granite...
Failed to generate EDA summary: ReplicateError Details:
title: Unauthenticated
status: 401
detail: You did not pass a valid authentication token
--- Building Predictive Model ---
Training on 8000 samples, testing on 2000 samples.

--- Model Evaluation ---
Mean Squared Error (MSE): 4.08
R-squared (R²) Score: 0.99
Generating model explanation with IBM Granite...
Failed to generate model explanation: ReplicateError Details:
title: Unauthenticated
status: 401
detail: You did not pass a valid authentication token
PROJECT COMPLETE!
   • 5 EDA plots saved
   • Linear model: MSE = 4.08 | R² = 0.989
   • IBM Granite summaries generated (if token valid)

Check the Files panel (left) to download your plots!
