##**Import Required Libraries**

In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import LabelEncoder

This imports libraries for data manipulation (pandas, numpy), machine learning preprocessing (sklearn), and deep learning (PyTorch).

##**Load the Dataset**

In [None]:
# Step 2: Read the Khan Academy YouTube metadata CSV into a DataFrame.
# The path below is absolute; adjust it when the file lives elsewhere.
csv_path = '/content/youtube_khan_academy.csv'
df = pd.read_csv(csv_path)

Loads the dataset containing YouTube video metadata related to Khan Academy.

##**Explore the Dataset**

In [None]:
# Preview the first five rows (5 is also the default row count of head()).
df.head(n=5)

Unnamed: 0,channel_title,videoId,title,title_word_count,published_at,date_diff,view_count,like_count,dislike_count,reaction_total,...,desc_coleman_liau_index,desc_automated_readability_index,desc_dale_chall_readability_score,desc_difficult_words,desc_linsear_write_formula,desc_gunning_fog,desc_text_standard,publish_year,publish_week_month,publish_week_start_date
0,Khan Academy,G5FVxAzlmfM,Weak acid–weak base reactions | Acids and base...,14,2021-08-11T00:42:40Z,3,6444,77,3,80,...,19.73,22.4,9.14,51,16.5,14.65,14th and 15th grade,2021,Aug-21,09/08/2021
1,Khan Academy,78Sg3RdEPVk,Weak base–strong acid reactions | Acids and ba...,14,2021-08-11T00:45:12Z,3,2089,26,0,26,...,18.22,20.0,8.71,53,13.2,13.5,13th and 14th grade,2021,Aug-21,09/08/2021
2,Khan Academy,9rRlLPx6w7M,Worked example: Calculating the pH after a wea...,13,2021-08-11T00:47:08Z,3,1646,33,0,33,...,23.09,24.3,9.76,53,14.2,14.79,14th and 15th grade,2021,Aug-21,09/08/2021
3,Khan Academy,VdAl4QK4-0Q,Weak acid–strong base reactions | Acids and ba...,14,2021-08-11T00:40:00Z,3,2507,87,2,89,...,18.1,19.9,8.71,53,13.2,13.5,13th and 14th grade,2021,Aug-21,09/08/2021
4,Khan Academy,SttbEmGj9uM,Strong acid–strong base reactions | Acids and ...,14,2021-08-11T00:36:37Z,3,2479,48,0,48,...,19.38,21.0,8.94,50,13.4,13.38,13th and 14th grade,2021,Aug-21,09/08/2021


Previews the first five rows to get a feel for the columns and the values they hold.

In [None]:
# Display the column index of the loaded frame to see which fields are available.
df.columns

Index(['channel_title', 'videoId', 'title', 'title_word_count', 'published_at',
       'date_diff', 'view_count', 'like_count', 'dislike_count',
       'reaction_total', 'like_dislike_diff', 'like_percentage',
       'dislike_percentage', 'favorite_count', 'views_per_day',
       'comment_count', 'tokenized', 'live_broadcast_content', 'url',
       'description', 'description_word_count', 'tokenized_description',
       'duration', 'aspect_ratio', 'extract_date', 'channel_id',
       'channel_view_count', 'channel_subscriber_count', 'channel_video_count',
       'title_sentiment_polarity', 'title_sentiment_subjectivity',
       'description_sentiment_polarity', 'description_sentiment_subjectivity',
       'diff_sentiment_polarity', 'diff_sentiment_subjectivity',
       'title_flesch_reading_ease', 'title_smog_index',
       'title_flesch_kincaid_grade', 'title_coleman_liau_index',
       'title_automated_readability_index',
       'title_dale_chall_readability_score', 'title_difficult_

Lists every column name so we can see which fields are available for modeling.


##**Select and Clean Features**

In [None]:
# Model input columns, grouped by theme. The groups are concatenated in a fixed
# order because that order determines the column order of the feature matrix.
engagement_cols = [
    'view_count', 'like_count', 'dislike_count', 'reaction_total',
    'like_dislike_diff', 'like_percentage', 'dislike_percentage',
    'comment_count',
]
channel_cols = [
    'channel_view_count', 'channel_subscriber_count', 'channel_video_count',
]
sentiment_cols = [
    'title_sentiment_polarity', 'description_sentiment_polarity',
]
title_readability_cols = [
    'title_flesch_reading_ease', 'title_smog_index',
    'title_flesch_kincaid_grade',
]
desc_readability_cols = [
    'desc_flesch_reading_ease', 'desc_smog_index', 'desc_flesch_kincaid_grade',
]

selected_features = (
    engagement_cols
    + channel_cols
    + sentiment_cols
    + title_readability_cols
    + desc_readability_cols
)

In [None]:
# Keep only the modeling columns, then discard every row that has a missing
# value in any of them.
df = df.loc[:, selected_features].dropna()

Selects only important features for modeling and removes rows with missing values.

##**Create target label**

In [None]:
def label_difficulty(views, low=5000, high=20000):
    """Bucket a view count into one of three difficulty classes.

    Args:
        views: Number of views of a single video.
        low: Counts strictly below this value are labelled 0 (remedial).
            Defaults to 5000.
        high: Counts strictly above this value are labelled 2 (advanced).
            Defaults to 20000.

    Returns:
        int: 0 (remedial), 1 (standard) or 2 (advanced). Both thresholds are
        exclusive, so ``low`` and ``high`` themselves map to 1.
    """
    if views < low:
        return 0  # remedial
    if views > high:
        return 2  # advanced
    return 1  # standard

# Derive the class label for every row from its view count.
df['label'] = df['view_count'].map(label_difficulty)

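Buckets view_count into three classes, using popularity as a rough proxy for difficulty level: 0 = remedial (fewer than 5,000 views), 1 = standard (5,000 to 20,000 views), 2 = advanced (more than 20,000 views).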

##**Prepare Inputs and Labels**

In [None]:
# The label is computed directly from 'view_count', so keeping that column in
# the inputs would hand the model its own target (target leakage) and inflate
# the reported accuracy. Exclude it from the feature matrix.
feature_columns = [col for col in selected_features if col != 'view_count']

X = df[feature_columns].values
# CrossEntropyLoss expects integer class indices, hence dtype long.
y = torch.tensor(df['label'].values, dtype=torch.long)

Separates the input features and labels for model training.

##**Feature Scaling**

In [None]:
# Standardize the features, then wrap the result in a float32 tensor for the
# network.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics influence the scaling. Consider fitting on the
# training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X = torch.tensor(X_scaled, dtype=torch.float32)

Standardizes the input data to improve model performance.

##**Train-Test Split**

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Splits the data into training and testing sets.


##**Define Neural Network Model**

In [None]:
class YouTubeContentClassifier(nn.Module):
    """Feed-forward classifier: input_dim -> 64 -> 32 -> 3 with ReLU activations.

    The final layer emits one raw score (logit) per class; no softmax is
    applied inside the module.
    """

    def __init__(self, input_dim):
        """Build the layer stack.

        Args:
            input_dim: Number of input features per sample.
        """
        super().__init__()
        hidden_1, hidden_2, n_classes = 64, 32, 3
        layers = [
            nn.Linear(input_dim, hidden_1),
            nn.ReLU(),
            nn.Linear(hidden_1, hidden_2),
            nn.ReLU(),
            nn.Linear(hidden_2, n_classes),  # 3 classes
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits produced by the layer stack for ``x``."""
        logits = self.net(x)
        return logits

# Size the input layer from the number of columns in the feature matrix.
n_features = X.shape[1]
model = YouTubeContentClassifier(n_features)

Creates a simple feed-forward neural network with 3 output classes (difficulty levels).

##**Define Loss Function and Optimizer**

In [None]:
# Cross-entropy over the three class logits, optimized with Adam.
learning_rate = 0.001
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

Sets up the loss function and optimization algorithm for training.



##**Train the model**

In [None]:

# Full-batch training: every epoch runs one forward/backward pass over the
# whole training set.
num_epochs = 50
log_every = 10

for epoch in range(num_epochs):
    # Forward pass and loss on the complete training set
    output = model(X_train)
    loss = criterion(output, y_train)

    # Clear stale gradients, backpropagate, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Report progress at a fixed epoch interval
    if epoch % log_every == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

Epoch 0, Loss: 1.1696
Epoch 10, Loss: 1.0486
Epoch 20, Loss: 0.9265
Epoch 30, Loss: 0.7889
Epoch 40, Loss: 0.6737


Trains the neural network using backpropagation over 50 epochs.



##**Evaluate the model**

In [None]:
# Score the held-out set without tracking gradients.
with torch.no_grad():
    logits = model(X_test)
    preds = torch.argmax(logits, dim=1)  # index of the highest logit per row
    accuracy = preds.eq(y_test).float().mean()
    print(f"Test Accuracy: {accuracy:.2f}")

Test Accuracy: 0.77


Evaluates the model performance on the test dataset.



##**Prepare for User Input Simulation**

In [None]:
# Placeholder values standing in for a new student's input:
# an example score, time spent in minutes, and a topic name.
new_score, new_time_spent, new_topic = 72, 15, 'Algebra'

Sets up simulated inputs to test personalized learning recommendations.



##**Encode New Topic Input**

In [None]:
# Step 1: Reload the raw dataset. The earlier `df` was reduced to the numeric
# modeling columns and no longer contains the text column used below.
df = pd.read_csv("/content/youtube_khan_academy.csv")  # Make sure the path is correct

# Step 2: Fit a LabelEncoder on the column treated as the "topic".
# Change `topic_column` to another text column (e.g. 'channel_title') if needed.
topic_column = 'title'
le = LabelEncoder()
df['topic_encoded'] = le.fit_transform(df[topic_column])

# Step 3: Example new topic (user input or test case)
new_topic = "Introduction to Algebra"  # Replace this with your actual new topic input

# Step 4: Encode the new topic. The encoder only knows the exact strings it was
# fitted on, so an unseen title raises ValueError. Default to None first so
# that `new_topic_encoded` is always defined, even on the failure path.
new_topic_encoded = None
try:
    new_topic_encoded = le.transform([new_topic])[0]
    print("Encoded Topic:", new_topic_encoded)
except ValueError:
    print("⚠️ Error: New topic not seen during training. Consider refitting encoder with new data.")


⚠️ Error: New topic not seen during training. Consider refitting encoder with new data.


Encodes a new topic title using LabelEncoder, and handles unseen topics.



##**Setup for Web App (Streamlit)**

In [None]:
!pip install streamlit pyngrok




In [None]:
# %%writefile app.py
# import streamlit as st
# import pandas as pd
# from sklearn.preprocessing import LabelEncoder

# df = pd.read_csv("youtube_khan_academy.csv")

# topic_column = 'title'
# le = LabelEncoder()
# df['topic_encoded'] = le.fit_transform(df[topic_column])

# st.title("📚 AI-Based Personalized Learning Platform")
# user_topic = st.text_input("🔍 Enter a learning topic (video title):")

# if user_topic:
#     try:
#         encoded = le.transform([user_topic])[0]
#         st.success(f"✅ Encoded topic: {encoded}")
#         recommended = df[df['topic_encoded'] == encoded].head(5)
#         st.write("🎯 Recommended videos:")
#         for title in recommended['title']:
#             st.markdown(f"- {title}")
#     except ValueError:
#         st.error("⚠️ Topic not seen in training data. Try a different title.")


In [None]:
# [REDACTED] A credential-like string (apparently an ngrok auth token) was stored here. Revoke it, and read the token from an environment variable or getpass instead of committing it to the notebook.

In [None]:
#from pyngrok import ngrok
#import threading
#import os
#import time

#def run():
 #   os.system("streamlit run app.py")

#threading.Thread(target=run).start()
#time.sleep(5)  # wait a few seconds

#public_url = ngrok.connect(8501)
#print(f"🌐 Open your app here:\n{public_url}")



In [None]:
#!cat /root/.streamlit/logs/*.log | tail -n 30


##**Streamlit App (Basic UI)**

In [None]:
# Save the Streamlit app to app.py instead of executing it inside the notebook
# kernel. Streamlit scripts only work when launched with `streamlit run`; run
# directly in a cell they just emit "bare mode" warnings. A later cell starts
# `streamlit run app.py`, so the file has to exist on disk first.
from pathlib import Path

APP_SOURCE = '''\
# Importing necessary libraries
import streamlit as st
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset (ensure the path is correct)
df = pd.read_csv("/content/youtube_khan_academy.csv")

# Assuming 'title' is the column for video titles
topic_column = 'title'

# Label Encoding for the video titles
le = LabelEncoder()
df['topic_encoded'] = le.fit_transform(df[topic_column])

# Streamlit App Title
st.title("📚 AI-Based Personalized Learning Platform")

# User input for video title
user_topic = st.text_input("🔍 Enter a learning topic (video title):")

# When user inputs a topic, this block runs
if user_topic:
    try:
        # Encoding the input title (only exact, previously seen titles succeed)
        encoded = le.transform([user_topic])[0]
        st.success(f"✅ Encoded topic: {encoded}")

        # Getting recommended videos based on encoded title
        recommended = df[df['topic_encoded'] == encoded].head(5)

        # Displaying the recommended videos
        st.write("🎯 Recommended videos:")

        for index, row in recommended.iterrows():
            title = row['title']
            video_url = row.get('url', '')  # Ensure CSV has 'url' column

            # Check if the URL exists and is valid
            if pd.notna(video_url) and video_url != '':
                st.markdown(f"- [{title}]({video_url})")  # Link the title to the URL
            else:
                st.markdown(f"- {title} (No URL available)")  # If no URL is available

    except ValueError:
        # If the title is not found in the encoding
        st.error("⚠️ Topic not seen in training data. Try a different title.")
'''

# Write the script next to the notebook, where `streamlit run app.py` looks for it.
Path("app.py").write_text(APP_SOURCE, encoding="utf-8")
print("Wrote app.py")


2025-04-10 18:27:31.993 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-04-10 18:27:32.000 Session state does not function when running a script without `streamlit run`


##**Generate Public Link with Ngrok**

In [None]:
# Standard-library helpers plus the ngrok client
import os
import threading
import time
from pyngrok import ngrok


def run():
    """Launch the Streamlit server for app.py via a shell command."""
    os.system("streamlit run app.py")


# os.system blocks until the command exits, so run it on a background thread
# to keep the notebook cell from hanging.
server_thread = threading.Thread(target=run)
server_thread.start()

# Give Streamlit a few seconds to start before opening the tunnel
time.sleep(5)

# Open a public ngrok tunnel to the port passed below
streamlit_port = 8501
public_url = ngrok.connect(streamlit_port)

# Show the public address of the app
print(f"🌐 Open your app here:\n{public_url}")


🌐 Open your app here:
NgrokTunnel: "https://eefc-34-118-242-27.ngrok-free.app" -> "http://localhost:8501"


Automatically starts Streamlit and exposes it via a public URL using ngrok.



In [None]:
# Debug helper: concatenate any *.log files under /root/.streamlit/logs and
# show the last 30 lines. The command fails if no such files exist.
!cat /root/.streamlit/logs/*.log | tail -n 30


cat: '/root/.streamlit/logs/*.log': No such file or directory
