In [None]:
# CELL 1 – Imports
import pandas as pd
import numpy as np
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import joblib

# Tokenizer models and corpora fetched for the NLP helpers in this notebook.
# "punkt_tab" is the tokenizer resource that word_tokenize loads on NLTK 3.9+
# (it replaced the pickled "punkt" models); "punkt" is still requested so that
# older NLTK releases keep working.
NLTK_RESOURCES = ("punkt", "punkt_tab", "stopwords", "wordnet")

# nltk.download reports failure through its return value instead of raising,
# so collect the names that could not be fetched and surface them.
failed_downloads = [
    resource_name
    for resource_name in NLTK_RESOURCES
    if not nltk.download(resource_name, quiet=True)
]
if failed_downloads:
    print(
        "Warning: could not download NLTK resources "
        f"{failed_downloads}; previously installed copies (if any) will be used."
    )

print("NLP imports successful!")


In [None]:
# CELL 2 – Create study tips dataset
# Study tips per topic keyword. Each mapping entry becomes one record of the
# form {"keyword": <topic>, "tips": [<tip>, ...]}, in the order written here.
tips_data = [
    {"keyword": topic, "tips": topic_tips}
    for topic, topic_tips in {
        "machine learning": [
            "Review the difference between supervised and unsupervised learning.",
            "Practice implementing logistic regression from scratch.",
            "Study overfitting and regularization techniques.",
            "Work through Kaggle datasets to build intuition.",
        ],
        "neural networks": [
            "Understand the forward pass and backpropagation.",
            "Experiment with different activation functions.",
            "Learn about vanishing gradient problem.",
            "Build a simple network from scratch.",
        ],
        "deep learning": [
            "Study convolutional layers and pooling.",
            "Explore recurrent neural networks (RNNs).",
            "Learn about transfer learning.",
            "Use TensorFlow and Keras for practice.",
        ],
        "python": [
            "Master list comprehensions and lambda functions.",
            "Practice working with NumPy arrays.",
            "Learn decorators and generators.",
            "Build small projects to apply concepts.",
        ],
    }.items()
]

# One row per keyword; the "tips" column holds the list of tips for that row.
tips_df = pd.DataFrame(tips_data)
print(f"Study tips dataset: {len(tips_df)} entries")
tips_df.head()


In [None]:
# CELL 3 – Define keyword extractor function
def extract_keywords(text: str, top_n: int = 5) -> list:
    """
    Return the ``top_n`` most frequent content words in ``text``.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``.
    A token is counted only if it is purely alphanumeric, is not an English
    stopword, and is longer than two characters. The result lists the words
    only (no counts), most frequent first.
    """
    words = word_tokenize(text.lower())
    ignored = set(stopwords.words('english'))

    # Count surviving tokens in reading order so ties keep first-seen order
    counts = Counter()
    for word in words:
        if not word.isalnum():
            continue  # punctuation or mixed-symbol token
        if word in ignored or len(word) <= 2:
            continue  # stopword or too short to be informative
        counts[word] += 1

    return [word for word, _ in counts.most_common(top_n)]

# Smoke test: ask for the three most frequent keywords of a sample sentence
test_text = "Machine learning models learn from data automatically."
keywords = extract_keywords(text=test_text, top_n=3)
print(f"Keywords from test text: {keywords}")


In [None]:
# CELL 4 – Define study tip generator
def get_study_tips(keywords: list, tips_df=tips_df, max_tips: int = 5) -> list:
    """
    Given a list of keywords, return matching study tips.

    A keyword matches a row when it occurs, case-insensitively, anywhere in
    that row's "keyword" value (plain substring match, not a regex).

    Args:
        keywords: Keyword strings to look up. Blank and non-string entries
            are ignored.
        tips_df: DataFrame with a "keyword" column and a "tips" column holding
            a list of tip strings per row. The default is bound when this
            function is defined, so re-run this cell after rebuilding the
            module-level ``tips_df``.
        max_tips: Maximum number of tips to return (negative values are
            treated as 0).

    Returns:
        Up to ``max_tips`` unique tips, in the order they were first matched
        (keyword order, then row order).
    """
    collected = []

    for kw in keywords:
        # An empty pattern is contained in every string, so a blank keyword
        # would otherwise pull in the whole table.
        if not isinstance(kw, str) or not kw.strip():
            continue

        # regex=False: treat the keyword literally so inputs such as "c++"
        # neither raise nor match unintended rows.
        hits = tips_df["keyword"].str.contains(
            kw.strip(), case=False, na=False, regex=False
        )
        for topic_tips in tips_df.loc[hits, "tips"]:
            collected.extend(topic_tips)

    # dict.fromkeys de-duplicates while keeping first-seen order; a set would
    # make the selected tips vary between kernel restarts.
    return list(dict.fromkeys(collected))[:max(max_tips, 0)]

# Smoke test: look up tips for two single-word keywords and print them as bullets
test_keywords = ["machine", "learning"]
tips = get_study_tips(test_keywords)
tip_lines = [f"- {tip}" for tip in tips]
print("Generated tips:\n" + "\n".join(tip_lines))


In [None]:
# CELL 5 – Define resource recommender
# Topic -> curated learning resources. Each resource is a dict with a "title"
# and a "url"; topic names are lower-case.
resources_data = {
    "machine learning": [
        {"title": "Andrew Ng's ML Course", "url": "https://www.coursera.org/learn/machine-learning"},
        {"title": "Scikit-learn Documentation", "url": "https://scikit-learn.org/"},
    ],
    "deep learning": [
        {"title": "Deep Learning Specialization", "url": "https://www.coursera.org/specializations/deep-learning"},
        {"title": "PyTorch Tutorials", "url": "https://pytorch.org/tutorials/"},
    ],
    "python": [
        {"title": "Python Official Docs", "url": "https://docs.python.org/3/"},
        {"title": "Real Python", "url": "https://realpython.com/"},
    ],
    "neural networks": [
        {"title": "Neural Networks Explained", "url": "https://www.3blue1brown.com/"},
        {"title": "Keras Guide", "url": "https://keras.io/"},
    ],
}

def get_resources(keywords: list, max_resources: int = 4) -> list:
    """
    Recommend learning resources based on keywords.

    A keyword selects a topic when it occurs, case-insensitively, anywhere in
    the topic name. This is the same substring rule ``get_study_tips`` uses,
    so single-word keywords such as "machine" reach "machine learning".

    Args:
        keywords: Keyword strings to look up. Blank and non-string entries
            are ignored.
        max_resources: Maximum number of resources to return (negative values
            are treated as 0).

    Returns:
        Up to ``max_resources`` resource dicts, unique by title, in the order
        they were first matched (keyword order, then ``resources_data`` order).
    """
    recommended = []

    for kw in keywords:
        # An empty string is a substring of every topic; skip it rather than
        # recommending everything.
        if not isinstance(kw, str) or not kw.strip():
            continue

        needle = kw.strip().lower()
        for topic, topic_resources in resources_data.items():
            if needle in topic.lower():
                recommended.extend(topic_resources)

    # Keying by title removes duplicates while keeping first-seen order
    unique_resources = {r["title"]: r for r in recommended}
    return list(unique_resources.values())[:max(max_resources, 0)]

# Smoke test: fetch resources for two topics and list each as "title: url"
resources = get_resources(["machine learning", "python"])
print("Recommended resources:")
for r in resources:
    title, url = r["title"], r["url"]
    print(f"- {title}: {url}")


In [None]:
# CELL 6 – Save NLP components
import os

# Target directory for the serialized lookup tables (created if missing)
models_dir = "../backend/models"
os.makedirs(models_dir, exist_ok=True)

# Both artifacts live side by side inside models_dir
tips_path, resources_path = (
    os.path.join(models_dir, filename)
    for filename in ("study_tips_db.joblib", "resources_db.joblib")
)

# Serialize the tips DataFrame first, then the resources dict
joblib.dump(tips_df, tips_path)
joblib.dump(resources_data, resources_path)

print(f"Tips database saved to: {tips_path}")
print(f"Resources database saved to: {resources_path}")
