# Public Pulse AI - Model Training

Use this notebook to train the issue classifier on your own dataset.

### Steps:
1. Upload your CSV data file (it must contain `description` and `type` columns).
2. Run the training cells.
3. Download the trained `classifier.pkl` model.
4. Place the model in `services/ai-engine/models/` locally.

In [None]:
# 1. Install Dependencies
!pip install pandas scikit-learn spacy numpy
!python -m spacy download en_core_web_sm

In [None]:
# 2. Upload Data
# Reads one user-supplied CSV into `df` and checks it has the columns the
# training cell needs.
from google.colab import files
import pandas as pd
import io

REQUIRED_COLUMNS = ('description', 'type')

print("Please upload your CSV file (must have 'description' and 'type' columns)...")
uploaded = files.upload()

# The upload mapping can be empty (e.g. the dialog was cancelled); fail with a
# clear message instead of a bare StopIteration from next(iter(...)).
if not uploaded:
    raise ValueError("No file was uploaded. Re-run this cell and select a CSV file.")

# Only the first uploaded file is used.
filename = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[filename]))

# Validate the schema here so a wrong file is reported immediately rather than
# as a KeyError in the training cell.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(f"'{filename}' is missing required column(s): {missing_columns}")

print(f"Loaded {len(df)} records.")
df.head()

In [None]:
# 3. Train Model
# Fits a TF-IDF + linear classifier pipeline on `df` (loaded by the upload cell),
# reports hold-out metrics and pickles the fitted pipeline to 'classifier.pkl'.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pickle

# One seed for both the split and the classifier so reruns give the same model.
RANDOM_SEED = 42

# Clean Data
df = df.dropna(subset=['description', 'type'])
if df.empty:
    raise ValueError("No rows left after dropping records with a missing 'description' or 'type'.")
# TfidfVectorizer expects text; cast so numeric or mixed-type cells do not crash it.
X = df['description'].astype(str)
y = df['type']

# Split
# Stratify so rare issue types appear in both train and test sets. Stratification
# is impossible when a class is too small for the split, so fall back to the
# plain random split in that case.
try:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
    )
except ValueError:
    print("Note: could not stratify the split (some classes are too small); using a random split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED
    )

# Build Pipeline (TF-IDF + linear classifier trained with SGD)
# loss='modified_huber' is a smooth loss, not the hinge loss of a linear SVM;
# it allows the fitted classifier to provide probability estimates.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=(1, 2))),
    ('clf', SGDClassifier(loss='modified_huber', max_iter=1000, tol=1e-3,
                          random_state=RANDOM_SEED)),
])

# Train
print("Training model...")
pipeline.fit(X_train, y_train)

# Evaluate
# Predict once and reuse the predictions for both the accuracy and the report.
y_pred = pipeline.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {score:.2f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save
with open('classifier.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
print("✅ Model saved as 'classifier.pkl'")

In [None]:
# 4. Download Model
# Hand the pickled pipeline file over to Colab's download helper.
model_file = 'classifier.pkl'
files.download(model_file)