In [2]:
# Mount Google Drive at /content/drive so that later cells can read and write
# files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
import joblib
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import os

# Input location: the engineered TF-IDF feature table and the fitted vectorizer.
input_dir = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering'

# Output location: every artifact produced by this cell is written here.
output_dir = '/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation'
os.makedirs(output_dir, exist_ok=True)  # no-op when the folder already exists

# Read the feature table and the vectorizer saved under input_dir.
df = pd.read_pickle(os.path.join(input_dir, 'features_tfidf.pkl'))
vectorizer = joblib.load(os.path.join(input_dir, 'vectorizer_tfidf.pkl'))

# Every column except the two label columns goes into the feature matrix.
X = df.drop(columns=['label', 'label_num']).values

# Project the features onto 100 principal components.
pca = PCA(n_components=100, random_state=42)
X_reduced = pca.fit_transform(X)

# Fit a 5-cluster KMeans on the reduced features; fit() returns the fitted estimator.
model = KMeans(n_clusters=5, random_state=42).fit(X_reduced)

# Persist the fitted KMeans and PCA, plus a copy of the vectorizer, side by side
# in output_dir so they can be reloaded together.
joblib.dump(model, os.path.join(output_dir, 'kmeans_tfidf_k5.pkl'))
joblib.dump(pca, os.path.join(output_dir, 'pca_tfidf_100.pkl'))
joblib.dump(vectorizer, os.path.join(output_dir, 'vectorizer_tfidf.pkl'))

# Attach each row's cluster assignment to the feature table.
cluster_labels = model.labels_
df['cluster_label'] = cluster_labels

# Export the labelled table to Excel (row index omitted).
output_path = os.path.join(output_dir, 'Kmeans_tfidf_with_clusters_label.xlsx')
df.to_excel(output_path, index=False)

print(f"Saved dataframe with cluster labels to: {output_path}")


Saved dataframe with cluster labels to: /content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/Kmeans_tfidf_with_clusters_label.xlsx


In [6]:
# ======= UTILS TO GET TOP KEYWORDS PER CLUSTER =======

def get_top_keywords_for_clusters(kmeans_model, pca_model, vectorizer, n_keywords=10):
    """
    Describe every cluster by its highest-weighted vocabulary terms.

    The cluster centers of ``kmeans_model`` are mapped back through
    ``pca_model.inverse_transform`` and, for each restored center, the
    feature names with the largest values are collected.

    Args:
        kmeans_model: fitted clusterer exposing ``cluster_centers_``.
        pca_model: fitted reducer exposing ``inverse_transform``.
        vectorizer: fitted vectorizer exposing ``get_feature_names_out``.
        n_keywords: maximum number of terms to keep per cluster.

    Returns:
        dict mapping the cluster index (0-based position of the center) to a
        list of terms ordered from highest to lowest weight.
    """
    # Bring the centers back from the reduced space to the original feature space.
    restored_centers = pca_model.inverse_transform(kmeans_model.cluster_centers_)
    vocabulary = vectorizer.get_feature_names_out()

    def strongest_terms(weights):
        # argsort is ascending, so reverse it to rank the heaviest features first.
        ranked = weights.argsort()[::-1]
        return [vocabulary[position] for position in ranked[:n_keywords]]

    return {
        cluster: strongest_terms(weights)
        for cluster, weights in enumerate(restored_centers)
    }



In [7]:

# ======= FLASK API =======

from flask import Flask, request, jsonify

app = Flask(__name__)

# Reload the persisted KMeans model, PCA and vectorizer from output_dir
# (in that order); adjust the directory or file names if the artifacts move.
model, pca, vectorizer = (
    joblib.load(os.path.join(output_dir, artifact_name))
    for artifact_name in (
        'kmeans_tfidf_k5.pkl',
        'pca_tfidf_100.pkl',
        'vectorizer_tfidf.pkl',
    )
)

# Compute the per-cluster keyword lists a single time here so that request
# handlers only need a dictionary lookup.
top_keywords_per_cluster = get_top_keywords_for_clusters(
    kmeans_model=model,
    pca_model=pca,
    vectorizer=vectorizer,
)

@app.route('/cluster_predict', methods=['POST'])
def cluster_predict():
    """
    Assign the posted text to a cluster and return that cluster's keywords.

    The request body is parsed as JSON regardless of its Content-Type and must
    be an object holding a non-empty string under the ``text`` key. The text is
    vectorized, reduced with PCA and passed to the KMeans model.

    Returns:
        A JSON response ``{"cluster_id": <int>, "top_keywords": [...]}`` on
        success, or ``({"error": "No text provided"}, 400)`` when the body is
        not a JSON object or carries no usable ``text`` string.
    """
    data = request.get_json(force=True)

    # Valid JSON is not necessarily an object (it may be null, a list, a bare
    # string, ...), and 'text' may hold a non-string value; neither can be
    # vectorized, so both are reported as missing input instead of crashing.
    text = data.get('text', '') if isinstance(data, dict) else ''
    if not isinstance(text, str) or not text:
        return jsonify({'error': 'No text provided'}), 400

    # Vectorize text
    X_vec = vectorizer.transform([text])

    # Apply PCA (the sparse vector is densified before the transform)
    X_vec_reduced = pca.transform(X_vec.toarray())

    # Predict cluster. predict() yields a NumPy integer, which the JSON encoder
    # cannot serialize, so convert it to a built-in int first.
    cluster_id = int(model.predict(X_vec_reduced)[0])

    # Get keywords for cluster
    keywords = top_keywords_per_cluster.get(cluster_id, [])

    return jsonify({
        "cluster_id": cluster_id,
        "top_keywords": keywords
    })

# Start Flask's built-in server only when this code runs as the entry point.
# It listens on port 5000 on every network interface (0.0.0.0).
if __name__ == '__main__':
    app.run(port=5000, host='0.0.0.0')


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5000
 * Running on http://172.28.0.12:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
