# Job Salary Prediction - Google Cloud Training

This notebook trains the salary prediction model in Google Colab (free), reading the training data from Google Cloud Storage and uploading the trained model back to it.

**Setup Requirements:**
1. Upload your `colab-key.json` file
2. Run all cells in order


In [None]:
!pip install -q google-cloud-storage scikit-learn pandas numpy joblib
print('✅ Dependencies installed!')

## Authenticate with Google Cloud

**Run the cell below and choose your `colab-key.json` file in the upload prompt. Alternatively, add the file beforehand with the file upload button in the left sidebar.**

In [None]:
import os
from google.colab import files

# Ask for the service-account key and expose its path through
# GOOGLE_APPLICATION_CREDENTIALS for the Google Cloud client created later.
DEFAULT_KEY_FILE = 'colab-key.json'

print('Upload your colab-key.json file:')
uploaded = files.upload()  # dict keyed by the name of each uploaded file

# Point at the file that was actually provided instead of assuming its name:
# a key uploaded under any other name would otherwise leave the variable
# referring to a file that does not exist.
uploaded_json = [name for name in uploaded if name.lower().endswith('.json')]
if DEFAULT_KEY_FILE in uploaded:
    key_file = DEFAULT_KEY_FILE
elif uploaded_json:
    key_file = uploaded_json[0]
elif os.path.exists(DEFAULT_KEY_FILE):
    # Nothing uploaded just now, but the key is already in the working
    # directory (for example, added through the sidebar).
    key_file = DEFAULT_KEY_FILE
else:
    raise FileNotFoundError(
        f'No service-account key found: upload a .json key file or place '
        f'{DEFAULT_KEY_FILE} in the working directory.'
    )

# Absolute path so the setting survives a later change of working directory.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.abspath(key_file)
print('✅ Credentials configured!')

## Download Data from Cloud Storage

In [None]:
from google.cloud import storage
import pandas as pd

# Copy the cleaned job postings CSV from the project bucket to local disk,
# then load it into a DataFrame for the cells that follow.
bucket_name = 'job-salary-ml-data-1770491809'
remote_csv = 'data/cleaned_jobs.csv'
local_csv = 'cleaned_jobs.csv'

# The client is created without explicit credentials; `client` is reused by
# the upload step at the end of the notebook.
client = storage.Client()
bucket = client.bucket(bucket_name)
blob = bucket.blob(remote_csv)
blob.download_to_filename(local_csv)

df = pd.read_csv(local_csv)
print(f'✅ Data downloaded! Shape: {df.shape}')

# Preview the first rows as the cell output.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Model input: cleaned job title and description joined into one text field.
# Missing values are replaced with '' first, because NaN + str stays NaN and
# a NaN document cannot be vectorized in the next step.
title_text = df['job_title_clean'].fillna('')
description_text = df['job_description_clean'].fillna('')
df['combined_text'] = title_text + ' ' + description_text

# Rows without a label cannot be used for supervised training or for the
# stratified split, so they are left out of X/y (df itself keeps every row).
# When no label is missing this selects exactly the same rows as before.
has_label = df['salary_category'].notna()
if not has_label.all():
    print(f'Skipping {int((~has_label).sum())} rows with no salary_category')
X = df.loc[has_label, 'combined_text']
y = df.loc[has_label, 'salary_category']

# 80/20 split, stratified so both parts keep the label proportions; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f'Train: {len(X_train)}, Test: {len(X_test)}')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, with the vocabulary capped at 1000 terms.
# min_df=2 drops terms seen in fewer than two documents; max_df=0.8 drops
# terms present in more than 80% of documents.
tfidf_params = {
    'max_features': 1000,
    'min_df': 2,
    'max_df': 0.8,
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely transformed with them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

n_features = X_train_tfidf.shape[1]
print(f'Features: {n_features}')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Candidate classifiers, all trained on the same TF-IDF features.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Naive Bayes': MultinomialNB()
}

# Fit every candidate and record its accuracy on the held-out split.
results = {}
for name, model in models.items():
    print(f'\nTraining {name}...')
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    print(f'Accuracy: {accuracy:.4f}')
    # zero_division=0 reports 0 for a class that was never predicted instead
    # of emitting an UndefinedMetricWarning; the reported numbers are the same.
    print(classification_report(y_test, y_pred, zero_division=0))

# Choose the winner from the recorded scores. Unlike a running `> 0`
# comparison, this always yields a model, even if every accuracy is 0.0;
# ties still go to the candidate listed first.
best_name = max(results, key=results.get)
best_score, best_model = results[best_name], models[best_name]

# NOTE(review): the winner is chosen on the same split whose accuracy is
# reported, so best_score is an optimistic estimate. Consider selecting on a
# validation split or with cross-validation on the training data.
print(f'\n✅ Best: {best_name} - {best_score:.4f}')

In [None]:
import joblib
import os

# Persist the selected model, its vectorizer and a short summary locally,
# then upload the three files to the `models/` prefix of the project bucket.

# Without this guard a run in which no model was selected would pickle and
# upload `None` as the model artifact.
if best_model is None:
    raise RuntimeError(
        'No model was selected during training; refusing to save and upload '
        'an empty artifact.'
    )

output_dir = 'model_output'
os.makedirs(output_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(output_dir, 'model.pkl'))
joblib.dump(vectorizer, os.path.join(output_dir, 'vectorizer.pkl'))

# Explicit encoding keeps the summary file independent of the platform default.
with open(os.path.join(output_dir, 'model_info.txt'), 'w', encoding='utf-8') as f:
    f.write(f'Best Model: {best_name}\nAccuracy: {best_score:.4f}\nPlatform: Google Colab\n')

# `client` is the storage client created in the download cell.
bucket = client.bucket('job-salary-ml-data-1770491809')
for file in ['model.pkl', 'vectorizer.pkl', 'model_info.txt']:
    blob = bucket.blob(f'models/{file}')
    blob.upload_from_filename(os.path.join(output_dir, file))
    print(f'✅ Uploaded {file}')

print('\n🎉 Training Complete! Models saved to Cloud Storage.')