In [4]:
import zipfile

with zipfile.ZipFile("/content/archive.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/")


In [5]:
import os
os.listdir("/content/")


['.config', 'job_title_des.csv', 'archive.zip', 'sample_data']

In [7]:
import pandas as pd

# Load your CSV file
df = pd.read_csv("/content/job_title_des.csv")

# Show top 5 rows
df.head()


Unnamed: 0.1,Unnamed: 0,Job Title,Job Description
0,0,Flutter Developer,We are looking for hire experts flutter develo...
1,1,Django Developer,PYTHON/DJANGO (Developer/Lead) - Job Code(PDJ ...
2,2,Machine Learning,"Data Scientist (Contractor)\n\nBangalore, IN\n..."
3,3,iOS Developer,JOB DESCRIPTION:\n\nStrong framework outside o...
4,4,Full Stack Developer,job responsibility full stack engineer – react...


In [8]:
# Check basic structure
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2277 entries, 0 to 2276
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       2277 non-null   int64 
 1   Job Title        2277 non-null   object
 2   Job Description  2277 non-null   object
dtypes: int64(1), object(2)
memory usage: 53.5+ KB


In [9]:
# How many nulls in each column?
df.isnull().sum()


Unnamed: 0,0
Unnamed: 0,0
Job Title,0
Job Description,0


In [12]:
print(df.columns.tolist())


['Unnamed: 0', 'Job Title', 'Job Description']


In [14]:
df['Job Title'].value_counts().head(20)  # ← exact match needed


Unnamed: 0_level_0,count
Job Title,Unnamed: 1_level_1
JavaScript Developer,166
Java Developer,161
Software Engineer,160
Node js developer,160
iOS Developer,159
PHP Developer,156
Flutter Developer,155
DevOps Engineer,155
Django Developer,152
Machine Learning,152


In [18]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()  # lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # remove punctuation
    text = ' '.join(word for word in text.split() if word not in stop_words)  # remove stopwords
    return text

# Apply on job descriptions
df['cleaned_desc'] = df['Job Title'].apply(clean_text)

# Preview result
df[['Job Title', 'cleaned_desc']].head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Job Title,cleaned_desc
0,Flutter Developer,flutter developer
1,Django Developer,django developer
2,Machine Learning,machine learning
3,iOS Developer,ios developer
4,Full Stack Developer,full stack developer


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)  # max 5000 most frequent words

# Fit and transform the cleaned job descriptions
X = vectorizer.fit_transform(df['cleaned_desc'])


In [22]:
from sklearn.preprocessing import LabelEncoder

# Encode the job titles into numbers
le = LabelEncoder()
y = le.fit_transform(df['Job Title'])


In [23]:
from sklearn.model_selection import train_test_split

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Create the model
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Step 2: Fit on training data
model.fit(X_train, y_train)

# Step 3: Predict on test data
y_pred = model.predict(X_test)

# Step 4: Evaluate
print("✅ Accuracy Score:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_[:len(set(y_test))]))


✅ Accuracy Score: 1.0

📊 Classification Report:
                         precision    recall  f1-score   support

     Backend Developer       1.00      1.00      1.00        32
Database Administrator       1.00      1.00      1.00        26
       DevOps Engineer       1.00      1.00      1.00        38
      Django Developer       1.00      1.00      1.00        29
     Flutter Developer       1.00      1.00      1.00        35
  Full Stack Developer       1.00      1.00      1.00        23
        Java Developer       1.00      1.00      1.00        38
  JavaScript Developer       1.00      1.00      1.00        39
      Machine Learning       1.00      1.00      1.00        28
 Network Administrator       1.00      1.00      1.00        18
     Node js developer       1.00      1.00      1.00        34
         PHP Developer       1.00      1.00      1.00        26
     Software Engineer       1.00      1.00      1.00        32
   Wordpress Developer       1.00      1.00      1.00 

In [30]:
import joblib

joblib.dump(model, 'job_recommender_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(le, 'label_encoder.pkl')


['label_encoder.pkl']