# Data Preprocessing

Import necessary libraries

In [286]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


Downloading NLTK resources

In [287]:
# Fetch the NLTK data packages this notebook relies on (tokenizer models and
# the WordNet corpus). The last call's return value is the cell's output.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cheta\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Load Datasets

In [288]:
# Read the IT and non-IT job datasets from CSV files in the working directory.
it_data, non_it_data = map(pd.read_csv, ('it_dataset.csv', 'non_it_dataset.csv'))

In [289]:
# Preview the first rows of the IT dataset.
it_data.head()

Unnamed: 0,Job Role,Skills,Certifications,Courses,Projects,Experience Level,Education,Industry,Keywords,Technologies
0,DevOps Engineer,"ReactJS, Java, Docker, C++, TensorFlow",Oracle Certified DBA,"Data Structures, Cybersecurity Fundamentals, C...",Data Visualization Tool,Senior,B.Tech in Computer Science,Cybersecurity,"C++, SQL","Git, Kubernetes, Docker"
1,AI Engineer,"SQL, Docker, Java, Kubernetes","Certified Data Scientist, Oracle Certified DBA","Cybersecurity Fundamentals, Machine Learning, ...","Fake News Detection, Network Security System",Mid,B.Tech in Computer Science,Cybersecurity,"NodeJS, C++, Python","Kubernetes, Azure, Docker"
2,Backend Developer,"Docker, SQL, NodeJS","Certified Data Scientist, Google Cloud Profess...","Web Development, Cloud Computing",E-commerce Website,Entry,Diploma in IT,AI,"Docker, Kubernetes","Azure, Kubernetes, Git"
3,Software Engineer,"Kubernetes, NodeJS, Docker","Google Cloud Professional, Certified Data Scie...","Cybersecurity Fundamentals, Data Structures, M...","E-commerce Website, Chatbot Development",Mid,MCA,IT,"Docker, Java, Python, TensorFlow","Kubernetes, Git, Azure"
4,Cybersecurity Analyst,"Java, C++, Docker, ReactJS","Oracle Certified DBA, Microsoft Azure Administ...","Cybersecurity Fundamentals, Web Development, M...","Data Visualization Tool, E-commerce Website",Mid,B.Tech in Computer Science,Cloud Computing,"JavaScript, TensorFlow, Python, NodeJS","Azure, Docker, AWS"


In [290]:
# (rows, columns) of the IT dataset.
it_data.shape

(1000, 10)

In [291]:
# Preview the first rows of the non-IT dataset.
non_it_data.head()

Unnamed: 0,Job Role,Skills,Certifications,Courses,Projects,Experience Level,Education,Industry,Keywords,Technologies
0,Marketing Manager,"Customer Service, Data Analysis, Project Manag...","Certified HR Professional, PMP, Certified Fina...","Marketing Strategies, Project Coordination, Bu...","Supply Chain Optimization, Sales Strategy Deve...",Senior,MBA,Operations,"Problem Solving, Data Analysis","PowerPoint, SAP"
1,HR Specialist,"Market Research, Data Analysis, Budgeting","Digital Marketing Associate, Lean Six Sigma","Project Coordination, Financial Accounting, Ma...",Sales Strategy Development,Mid,MBA,Operations,"Training, Leadership, Market Research, Budgeting","SAP, CRM Systems"
2,Business Analyst,"Data Analysis, Communication, Problem Solving,...","PMP, Certified Financial Analyst, Digital Mark...","Business Management, Project Coordination",Employee Engagement Survey,Mid,MBA,Marketing,"Training, Data Analysis","Business Intelligence Tools, CRM Systems, SAP"
3,HR Specialist,"Sales Strategy, Budgeting, Data Analysis, Trai...","PMP, Lean Six Sigma","Marketing Strategies, Leadership Skills, Busin...","Supply Chain Optimization, Customer Satisfacti...",Senior,MBA,Marketing,"Market Research, Sales Strategy, Training, Pro...","Excel, Business Intelligence Tools"
4,HR Specialist,"Budgeting, Training, Sales Strategy","Lean Six Sigma, PMP, Certified HR Professional","Business Management, Financial Accounting",Employee Engagement Survey,Mid,MBA,Operations,"Sales Strategy, Project Management","CRM Systems, Business Intelligence Tools"


In [292]:
# (rows, columns) of the non-IT dataset.
non_it_data.shape

(1000, 10)

Handling Missing Values

In [293]:
# Replace every missing cell with the literal string "None" so the text
# processing below always receives a string.
it_data = it_data.fillna("None")
non_it_data = non_it_data.fillna("None")

Text Preprocessing

In [294]:
# Single WordNet lemmatizer instance; preprocess_text() reads it as a global.
lemmatizer = WordNetLemmatizer()

In [295]:
def preprocess_text(df, columns):
    """Lower-case and lemmatize the given text columns of a DataFrame.

    Every value in each listed column is lower-cased, split into tokens with
    ``word_tokenize``, each token is passed through the module-level
    ``lemmatizer``, and the tokens are re-joined with single spaces.

    The input frame is not modified: the work is done on a copy, which is
    returned. Callers must use the return value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to clean. The listed columns are accessed
        through ``.str``, so they are expected to hold strings.
    columns : iterable of str
        Names of the columns to process.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns replaced by their cleaned text.
    """
    def _lemmatize(text):
        # Tokenize, lemmatize token by token, and rebuild a space-separated string.
        return ' '.join(lemmatizer.lemmatize(token) for token in word_tokenize(text))

    # Work on a copy so the caller's frame is left untouched.
    processed = df.copy()
    for col in columns:
        processed[col] = processed[col].str.lower().apply(_lemmatize)
    return processed

In [296]:
# Free-text columns that get lower-cased and lemmatized in both datasets.
text_columns = ['Skills', 'Projects', 'Certifications', 'Courses', 'Technologies']
it_data, non_it_data = [
    preprocess_text(frame, text_columns) for frame in (it_data, non_it_data)
]

Encoding Job Roles

In [297]:
# Fit a single encoder over the job roles of both datasets so that each role
# gets one integer id, then cut the encoded vector back into its IT part
# (first len(it_data) entries) and its non-IT part (the remainder).
label_encoder = LabelEncoder()
combined_labels = pd.concat([it_data['Job Role'], non_it_data['Job Role']])
combined_y = label_encoder.fit_transform(combined_labels)
y_it, y_non_it = np.split(combined_y, [len(it_data)])

Vectorizing Text Data

In [298]:
# Build one TF-IDF representation over "Skills + Projects" for all rows.
# NOTE(review): the vectorizer is fitted on every row before the later
# train/test split, so test rows contribute to the vocabulary and IDF weights.
vectorizer = TfidfVectorizer(max_features=5000)
combined_data = pd.concat([it_data, non_it_data])
X_combined = vectorizer.fit_transform(
    combined_data['Skills'] + ' ' + combined_data['Projects']
).toarray()

# Rows keep concatenation order: IT rows first, non-IT rows after them.
X_it, X_non_it = np.vsplit(X_combined, [len(it_data)])


In [299]:
# Progress marker for the end of the preprocessing section.
print("Data Preprocessing Completed!")

Data Preprocessing Completed!


# Feature Engineering Model

In [300]:
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

Train-Test Split

In [301]:
# Hold out 20% of each dataset for testing; the fixed random_state keeps the
# split the same from run to run.
X_train_it, X_test_it, y_train_it, y_test_it = train_test_split(X_it, y_it, test_size=0.2, random_state=42)
X_train_non_it, X_test_non_it, y_train_non_it, y_test_non_it = train_test_split(X_non_it, y_non_it, test_size=0.2, random_state=42)


Model Building - Random Forest Classifier with GridSearch

In [302]:
# Hyper-parameter candidates searched by train_model() (2 x 2 x 2 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
)

Train IT Model

In [303]:
def train_model(X_train, y_train, random_state=42):
    """Grid-search a random forest and return the best fitted estimator.

    Runs a 5-fold cross-validated ``GridSearchCV`` (scored by accuracy) over
    the module-level ``param_grid``, prints the winning parameter set, and
    returns ``best_estimator_``.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    random_state : int or None, default 42
        Seed passed to ``RandomForestClassifier`` so repeated runs select the
        same parameters and produce the same model. Pass ``None`` to restore
        unseeded behaviour.

    Returns
    -------
    The estimator exposed by the search as ``best_estimator_``.
    """
    # Seed the forest: an unseeded forest makes both the chosen parameters and
    # the final model vary between runs.
    base_forest = RandomForestClassifier(random_state=random_state)
    grid_search = GridSearchCV(base_forest, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print("Best Parameters:", grid_search.best_params_)
    return grid_search.best_estimator_

In [304]:
# Tune and fit the random forest on the IT training split.
rf_it = train_model(X_train_it, y_train_it)

Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}


Train Non-IT Model

In [305]:
# Tune and fit the random forest on the non-IT training split.
rf_non_it = train_model(X_train_non_it, y_train_non_it)

Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}


Model Evaluation

In [306]:
def evaluate_model(model, X_test, y_test, label):
    """Print accuracy and a classification report for ``model`` on a test split.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_test, y_test
        Held-out features and their true labels.
    label : str
        Prefix used in the two printed headings (e.g. "IT").

    Returns
    -------
    None
        Results are only printed.
    """
    predictions = model.predict(X_test)
    print(f'{label} Model Accuracy: {accuracy_score(y_test, predictions)}')
    report = classification_report(y_test, predictions)
    print(f'{label} Classification Report:\n', report)


In [307]:
# Score each tuned model on its own held-out test split. The class ids in the
# reports are the integer codes produced by label_encoder.
evaluate_model(rf_it, X_test_it, y_test_it, "IT")
evaluate_model(rf_non_it, X_test_non_it, y_test_non_it, "Non-IT")

IT Model Accuracy: 0.075
IT Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        22
           1       0.00      0.00      0.00        11
           3       0.00      0.00      0.00        18
           5       0.12      0.08      0.10        26
           6       0.25      0.12      0.16        26
           7       0.06      0.07      0.06        15
           8       0.04      0.06      0.05        17
          10       0.12      0.07      0.09        27
          11       0.08      0.10      0.09        20
          18       0.15      0.22      0.18        18

    accuracy                           0.07       200
   macro avg       0.08      0.07      0.07       200
weighted avg       0.10      0.07      0.08       200

Non-IT Model Accuracy: 0.105
Non-IT Classification Report:
               precision    recall  f1-score   support

           2       0.10      0.12      0.11        25
           4       0

In [308]:
from sklearn.model_selection import GridSearchCV

# Wider grid explored for the IT model only. It lives under its own name so
# the global `param_grid` that train_model() reads is not replaced when this
# cell runs (which would make train_model() depend on cell execution order).
wide_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Seeded forest so the reported best parameters and score are repeatable.
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), wide_param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_it, y_train_it)

print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy: 0.14125000000000001


Save Models

In [309]:
def save_model(model, filename):
    """Pickle ``model`` to ``filename`` and print a confirmation line.

    Parameters
    ----------
    model
        Any picklable object.
    filename : str or path-like
        Destination path; opened in binary write mode, so an existing file is
        overwritten.
    """
    with open(filename, mode='wb') as sink:
        pickle.dump(model, sink)
    print(f'Model saved as {filename}')

In [310]:
# Persist both tuned models as pickle files in the working directory.
save_model(rf_it, 'it_model.pkl')
save_model(rf_non_it, 'non_it_model.pkl')

Model saved as it_model.pkl
Model saved as non_it_model.pkl


Recommendation and Skills

In [311]:
def load_model(filename):
    """Unpickle and return the object stored at ``filename``.

    Any exception raised while opening or unpickling the file is caught,
    reported on stdout, and turned into a ``None`` return value.

    Only load pickles from trusted sources: unpickling can execute arbitrary
    code.
    """
    try:
        with open(filename, 'rb') as source:
            loaded = pickle.load(source)
    except Exception as err:
        print(f"Error loading model: {err}")
        return None
    return loaded

In [312]:
def recommend_job(project_desc, model, vectorizer, label_encoder):
    """Predict a job-role name for a single project description.

    The description is wrapped in a one-element list and passed to
    ``vectorizer.transform`` as-is (no extra cleaning is applied here), the
    result is fed to ``model.predict``, and the predicted code is mapped back
    to its role name with ``label_encoder.inverse_transform``.

    Returns
    -------
    The first (and only) decoded prediction.
    """
    features = vectorizer.transform([project_desc])
    encoded_role = model.predict(features)
    decoded_roles = label_encoder.inverse_transform(encoded_role)
    return decoded_roles[0]


In [313]:
def suggest_missing_skills(resume_skills, job_role, skills_dict):
    """Return the skills required for ``job_role`` that the resume lacks.

    Required skills are looked up in ``skills_dict``; a role with no entry
    has no requirements, so the result is an empty list. Membership in
    ``resume_skills`` is tested with ``in`` (exact, case-sensitive match for
    a list of strings). The order of the required skills is preserved.
    """
    missing = []
    for required in skills_dict.get(job_role, []):
        if required in resume_skills:
            continue
        missing.append(required)
    return missing

In [314]:
def recommend_courses_and_certs(missing_skills, courses_dict, certs_dict):
    """Collect the courses and certifications mapped to each missing skill.

    For every skill in ``missing_skills`` (in order), the entries found in
    ``courses_dict`` and ``certs_dict`` are appended to the respective result
    list; skills without an entry contribute nothing.

    Returns
    -------
    tuple[list, list]
        ``(courses, certifications)``.
    """
    courses, certs = [], []
    for skill in missing_skills:
        courses += courses_dict.get(skill, [])
        certs += certs_dict.get(skill, [])
    return courses, certs

In [315]:

# Example Prediction
# The description is passed to recommend_job() as raw text.
# NOTE(review): the vectorizer was fitted on lower-cased, lemmatized
# "Skills + Projects" text, so this input is not prepared the same way as the
# training data — confirm whether that is intended.
project_example = 'Developed an AI chatbot using Python and NLP techniques'
job_role = recommend_job(project_example, rf_it, vectorizer, label_encoder)
print(f'Recommended Job Role: {job_role}')

Recommended Job Role: DevOps Engineer


In [316]:
# Missing Skills and Recommendations
# Small hand-written lookup tables used only for this demonstration.
# skills_dict covers just two roles; any other predicted job_role falls back
# to an empty requirement list, so all three results below come out empty
# (as in the recorded output for 'DevOps Engineer').
skills_dict = {'Data Scientist': ['Python', 'ML', 'Data Analysis'], 'Web Developer': ['HTML', 'CSS', 'JavaScript']}
courses_dict = {'Python': ['Python for Data Science', 'Advanced Python'], 'ML': ['Machine Learning A-Z']}
certs_dict = {'Python': ['Python Certification by Coursera'], 'ML': ['Machine Learning Certification by Stanford']}

# Compare a sample resume against the role predicted in the previous cell.
resume_skills = ['Python', 'Data Analysis']
missing_skills = suggest_missing_skills(resume_skills, job_role, skills_dict)
recommended_courses, recommended_certs = recommend_courses_and_certs(missing_skills, courses_dict, certs_dict)

print(f'Missing Skills: {missing_skills}')
print(f'Recommended Courses: {recommended_courses}')
print(f'Recommended Certifications: {recommended_certs}')
print('Model training and recommendation completed!')

Missing Skills: []
Recommended Courses: []
Recommended Certifications: []
Model training and recommendation completed!
