**Setup & Load Data**

In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the dataset can be read from it
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np

# For modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

# For evaluation
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report

In [5]:
# Read the cleaned LinkedIn job postings CSV from the mounted Drive folder
DATA_PATH = '/content/drive/MyDrive/Project data set/completed task/cleaned_linkedin_jobs.csv'
df = pd.read_csv(DATA_PATH)

In [8]:
# Preview the first three rows to sanity-check the load
df.head(n=3)

Unnamed: 0,job_id,company_name,title,max_salary,pay_period,location,company_id,views,med_salary,min_salary,...,salary_id,name,company_size,state,country,city,address,url,skill_abr,skill_name
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,17.0,...,18531.0,Corcoran Sawyer Smith,2.0,NJ,US,Jersey City,242 Tenth Street,https://www.linkedin.com/company/corcoran-sawy...,MRKT,Marketing
1,921716,Corcoran Sawyer Smith,Marketing Coordinator,20.0,HOURLY,"Princeton, NJ",2774458.0,20.0,,17.0,...,18531.0,Corcoran Sawyer Smith,2.0,NJ,US,Jersey City,242 Tenth Street,https://www.linkedin.com/company/corcoran-sawy...,SALE,Sales
2,1829192,,Mental Health Therapist/Counselor,50.0,HOURLY,"Fort Collins, CO",,1.0,,30.0,...,8059.0,,,,,,,,HCPR,Health Care Provider


In [9]:
# Summarise column names, non-null counts and dtypes.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207531 entries, 0 to 207530
Data columns (total 39 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      207531 non-null  int64  
 1   company_name                204705 non-null  object 
 2   title                       206863 non-null  object 
 3   max_salary                  50764 non-null   float64
 4   pay_period                  61100 non-null   object 
 5   location                    207531 non-null  object 
 6   company_id                  204709 non-null  float64
 7   views                       204630 non-null  float64
 8   med_salary                  10336 non-null   float64
 9   min_salary                  50764 non-null   float64
 10  formatted_work_type         207531 non-null  object 
 11  applies                     40337 non-null   float64
 12  original_listed_time        207531 non-null  float64
 13  remote_allowed

**Predict Salary (Regression)**

In [11]:
# Show every column label of the loaded frame
df.columns

Index(['job_id', 'company_name', 'title', 'max_salary', 'pay_period',
       'location', 'company_id', 'views', 'med_salary', 'min_salary',
       'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'fips', 'salary_id', 'name',
       'company_size', 'state', 'country', 'city', 'address', 'url',
       'skill_abr', 'skill_name'],
      dtype='object')

In [16]:
# Rows are only usable for salary modelling when the target and every
# candidate feature are present, so drop any row missing one of these columns.
required_salary_cols = [
    'normalized_salary',
    'company_size',
    'remote_allowed',
    'formatted_experience_level',
    'state',
]
df_salary = df.dropna(subset=required_salary_cols)

In [17]:
# Predictor columns for the salary model
features = [
    'company_size',
    'remote_allowed',
    'formatted_experience_level',
    'state',
]
# Column the regression tries to predict
target = 'normalized_salary'

**Encode Categorical Features**

In [39]:
# Convert text-valued feature columns to integer codes so scikit-learn can fit on them.
# The fitted encoder for each column is kept in `label_encoders`, keyed by column
# name, so the same mapping can be reused later.
label_encoders = {}

# Work on an independent copy: df_salary comes from dropna(), and assigning columns
# into that result directly can trigger pandas' SettingWithCopyWarning.
df_salary = df_salary.copy()

for col in features:
    # Only object (string) columns need encoding; numeric columns are left untouched.
    if df_salary[col].dtype == 'object':
        le = LabelEncoder()
        # Fit on this column and replace its values with the integer codes.
        df_salary[col] = le.fit_transform(df_salary[col])
        label_encoders[col] = le

In [19]:
# Separate predictors from the target, then hold out 20% of the rows for
# evaluation; the fixed seed keeps the split repeatable.
X = df_salary.loc[:, features]
y = df_salary.loc[:, target]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit an ordinary least-squares baseline on the training rows
model_lr = LinearRegression().fit(X_train, y_train)

# Score the held-out rows; RMSE is the square root of the mean squared error,
# so it is expressed in the same units as the salary target.
y_pred = model_lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Salary Prediction RMSE: {rmse:.2f}")

Salary Prediction RMSE: 6745580.26


**Classifying Job Types (Classification)**

In [21]:
# Keep only rows where the label (work_type) and every classification
# feature are present.
required_classify_cols = [
    'work_type',
    'company_size',
    'remote_allowed',
    'formatted_experience_level',
    'state',
    'normalized_salary',
]
df_classify = df.dropna(subset=required_classify_cols)

In [38]:
# Encode the target (work_type) as integer class labels.
# Copy the frame that dropna() already filtered rather than re-deriving it from
# `df`: rebuilding from `df` with only a work_type filter would bring back rows
# whose feature columns are missing.
le_work_type = LabelEncoder()
df_classify = df_classify.copy()
df_classify['work_type_encoded'] = le_work_type.fit_transform(df_classify['work_type'])

In [23]:
# Predictors for the job-type classifier: the salary-model features plus the salary itself
features_cls = [
    'company_size',
    'remote_allowed',
    'formatted_experience_level',
    'state',
    'normalized_salary',
]
# Integer-encoded work_type column used as the class label
target_cls = 'work_type_encoded'

In [36]:
# Encode text-valued classification features as integer codes.
# Guarantee that no feature has missing values and take a copy so the column
# assignments below write to an independent frame (existing columns such as
# work_type_encoded are kept).
df_classify = df_classify.dropna(subset=features_cls).copy()

for col in features_cls:
    # Only object (string) columns need encoding; numeric columns are left untouched.
    if df_classify[col].dtype == 'object':
        le = LabelEncoder()
        # Fit on this column and replace its values with the integer codes.
        df_classify[col] = le.fit_transform(df_classify[col])
        # NOTE(review): entries are keyed by column name in the shared
        # `label_encoders` dict, so an encoder stored earlier under the same
        # name is replaced by the one fitted here — confirm that is intended.
        label_encoders[col] = le

In [25]:
# Separate predictors from the encoded label, then hold out 20% of the rows
# for evaluation; the fixed seed keeps the split repeatable.
X = df_classify.loc[:, features_cls]
y = df_classify.loc[:, target_cls]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [26]:
# Fit a random forest on the training rows; the seed makes the forest repeatable
model_rfc = RandomForestClassifier(random_state=42)
model_rfc.fit(X=X_train, y=y_train)

In [27]:
# Score the classifier on the held-out rows
y_pred = model_rfc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Job Type Classification Accuracy: {accuracy:.2f}")

# Pass the full list of encoded class ids alongside the class names. Without
# `labels`, classification_report infers the classes from y_test/y_pred and
# raises a ValueError when a rare class is absent from the split, because the
# number of names no longer matches. zero_division=0 reports 0 (without a
# warning) for classes that have no predictions or no true samples.
class_ids = list(range(len(le_work_type.classes_)))
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=le_work_type.classes_,
    zero_division=0,
))

Job Type Classification Accuracy: 0.91
              precision    recall  f1-score   support

    CONTRACT       0.82      0.73      0.77      1016
   FULL_TIME       0.94      0.96      0.95      7390
  INTERNSHIP       0.88      0.86      0.87        59
       OTHER       0.89      0.86      0.88        37
   PART_TIME       0.74      0.66      0.70       528
   TEMPORARY       0.73      0.59      0.66       103
   VOLUNTEER       1.00      1.00      1.00         1

    accuracy                           0.91      9134
   macro avg       0.86      0.81      0.83      9134
weighted avg       0.91      0.91      0.91      9134



**Grouping Job Titles by Skills (Clustering)**

In [35]:
# Keep only rows that list a skill. Filter once and copy, so the indicator
# matrix and the frame receiving the cluster labels come from the same rows
# and the new column is written to an independent frame.
df_skills = df.dropna(subset=['skill_name']).copy()

# Turn skill strings into a 0/1 indicator matrix (one column per skill, split on commas)
skills_matrix = df_skills['skill_name'].str.get_dummies(sep=',')

# Group rows into 5 clusters as a starting point; the seed keeps results repeatable
kmeans = KMeans(n_clusters=5, random_state=42)
# fit_predict returns one label per matrix row, in row order, so it lines up
# with df_skills positionally.
df_skills['skill_cluster'] = kmeans.fit_predict(skills_matrix)


# Number of rows assigned to each cluster
df_skills['skill_cluster'].value_counts()

Unnamed: 0_level_0,count
skill_cluster,Unnamed: 1_level_1
2,31843
0,13494
1,7177
3,4660
4,3385


**Save Trained Models to Google Drive**

In [29]:
# Mount Google Drive before saving models. Drive was already mounted by the
# first cell of this notebook, so this call reports "already mounted".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
import joblib
import os

# Destination folder on Drive; created if missing, reused otherwise
drive_path = "/content/drive/MyDrive/LinkedInProject/completed task/models"
os.makedirs(drive_path, exist_ok=True)

# Output file name -> fitted object to persist (written in this order)
artifacts = {
    "salary_model.pkl": model_lr,
    "jobtype_model.pkl": model_rfc,
    "label_encoders.pkl": label_encoders,
    "worktype_encoder.pkl": le_work_type,
}
for filename, fitted_obj in artifacts.items():
    joblib.dump(fitted_obj, f"{drive_path}/{filename}")

print("Models saved to Google Drive.")

Models saved to Google Drive.
