In [2]:
import pandas as pd

# Load the dataset
file_path = r"https://github.com/AnkitKolhe149/job-hunt-analysis/blob/main/JOBLIST.xlsx"
data = pd.read_excel(file_path, header=0)

# Inspect the data
print(data.head())
print(data.columns)


                 Job Title                                    Company  \
0    Data Scientist Intern                      SoulPage IT Solutions   
1    Junior Data Scientist                  Ray Business Technologies   
2           Data Scientist                                   Accusaga   
3           Data Scientist  Great River Financial Services Inc (GRFS)   
4  Data Scientist – Intern                  Indian School of Business   

    Location       Salary  
0  Hyderabad   ₹3L - ₹4L   
1  Hyderabad  ₹5L - ₹19L   
2      India         ₹8L   
3      India   ₹5L - ₹9L   
4  Hyderabad  ₹5L - ₹19L   
Index(['Job Title', 'Company', 'Location', 'Salary'], dtype='object')


In [10]:
# Drop unnecessary columns (e.g., 'Unnamed: 4') if they exist
data = data.drop(columns=['Unnamed: 4'], errors='ignore')

# Drop rows with missing values in the 'Salary' column
data = data.dropna(subset=['Salary'])

# Inspect the cleaned data
print(data.head())


                 Job Title                                    Company  \
0    Data Scientist Intern                      SoulPage IT Solutions   
1    Junior Data Scientist                  Ray Business Technologies   
2           Data Scientist                                   Accusaga   
3           Data Scientist  Great River Financial Services Inc (GRFS)   
4  Data Scientist – Intern                  Indian School of Business   

    Location       Salary  
0  Hyderabad   ₹3L - ₹4L   
1  Hyderabad  ₹5L - ₹19L   
2      India         ₹8L   
3      India   ₹5L - ₹9L   
4  Hyderabad  ₹5L - ₹19L   


In [22]:
def extract_salary(salary_str):
    salary_str = str(salary_str)
    salary_range = []
    
    # Split the salary range (e.g., "₹3L - ₹4L" or "₹40T - ₹50T")
    for s in salary_str.split('-'):
        s = s.strip().replace('₹', '').replace(',', '')  # Remove currency symbols and commas
        
        # Convert 'L' to lakh (1L = 100,000) and 'T' to thousand (1T = 1,000)
        if 'L' in s:
            salary_range.append(float(s.replace('L', '')) * 100000)
        elif 'T' in s:
            salary_range.append(float(s.replace('T', '')) * 1000)
        else:
            salary_range.append(float(s))  # Handle plain numeric values if any
    
    return np.mean(salary_range) if salary_range else None  # Use the average of the range

# Apply the updated function to the Salary column
data['Salary'] = data['Salary'].apply(extract_salary)


# Apply log transformation
data['Salary'] = np.log1p(data['Salary'])

# Check the transformation
print(data['Salary'].head())

0    0.063704
1    0.205253
2    0.141129
3    0.124434
4    0.205253
Name: Salary, dtype: float64


In [25]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

# Define features and target variable
X = data[['Job Title', 'Company', 'Location']]  # Adjust as per actual features after encoding
y = data['Salary']

# Feature encoding
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_features = encoder.fit_transform(X[['Job Title', 'Company', 'Location']])

# Normalize features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(encoded_features)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(scaled_features, y, test_size=0.2, random_state=42)


In [33]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Set up the Ridge Regression model
ridge = Ridge()

# Hyperparameter grid search for 'alpha'
param_grid = {'alpha': np.logspace(-6, 6, 13)}  # Search for a wide range of alpha values
grid_search = GridSearchCV(ridge, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best alpha found through grid search
best_alpha = grid_search.best_params_['alpha']
print(f"Best alpha: {best_alpha}")

# Train the Ridge model with the best alpha
ridge_best = Ridge(alpha=best_alpha)
ridge_best.fit(X_train, y_train)

# Predict on the test set
y_pred = ridge_best.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")


Best alpha: 10000.0
Mean Squared Error (MSE): 0.00865922394764067
R-squared (R²): -0.009549902731481819
