# EMPLOYEE SALARY PREDICTION

### Importing Libraries

In [25]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error 

### Load the dataset 

In [28]:
# Read the salary records; the relative path is resolved against the
# kernel's current working directory.
DATA_PATH = "salary_data.csv"
df = pd.read_csv(DATA_PATH)

### Display dataset preview 

In [31]:
# Show a caption followed by the first five rows as a quick sanity check
# of the loaded columns.
print("Dataset Preview:", df.head(), sep="\n")

Dataset Preview:
   years_experience  qualification   industry location  salary
0                 1              1         IT    CityA   35000
1                 3              2  Marketing    CityB   48000
2                 5              3    Finance    CityC   65000
3                 2              1         IT    CityA   37000
4                 4              2  Marketing    CityB   52000


### One-hot encoding for categorical features (industry and location) 

In [34]:
# One-hot encode the categorical columns. drop_first=True keeps k-1 dummy
# columns per feature, so the first level of each acts as the baseline.
#
# The guard makes this cell safe to re-run: once `df` has been encoded the raw
# 'industry' / 'location' columns no longer exist, and calling get_dummies
# again would raise a KeyError. If only one of the two columns is present the
# call still runs (and fails loudly), because that indicates malformed input
# rather than an already-encoded frame.
categorical_cols = ['industry', 'location']
if any(col in df.columns for col in categorical_cols):
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

### Features and target 

In [37]:
# Feature matrix: the two numeric predictors plus every one-hot column that
# was generated for industry and location.
# A prefix match is used instead of a substring test so that an unrelated
# column such as 'relocation_bonus' (which contains 'location_') can never be
# pulled into the feature set by accident.
numeric_features = ['years_experience', 'qualification']
dummy_features = [
    col for col in df.columns
    if col.startswith(('industry_', 'location_'))
]
X = df[numeric_features + dummy_features]

# Target vector: the salary to be predicted.
y = df['salary']

### Split the data into training and testing sets 

In [40]:
# Hold out a fifth of the rows for evaluation; the fixed seed keeps the
# split identical between runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

### Initialize and train the Linear Regression model 

In [43]:
# Ordinary least-squares regression with default settings, fitted on the
# training split only. The fit call is left as the last expression so the
# cell still displays the fitted estimator.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

### Make predictions 

In [46]:
# Predicted salaries for the held-out rows, in the same row order as X_test.
y_pred = model.predict(X=X_test)

### Evaluate the model 

In [49]:
# Mean squared error between the true and predicted salaries of the test
# split (reported in squared salary units).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 15869286.972575957


### Ensure the new profile matches the training data structure  

In [52]:
# Profile to score, expressed with the one-hot column names produced during
# training: 5 years of experience, qualification level 3, industry IT,
# location CityB.
profile = {
    'years_experience': 5,
    'qualification': 3,
    'industry_IT': 1,
    'industry_Marketing': 0,  # Ensure this matches training columns
    'location_CityB': 1,
    'location_CityC': 0,
}

# reindex puts the columns in the exact training order and fills any dummy
# column the profile does not mention with 0. Passing `columns=` to the
# DataFrame constructor instead would leave such columns as NaN, which
# scikit-learn's LinearRegression rejects at predict time.
new_data = pd.DataFrame([profile]).reindex(columns=X_train.columns, fill_value=0)

### Predict salary for the specific profile 

In [55]:
# predict returns one value per input row; new_data has a single row, so the
# first element is the salary estimate for that profile.
predicted_salary = model.predict(X=new_data)
print(f"Predicted salary for the new profile: {predicted_salary[0]:.2f}")

Predicted salary for the new profile: 60950.11
