In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

In [7]:
# Read the salary records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Salary Data.csv')

In [9]:
# Peek at the first five rows
df.head(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [10]:
# Peek at the last five rows
df.tail(n=5)

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
370,35.0,Female,Bachelor's,Senior Marketing Analyst,8.0,85000.0
371,43.0,Male,Master's,Director of Operations,19.0,170000.0
372,29.0,Female,Bachelor's,Junior Project Manager,2.0,40000.0
373,34.0,Male,Bachelor's,Senior Operations Coordinator,7.0,90000.0
374,44.0,Female,PhD,Senior Business Analyst,15.0,150000.0


In [11]:
# Count missing values per column
df.isna().sum(axis=0)

Unnamed: 0,0
Age,2
Gender,2
Education Level,2
Job Title,2
Years of Experience,2
Salary,2


In [12]:
# Only the target has to be present for a row to be usable in training.
# Rows that are merely missing a feature value are kept: the preprocessing
# step defined later in this notebook imputes numeric and categorical gaps,
# so dropping those rows here would discard data for no benefit.
df_clean = df.dropna(subset=['Salary'])

In [13]:
# Target: the Salary column; features: every remaining column
y = df_clean['Salary']
X = df_clean.drop(columns=['Salary'])

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Column groups, each handled by its own transformer
categorical_features = ['Gender', 'Education Level', 'Job Title']
numerical_features = ['Age', 'Years of Experience']

# Numeric columns: fill missing values with the column mean
numerical_transformer = SimpleImputer(strategy='mean')

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode; categories not seen during fit do not raise at
# transform time (handle_unknown='ignore')
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [16]:
# Estimator: ordinary least-squares linear regression with an intercept term
model = LinearRegression(fit_intercept=True)

In [18]:
# Chain preprocessing and the regressor so both are fitted and applied together
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

In [19]:
# Train the full pipeline (preprocessing + regression) on the training split
clf.fit(X=X_train, y=y_train)

In [20]:
# Predict salaries for the held-out rows and print the resulting array
predictions = clf.predict(X=X_test)
print(predictions)

[179927.39909837  99014.90078334 119092.58207287 100000.34342939
 130627.64400715 144229.84269612 139985.0062202  119342.21747047
  43868.43613686  98210.7463636  134732.55115015 137195.60124405
  58495.5660977   87463.2251232  116805.65555233  43868.43613686
  76788.7993223   64863.36617191  96211.92102463  92944.20081348
 118858.41020181 111995.24400191 134622.90548121  50278.44494725
 100000.34342939  58236.8284042  164005.65522752  41521.26357646
 110216.02112658 155157.88538052  63762.11164782 111817.23270391
  69547.59165383 143124.40156912 109491.05898961  29943.74985446
 169913.18558229  68404.12839356 149300.83939778  52744.22054093
  46661.41132187  91728.96297631 144246.59485909 177640.47257783
  63449.84724563 139750.83434915 106611.32401416  65435.09780205
  48856.66809971  62618.64838755 166143.60711511  98126.76903128
 129190.65968452  92476.23255274 110000.56134812 115612.91607978
 165465.92316714  61913.64740092 179927.39909837  96754.3199635
 109072.55786631 148347.59

In [24]:
# Evaluate on the held-out split; for a regressor pipeline, score() reports R^2
clf.score(X=X_test, y=y_test)

0.8522462840519655