In [1832]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error, r2_score
from category_encoders import TargetEncoder
from sklearn.preprocessing import PowerTransformer

In [1833]:
# Load data
train_data = pd.read_csv("mod_04_hw_train_data.csv")
valid_data = pd.read_csv("mod_04_hw_valid_data.csv")

In [1834]:
# Initial Data Analysis (EDA)
print("Dataset Information:\n")
print(train_data.info())
print("\nDataset Shape:\n")
print(train_data.shape)
print("\nDescriptive Statistics:\n")
display(train_data.describe().transpose())
print("\nMissing Values:\n")
display(train_data.isnull().sum())
print("\nSample Data:\n")
display(train_data.head())

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           249 non-null    object 
 1   Phone_Number   249 non-null    object 
 2   Experience     247 non-null    float64
 3   Qualification  248 non-null    object 
 4   University     249 non-null    object 
 5   Role           246 non-null    object 
 6   Cert           247 non-null    object 
 7   Date_Of_Birth  249 non-null    object 
 8   Salary         249 non-null    int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 17.6+ KB
None

Dataset Shape:

(249, 9)

Descriptive Statistics:



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Experience,247.0,3.441296,1.496471,1.0,2.0,4.0,5.0,5.0
Salary,249.0,98186.746988,23502.622217,49500.0,78500.0,104500.0,116500.0,141500.0



Missing Values:



Name             0
Phone_Number     0
Experience       2
Qualification    1
University       0
Role             3
Cert             2
Date_Of_Birth    0
Salary           0
dtype: int64


Sample Data:



Unnamed: 0,Name,Phone_Number,Experience,Qualification,University,Role,Cert,Date_Of_Birth,Salary
0,Jennifer Hernandez,120-602-1220,3.0,Msc,Tier2,Mid,Yes,25/08/1972,98000
1,Timothy Walker,840-675-8650,5.0,PhD,Tier2,Senior,Yes,03/12/2013,135500
2,David Duran,556-293-8643,5.0,Msc,Tier2,Senior,Yes,19/07/2002,123500
3,Gloria Ortega,463-559-7474,3.0,Bsc,Tier3,Mid,No,19/02/1970,85000
4,Matthew Steele,968-091-7683,5.0,Bsc,Tier2,Senior,Yes,20/02/1970,111500


In [1835]:
# Remove rows with missing values
train_data_clean = train_data.dropna()
valid_data_clean = valid_data.dropna()

In [1836]:
# Specify the fields to be removed
fields_to_remove = ["Name", "Phone_Number"]

# Remove the specified fields from both datasets
train_data_clean = train_data_clean.drop(columns=fields_to_remove)
valid_data_clean = valid_data_clean.drop(columns=fields_to_remove)

In [1837]:
# Data Processing
numeric_features = train_data_clean.select_dtypes(
    include=["int64", "float64"]
).columns.drop("Salary")
categorical_features = train_data_clean.select_dtypes(include=["object"]).columns

In [1838]:
# Preprocessing numeric features
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
)

# Apply transformations to numeric features
X_train_num = numeric_transformer.fit_transform(train_data_clean[numeric_features])
X_valid_num = numeric_transformer.transform(valid_data_clean[numeric_features])

# Convert back to DataFrame for easy concatenation
X_train_num = pd.DataFrame(X_train_num, columns=numeric_features)
X_valid_num = pd.DataFrame(X_valid_num, columns=numeric_features)

In [1839]:
# Preprocessing categorical features
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("target", TargetEncoder(handle_unknown="ignore")),
    ]
)

# Apply transformations to categorical features
X_train_cat = categorical_transformer.fit_transform(
    train_data_clean[categorical_features], train_data_clean["Salary"]
)
X_valid_cat = categorical_transformer.transform(valid_data_clean[categorical_features])

# Convert back to DataFrame for easy concatenation
X_train_cat = pd.DataFrame(
    X_train_cat,
    columns=categorical_transformer.named_steps["target"].get_feature_names_out(
        categorical_features
    ),
)
X_valid_cat = pd.DataFrame(
    X_valid_cat,
    columns=categorical_transformer.named_steps["target"].get_feature_names_out(
        categorical_features
    ),
)

In [1840]:
# Concatenate numeric and categorical features
X_train = pd.concat([X_train_num, X_train_cat], axis=1)
X_valid = pd.concat([X_valid_num, X_valid_cat], axis=1)

y_train = train_data_clean["Salary"]
y_valid = valid_data_clean["Salary"]

In [1841]:
# Create and train the pipeline for train data
model_train = Pipeline(steps=[("regressor", KNeighborsRegressor())])

In [1842]:
# Grid search for best parameters
param_grid = {"regressor__n_neighbors": [3, 5, 7, 9, 11, 13, 15]}

In [1843]:
grid_search = GridSearchCV(
    model_train, param_grid, cv=5, scoring="neg_mean_absolute_percentage_error"
)

In [1844]:
grid_search.fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [249, 241]

In [None]:
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

In [None]:
# Display best parameters and score as a DataFrame
results_df = pd.DataFrame({
    'Best Parameters': [best_params],
    'Best MAPE': [f"{best_score:.2%}"]
})
display(results_df)


Unnamed: 0,Best Parameters,Best MAPE
0,{'regressor__n_neighbors': 3},2.88%


In [None]:
y_pred = best_model.predict(X_valid)
mape = mean_absolute_percentage_error(y_valid, y_pred)
r2 = r2_score(y_valid, y_pred)

In [None]:
# Display evaluation metrics as a DataFrame
evaluation_df = pd.DataFrame({
    'Validation MAPE': [f"{mape:.2%}"],
    'Validation R^2 Score': [f"{r2:.2f}"],
})
display(evaluation_df)

Unnamed: 0,Validation MAPE,Validation R^2 Score
0,8.80%,0.62
