In [None]:
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
import pandas as pd
plt.rcParams['figure.figsize'] =  (20,10)


In [None]:
# Salary records from Cyber_salaries.csv.
df = pd.read_csv('/kaggle/input/infoseccyber-security-salaries/Cyber_salaries.csv')

# Country lookup: keep only the code and coordinate columns, under shorter names.
_country_columns = {"alfa2":"code","latitud":"lat","longitud":"long"}
country_cc = pd.read_csv("/kaggle/input/countries-codes-and-subordinates/countries_codes_and_coordinates.csv")
country_cc = country_cc.rename(columns=_country_columns)[list(_country_columns.values())]

# EDA

In [None]:
country_cc.info()

In [None]:
# Convert latitude/longitude (degrees) to 3-D Cartesian coordinates on a sphere.
# The spherical approximation is good enough here: the coordinates are only used
# to group nearby countries together.
R = 6371  # sphere radius; presumably the mean Earth radius in km
lat, longt = (np.deg2rad(country_cc[col]) for col in ("lat", "long"))
x = R * np.cos(lat) * np.cos(longt)
y = R * np.cos(lat) * np.sin(longt)
z = R * np.sin(lat)
# Swap the angular columns for the Cartesian ones.
country_cc = country_cc.assign(x=x, y=y, z=z).drop(columns=["lat","long"])

In [None]:
country_cc

In [None]:
from sklearn.cluster import KMeans

# Group countries into 25 clusters based on their Cartesian coordinates.
# random_state pins the centroid initialisation so the cluster ids (and every
# plot/model feature derived from them) are the same on each Restart & Run All.
km_res = KMeans(n_clusters=25, random_state=0)
# Only the fitted labels are needed, so fit() replaces the discarded fit_transform().
km_res.fit(country_cc[["x","y","z"]])
country_cc["country_cluster"] = km_res.labels_
# Reduce the lookup to: index = country code, single column = cluster id.
country_cc = country_cc[["code","country_cluster"]].set_index("code")


In [None]:
# Attach the cluster id of the employee's country of residence (left join on the code).
# rename_axis guarantees the index comes back under its original column name
# regardless of whether the join kept or dropped the index name.
df = (
    df.set_index("employee_residence")
    .join(country_cc)
    .rename_axis("employee_residence")
    .reset_index()
    .rename(columns={"country_cluster":"employee_residence_cluster"})
)
# Same lookup for the company's country.
df = (
    df.set_index("company_location")
    .join(country_cc)
    .rename_axis("company_location")
    .reset_index()
    .rename(columns={"country_cluster":"company_location_cluster"})
)


In [None]:
# Collapse the lookup frame to a Series (country code -> cluster id); this is the
# form get_city_cluster indexes into. Run once: afterwards country_cc is already a Series.
country_cc = country_cc["country_cluster"].copy()


In [None]:
country_cc

In [None]:
df.info()

In [None]:
# Store these two columns as strings so describe() and the plots treat them as categories.
df = df.astype(dict.fromkeys(("remote_ratio", "work_year"), "string"))

In [None]:
num_des=df.describe()
num_des

In [None]:
# Everything describe() does not summarise by default is treated as categorical.
numeric_columns = df.describe().columns
cat_des = df.drop(columns=numeric_columns)
cat_des.describe()


In [None]:
# log(1 + salary): the transformed salary is what every cat_plot call below visualises.
df["logsalary"] = df["salary_in_usd"].pipe(np.log1p)


In [None]:
def cat_plot(x,y,data=None):
    """Plot the distribution of a numeric column split by a categorical column.

    Draws two stacked panels on a new 20x20 figure: a layered histogram with KDE
    of ``x`` coloured by ``y`` (top), and a violin plot of ``x`` per level of
    ``y`` (bottom).

    Parameters
    ----------
    x : str
        Name of the numeric column to plot.
    y : str
        Name of the categorical column used for grouping.
    data : pandas.DataFrame, optional
        Frame holding both columns. Defaults to the notebook-level ``df`` so
        existing two-argument calls keep working.
    """
    if data is None:
        # Backward-compatible default: the global frame the notebook works on.
        data = df
    fig,(hist_ax,violin_ax) = plt.subplots(nrows=2,ncols=1,figsize=(20,20))
    sns.histplot(data=data,x=x,hue=y,multiple="layer",kde=True,ax=hist_ax)
    sns.violinplot(data=data,y=x,x=y,ax=violin_ax)

In [None]:
cat_plot("logsalary","work_year")

In [None]:
cat_plot("logsalary","experience_level")

In [None]:
# 1 when employment_type is exactly "FT", 0 for every other value.
df["is_ft"] = df["employment_type"].eq("FT").astype("int64")
cat_plot("logsalary","is_ft")

In [None]:
cat_plot("logsalary","remote_ratio")

In [None]:
cat_plot("logsalary","company_size")

### Now for the remaining categorical columns:
["employee_residence","salary_currency","job_title","company_location"]

In [None]:
df.groupby("employee_residence").size().sort_values(ascending=False)

In [None]:
cat_plot("logsalary","company_location_cluster")



In [None]:
# cat_plot("logsalary","employement_type")


In [None]:
cat_plot("logsalary","employee_residence_cluster")


In [None]:
# 1 when the employee resides in a different country than the company, else 0.
# Uses != so the flag agrees with its name and with the get_is_far_from_work
# transformer used in the modelling pipeline (the previous == inverted it).
df["far_from_work"] = (df["employee_residence"] != df["company_location"]).astype("int64")

In [None]:
cat_plot("logsalary","far_from_work")

In [None]:
cat_plot("logsalary","salary_currency")

In [None]:
# Use the last word of the job title as a coarse job label, then fold the
# listed labels into a single "Other" bucket.
merged_labels = ["Hunter","Lead","Hacker","3"]
last_word = df["job_title"].map(lambda title: title.rsplit(" ", 1)[-1])
df["job"] = last_word.replace(merged_labels,"Other")
df.groupby("job").size()

In [None]:
cat_plot("logsalary","job")

In [None]:
# df["sector"] = df.job_title.apply(lambda d: d.split(" ")[0])
# df.groupby("sector").size().sort_values(ascending=False)

In [None]:
# df["sector"].replace(
#     ["DevSecOps","DevOps"],
#     "DevOps").replace(
#     ["Azure","Infrastructure","Network"],
#     "Cloud")

In [None]:
# cat_plot("logsalary","sector")

# Predict Salary

In [None]:
from matplotlib import pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer 
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,get_scorer_names
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OrdinalEncoder,OneHotEncoder,StandardScaler
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict,cross_val_score,RandomizedSearchCV,GridSearchCV,cross_validate,train_test_split
from scipy.stats import uniform



In [None]:
df = pd.read_csv("/kaggle/input/infoseccyber-security-salaries/Cyber_salaries.csv")

In [None]:
df.columns.values  

In [None]:
df.experience_level.unique()

In [None]:
# Feature names of interest. Several of them (clusters, is_ft, far_from_work, job)
# are derived columns rather than columns of the freshly re-read CSV.
# NOTE(review): this list is not referenced by any later cell visible here.
features_to_keep = [
    'work_year', 'experience_level', 'salary_currency', 'remote_ratio', 'company_size',
    'employee_residence_cluster', 'company_location_cluster',
    'is_ft', 'far_from_work', 'job',
]
# Regression target: salary expressed in USD.
target = df["salary_in_usd"]

In [None]:
# Split into a fitting part and a held-out part.
# NOTE: the names are shifted relative to the usual convention. X_test/y_test is
# the FIRST (larger, training) split that the pipeline and model are fitted on;
# X_valid/y_valid is the held-out split. Names are kept because later cells use them.
# random_state makes the split, and therefore every reported metric, reproducible.
X_test,X_valid,y_test,y_valid = train_test_split(df.drop(columns=["salary_in_usd"]),target,random_state=0)

## Prepare SKlearn pipeline

In [None]:
class get_city_cluster(BaseEstimator,TransformerMixin):
    """Replace country codes with their geographic cluster id.

    Stateless transformer: every column of ``X`` is looked up in the
    notebook-level ``country_cc`` Series (index = country code, value = cluster id).
    Codes that are absent from ``country_cc`` become NaN instead of raising
    ``KeyError``, which matches the left join used in the EDA section and lets
    the transformer handle countries not present in the lookup table.
    """
    def fit(self,X,y=None):
        # Nothing to learn; the lookup table lives outside the transformer.
        return self
    def transform(self,X,y=None):
        # Column-wise lookup; the result keeps X's shape, index and column labels.
        return X.apply(lambda codes: codes.map(country_cc),axis=0)
    
class get_currency_mean(BaseEstimator,TransformerMixin):
    """Encode a currency column by the mean target value seen for that currency.

    ``fit`` stores, in ``self.res``, a frame indexed by currency with one column
    ``val`` holding the mean of ``y`` per currency, plus a ``"none"`` row with
    value 0. ``transform`` maps each input to its stored mean; currencies not
    seen during ``fit`` fall back to the ``"none"`` row.
    """
    def fit(self,X,y=None):
        per_currency = pd.DataFrame({"val":y,"cur": X.ravel()}).groupby("cur").mean()
        # Fallback entry used for currencies that were not present during fit.
        fallback = pd.DataFrame({"val": [0],"cur": ["none"]}).set_index("cur")
        self.res = pd.concat([per_currency,fallback])
        return self
    def transform(self,X,y=None):
        known = self.res.index
        keys = [cur if cur in known else "none" for cur in X.ravel()]
        # One row per input value, one column ("val").
        return self.res.loc[keys].values
    
class get_is_ft(BaseEstimator,TransformerMixin):
    """Stateless transformer producing a 0/1 column: 1 where the input equals "FT"."""
    def fit(self,X,y=None):
        # Nothing to learn.
        return self
    def transform(self,X,y=None):
        is_full_time = X.eq("FT").astype("int64")
        # Force a single-column 2-D array.
        return is_full_time.values.reshape(-1,1)
    
class get_printer(BaseEstimator,TransformerMixin):
    """Debugging pass-through: prints whatever reaches this pipeline step and returns it unchanged."""
    def fit(self,X,y=None):
        # Nothing to learn.
        return self
    def transform(self,X,y=None):
        # Side effect only; the data itself is not modified.
        print(X)
        return X
class get_is_far_from_work(BaseEstimator,TransformerMixin):
    """Stateless transformer: 1 where the first and second input columns differ, else 0."""
    def fit(self,X,y=None):
        # Nothing to learn.
        return self
    def transform(self,X,y=None):
        first_col = X.iloc[:,0]
        second_col = X.iloc[:,1]
        differs = first_col != second_col
        # Single-column 2-D array of 0/1 flags.
        return differs.astype("int64").values.reshape(-1,1)
    
class get_job(BaseEstimator,TransformerMixin):
    """Stateless transformer: reduce a title column to its last word, folding a few labels into "Other"."""
    def fit(self,X,y=None):
        # Nothing to learn.
        return self
    def transform(self,X,y=None):
        titles = X.iloc[:,0]
        # Text after the final space (the whole string when there is no space).
        last_word = titles.map(lambda title: title.rsplit(" ",1)[-1])
        labels = last_word.replace(["Hunter","Lead","Hacker","3"],"Other")
        # Single-column 2-D array.
        return labels.values.reshape(-1,1)

In [None]:
# Two-stage preprocessing. All columns are addressed by POSITION, so the result
# depends on the exact column order of the frame passed in.
# NOTE(review): the index -> column mapping below is inferred from the transformers
# used; verify against df.drop(columns=["salary_in_usd"]).columns.
prepare_pipeline = make_pipeline(make_column_transformer(
        # Stage 1: derive features from raw columns.
        # Columns 6 and 8 (presumably employee_residence / company_location) -> 2 cluster-id columns.
        (get_city_cluster(),[6,8]),
        # Column 2 (presumably employment_type) -> 1 column, 1 if "FT".
        (get_is_ft(),[2]),
        # Same two columns again -> 1 column, 1 if they differ.
        (get_is_far_from_work(),[6,8]),
        # Column 3 (presumably job_title) -> 1 column with the coarse job label.
        (get_job(),[3]),
        # Every column not listed above is appended unchanged after the derived ones.
        remainder="passthrough"
    ),
    # Stage 2: indices refer to the OUTPUT of stage 1: positions 0-1 clusters,
    # 2 is_ft, 3 far-from-work, 4 job label, 5+ the passed-through raw columns.
    make_column_transformer(
        # Position 8 (presumably salary_currency) -> mean target value per currency.
        (get_currency_mean(),[8]),
        # Position 4 (job label) -> one-hot; labels unseen during fit encode as all zeros.
        (OneHotEncoder(handle_unknown="ignore"),[4]),
        # Ordered categories, in the order of the category lists given; judging by
        # those lists: work year, experience level, remote ratio, company size.
        (OrdinalEncoder(categories=[[2020,2021,2022],["EN","MI","SE","EX"],[0,50,100],["S","M","L"]]),[5,6,9,10]),
        # Derived numeric features from stage 1 are kept as they are.
        ("passthrough",[0,1,2,3] ) ,
        # No remainder is given here, so stage-1 columns not listed (e.g. position 7) are dropped.
    )
    )


In [None]:
# prepare_pipeline = make_pipeline(make_column_transformer(
#         (get_city_cluster(),[6,8]),
#         (get_is_ft(),[2]),
#         (get_is_far_from_work(),[6,8]),
#         (get_job(),[3]),
#         remainder="passthrough"
#     ),
#     make_column_transformer(
#         (OneHotEncoder(handle_unknown="ignore"),[4,8]),
#         (OrdinalEncoder(categories=[[2020,2021,2022],["EN","MI","SE","EX"],[0,50,100],["S","M","L"]]),[5,6,9,10]),
#         ("passthrough",[0,1,2,3] ) ,
#     )
#     )

In [None]:
prepare_pipeline.fit(X_test,y_test)

In [None]:
# from scipy.stats import randint
# rsv = RandomizedSearchCV(RandomForestRegressor(bootstrap=True),
#     {
#         "n_estimators": randint(200,600), 
#         'max_depth': [40,80,120,160],
#         'min_samples_leaf': [1, 2, 4],
#         'min_samples_split': [2, 3, 5]
#     },random_state=0,n_iter=50,n_jobs=-1)


## Model Training And Evaluation

In [None]:
from xgboost import XGBRegressor


rnd = XGBRegressor(n_estimators=750, learning_rate=0.007,eval_metric="mae")

In [None]:
# rnd = RandomForestRegressor(**{'max_depth': 80,
#  'min_samples_leaf': 4,
#  'min_samples_split': 2,
#  'n_estimators': 300})

In [None]:
#rsv.fit(prepare_pipeline.transform(X_test),y=y_test)
#rnd.fit(prepare_pipeline.transform(X_test),y=y_test)

# Fit on the fitting split; the held-out split is passed as eval_set so its
# metric is reported during training (verbose=True).
fit_features = prepare_pipeline.transform(X_test)
holdout_features = prepare_pipeline.transform(X_valid)
rnd.fit(fit_features,y=y_test,eval_set=[(holdout_features,y_valid)],verbose=True)


In [None]:
#rnd = rsv.best_estimator_

In [None]:
# Predictions for the fitting split (X_test) and the held-out split (X_valid), in that order.
test_predictions, valid_predictions = (
    rnd.predict(prepare_pipeline.transform(split)) for split in (X_test, X_valid)
)

In [None]:
# MAE on the split the model was fitted on; arguments in the API order (y_true, y_pred).
mean_absolute_error(y_test,test_predictions)

In [None]:
test_results = pd.DataFrame({"raw":y_test,"predictions":test_predictions})

In [None]:
test_results.describe()

In [None]:
# MAE on the held-out split; arguments in the API order (y_true, y_pred).
mean_absolute_error(y_valid,valid_predictions)

In [None]:
valid_results = pd.DataFrame({"raw":y_valid,"predictions":valid_predictions})

In [None]:
valid_results.describe()

In [None]:
# Cross-validate preprocessing + model as ONE estimator on the raw frame.
# cross_validate clones and re-fits it inside every fold, so the transformers
# (notably the target-derived currency encoding) only ever see that fold's
# training rows. Transforming all rows with the already-fitted prepare_pipeline
# would leak target information into the validation folds.
cv_model = make_pipeline(prepare_pipeline,rnd)
cv = pd.DataFrame(
    cross_validate(
        cv_model,
        df.drop(columns=["salary_in_usd"]),
        df["salary_in_usd"],
        return_train_score=True,
        scoring="neg_mean_absolute_error",
        cv=50,
        n_jobs=-1,
    )
)

In [None]:
cv = cv.sort_values("test_score",ascending=False)
cv

In [None]:
sns.histplot(data=pd.DataFrame(cv.drop(columns=["fit_time","score_time"]).stack()).reset_index(),x=0,hue="level_1",kde=True)

In [None]:
# Scores are negative MAE; take absolute values once and reuse them for both layers.
abs_cv = cv.abs()
sns.regplot(data=abs_cv,x="train_score",y="test_score")
sns.lineplot(data=abs_cv,x="train_score",y="test_score")

<!-- # Random Forest Parameters study: -->

In [None]:
# n_estimators = np.array([  1,  13,  49, 111, 184, 233, 282,343,404, 465,551,600])
# max_depths = np.array([  5, 40,  80, 120, 150, 220, 300])
# max_features = np.array([0.25, 0.5 , 0.75, 1 ])

In [None]:
# from datetime import date
# import os

# td = date.today()
# os.mkdir(f"./results_{td}")
# inputs = prepare_pipeline.transform(df.drop(columns=["salary_in_usd"]))
# target = df["salary_in_usd"]
# counter = 0
# for est in n_estimators:
#     for dpt in max_depths:
#         print((est,dpt))
#         for ft in  max_features:
#             counter = counter+1
#             md = RandomForestRegressor(n_estimators=est,max_depth=dpt,max_features=ft)
#             results = pd.DataFrame(cross_validate(md,inputs,target,return_train_score=True,scoring="neg_mean_absolute_error",cv=75,n_jobs=-1))
#             results = results.sort_values("test_score")[["fit_time","test_score","train_score"]]
#             results["index"]=counter
#             results["n_estimators"]=est
#             results["max_depth"]=dpt
#             results["max_features"]=ft
#             results.to_csv(f"./results_{td}/random_forest_results_{counter}.csv")
            
            
            

In [None]:
# sns.barplot(data=pd.DataFrame(rnd.feature_importances_).reset_index(),x="index",y=0)