# Trenowanie modeli

## Pobieranie danych

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# skills_classified_title_only = pd.read_csv("skills_classified_title_only.csv")
# skills_classified_title_only.head(3)

In [None]:
# skills_classified_title_only.shape

In [None]:
# Load the scraped offers that carry salary information.
# The path used to walk "../Data analysis/.." back and forth; this is the same
# file, spelled the way the later cells of this notebook spell it.
all_data_with_salary = pd.read_csv("../Data analysis/wszystkie_dane_tylko_z_salary.csv")
all_data_with_salary.head(3)

In [None]:
# (rows, columns) of the loaded dataset.
all_data_with_salary.shape

## Przygotowanie danych

### Remove the time of day from scraped_at

In [None]:
# Raw scraped_at values, before the time part is stripped.
all_data_with_salary.loc[:, "scraped_at"]

In [None]:
# Keep only the text before the first space, i.e. the date part of the timestamp.
all_data_with_salary.loc[:, "scraped_at"] = all_data_with_salary.loc[:, "scraped_at"].str.split(" ").str[0]

In [None]:
# First few scraped_at values after the split.
all_data_with_salary.loc[:, "scraped_at"].head()

In [None]:
# Same split applied to one sample timestamp string, as a sanity check.
"2025-04-03 20:43:33 UTC".split(" ")[0]

### all_data_with_salary.head()

In [None]:
# (rows, columns) after the scraped_at clean-up.
all_data_with_salary.shape

In [None]:
# The first 12 columns; the cells below treat everything from position 12 on as skill columns.
all_data_with_salary.columns[:12]

In [None]:
# Columns at positions 12 and 13 (the start of the skill block).
all_data_with_salary.columns[12:14]

### Pogrupowanie danych według dat

#### Count each skill and sort from most to least frequent

In [None]:
# Label-based slice: every column from ".Net" through the last one.
all_data_with_salary.loc[:, ".Net":]

In [None]:
# Count each distinct value per skill column, then add the counts up across columns.
all_data_with_salary.iloc[:, 12:].apply(pd.Series.value_counts).sum(axis=1)


In [None]:
# Boolean mask: True where a skill column holds the value 1.
all_data_with_salary.iloc[:, 12:] == 1

#### Find the top 100 (or fewer) skills

In [None]:
# Number of rows flagged with 1 for every skill column, most frequent first.
skill_flags = all_data_with_salary.iloc[:, 12:]
top_skills = skill_flags.eq(1).sum().sort_values(ascending=False)
top_skills

In [None]:
# Only the skills that occur at least once.
top_skills[top_skills > 0]

In [None]:
# Summary statistics of the per-skill occurrence counts.
top_skills.describe()

In [None]:
# How many skills occur more than 8 times.
int((top_skills > 8).sum())

In [None]:
# Keep skills seen more than 8 times, ranked by frequency and capped at 100 entries.
frequent_enough = top_skills > 8
top_skills = top_skills.loc[frequent_enough].sort_values(ascending=False).iloc[:100]
top_skills

In [None]:
# The same counts printed twice: as a NumPy array and as a plain Python list.
print(top_skills.values)
print(top_skills.tolist())

In [None]:
# Skill names and their occurrence counts as plain lists, in ranking order.
# (The bare `top_skills.index` expression that used to open this cell had no effect.)
top_skills_names = top_skills.index.tolist()
top_skills_values = top_skills.tolist()
print(top_skills_values)
print(top_skills_names)

#### Group by dates

In [None]:
# Non-null "Python" entries per scrape date, regardless of whether the flag is 0 or 1.
all_data_with_salary.groupby("scraped_at")["Python"].count() # Count all rows (with 0 and 1)

In [None]:
# Sum of the "Python" column per scrape date.
all_data_with_salary.groupby("scraped_at")["Python"].sum() # Sum values

In [None]:
# Frequency of each distinct "Python" value within every scrape date.
all_data_with_salary.groupby("scraped_at")["Python"].value_counts()

In [None]:
# NOTE: .sum() is called on the result of apply(), not inside the lambda, so this
# appears to give one overall total rather than a per-date count (compare the next cell).
all_data_with_salary.groupby("scraped_at")["Python"].apply(lambda x: (x == 1)).sum()

In [None]:
# Rows with "Python" == 1, counted separately for every scrape date.
all_data_with_salary.groupby("scraped_at")["Python"].apply(lambda x: (x == 1).sum())

##### Group all top skills

In [None]:
# Per scrape date, the number of rows flagged with 1 for each of the top skills.
all_data_with_salary.groupby("scraped_at")[top_skills_names].apply(lambda x: (x == 1).sum())

In [None]:
# First rows of the dataset.
all_data_with_salary.head()

In [None]:
# Mean max_salary per scrape date and experience level, drawn as one line per level.
grouped_max_salary = (
    all_data_with_salary
    .groupby(["scraped_at", "experience"])["max_salary"]
    .mean()
    .reset_index()
)

plt.figure(figsize=(14, 7))
for seniority in grouped_max_salary["experience"].unique():
    print(seniority)
    subset = grouped_max_salary[grouped_max_salary["experience"] == seniority]
    # Label every line so the experience levels can be told apart in the legend.
    plt.plot(subset["scraped_at"], subset["max_salary"], marker="o", label=seniority)

# Tick rotation does not depend on the loop variable, so it is set once.
plt.xticks(rotation=55)
plt.legend(title="Poziom doświadczenia")
plt.grid(alpha=0.3)
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# NOTE: df is an alias of all_data_with_salary (no copy is made), so converting
# scraped_at to datetime here also changes that column on the original frame.
df = all_data_with_salary
df['scraped_at'] = pd.to_datetime(df['scraped_at'])
df = df.sort_values('scraped_at')

# Daily mean / lowest / highest min_salary, each as a two-column frame.
min_salary_by_day = df.groupby('scraped_at')['min_salary']
daily_means = min_salary_by_day.mean().reset_index()
daily_mins = min_salary_by_day.min().reset_index()
daily_maxs = min_salary_by_day.max().reset_index()

# (frame, line colour, legend label) for every curve of the first figure.
daily_curves = (
    (daily_means, 'navy', 'Min mean salary'),
    (daily_mins, 'red', 'Min min salary'),
    (daily_maxs, 'green', 'Min max salary'),
)

plt.figure(figsize=(12, 6))
for frame, colour, caption in daily_curves:
    plt.plot(
        frame['scraped_at'],
        frame['min_salary'],
        marker='o',
        linestyle='-',
        color=colour,
        label=caption,
    )

plt.xticks(rotation=45)
plt.legend()
plt.title('Średnie dzienne wynagrodzenia', pad=20, fontsize=14)
plt.xlabel('Data', labelpad=15)
plt.ylabel('Wynagrodzenie (PLN)', labelpad=15)
plt.grid(True, alpha=0.5)
plt.tight_layout()
plt.show()

# Second figure: mean min_salary per day, one line per experience level.
grouped = (
    df.groupby(['scraped_at', 'experience'])['min_salary']
    .mean()
    .reset_index()
)

plt.figure(figsize=(14, 7))
for seniority in grouped['experience'].unique():
    subset = grouped.loc[grouped['experience'] == seniority]
    plt.plot(subset['scraped_at'], subset['min_salary'], marker='o', label=seniority)

plt.title('Średnie wynagrodzenia z podziałem na seniority')
plt.legend(title='Poziom doświadczenia')
plt.show()


### Predict salaries

#### Without scaler

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
# First two rows of the 12 leading columns.
all_data_with_salary.iloc[:, :12].head(2)

In [None]:
# Names of the 12 leading columns, from which the model features are picked.
all_data_with_salary.columns[:12]

In [None]:
# Categorical offer attributes used as model inputs in the cells below.
features = ['location', 'type_of_work', 'experience', 'employment_type', 'operating_mode']

In [None]:
# Missing-value count for each feature column.
all_data_with_salary[features].isna().sum()

In [None]:
# Feature matrix: the selected categorical columns, still un-encoded.
X = all_data_with_salary[features]
X.head()

In [None]:
# The first expression is not the last statement of the cell, so only the
# printed type of the max_salary column is shown.
all_data_with_salary["max_salary"].head()
print(type(all_data_with_salary["max_salary"].head()))

In [None]:
# Regression target: the max_salary column.
y = all_data_with_salary["max_salary"]

In [None]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the categorical features into a dense array.
# NOTE: the encoder is fitted on all rows, i.e. before the train/test split below.
origin_columns = all_data_with_salary[features]
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(origin_columns)

In [None]:
# Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42) 

In [None]:
# Ordinary least-squares model fitted on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# Predictions for the held-out rows.
y_pred = model.predict(X_test)
y_pred

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt

# Score the hold-out predictions with the usual regression metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One (caption, value) pair per printed line.
metric_report = (
    ('Błąd średniokwadratowy', mse),
    ('Pierwiastek z błędu średniokwadratowego', rmse),
    ('Błąd średni bezwzględny', mae),
    ('Współczynnik determinacji (R^2)', r2),
)
for caption, value in metric_report:
    print(f'{caption}: {value:.2f}')

In [None]:
# Map the one-hot matrix back to the original category labels.
original_features = encoder.inverse_transform(X_encoded)
original_features

In [None]:
# First 10 decoded values of column 2, the third entry of `features` ('experience').
original_features[:, 2][:10]

In [None]:
# Actual and predicted values plotted against their position in the test set.
plt.scatter(range(len(y_test)), y_test)
plt.scatter(range(len(y_pred)), y_pred)

In [None]:
# Signed prediction error (prediction minus actual) for every test row.
errors = y_pred - y_test
plt.scatter(range(len(errors)), errors)

#### With PCA (n_components = 3, for testing)

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
# Same features and target as before, this time reduced with PCA before the regression.
features = ['location', 'type_of_work', 'experience', 'employment_type', 'operating_mode']
X = all_data_with_salary[features]
y = all_data_with_salary["max_salary"]

# NOTE(review): no sparse_output=False here, so the encoder output is presumably a
# sparse matrix — confirm the installed scikit-learn PCA accepts sparse input.
one_hot_encoder = OneHotEncoder()
X_encoded = one_hot_encoder.fit_transform(X)
print(X_encoded.shape)

X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# The projection is learned on the training rows and then applied to the test rows.
pca = PCA(n_components=3)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

model = LinearRegression()
model.fit(X_train_pca, y_train)

# Show the first 10 actual values next to the first 10 predictions.
y_pred = model.predict(X_test_pca)
print(y_test.iloc[:10].values.tolist())
y_pred[:10]

In [None]:
# Distribution of the signed prediction errors.
errors = y_pred - y_test
plt.hist(errors, bins=15, edgecolor="black")

In [None]:
# Actual and predicted values plotted against their position in the test set.
plt.scatter(range(len(y_test)), y_test)
plt.scatter(range(len(y_pred)), y_pred)

In [None]:
# Score the PCA-based predictions with the same four regression metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One (caption, value) pair per printed line.
metric_report = (
    ('Błąd średniokwadratowy', mse),
    ('Pierwiastek z błędu średniokwadratowego', rmse),
    ('Błąd średni bezwzględny', mae),
    ('Współczynnik determinacji (R^2)', r2),
)
for caption, value in metric_report:
    print(f'{caption}: {value:.2f}')

#### With Pipelines

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Feature names currently in use.
features

In [None]:
# One-hot encode the feature columns; any other column would be passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehotencoder", OneHotEncoder(), features)
    ],
    remainder="passthrough"
)

# Preprocessing-only pipeline (no estimator attached yet).
pipeline = Pipeline([
    ("preprocessor", preprocessor),
])

# NOTE: fitted on the full X, i.e. before the train/test split in the next cell.
X_preprocessed = pipeline.fit_transform(X)
X_preprocessed

In [None]:
# Split the pre-encoded matrix, fit a plain linear model and score it on the hold-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_preprocessed, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One (caption, value) pair per printed line.
metric_report = (
    ('Błąd średniokwadratowy', mse),
    ('Pierwiastek z błędu średniokwadratowego', rmse),
    ('Błąd średni bezwzględny', mae),
    ('Współczynnik determinacji (R^2)', r2),
)
for caption, value in metric_report:
    print(f'{caption}: {value:.2f}')

#### Pipeline with PCA

In [None]:
# Split the raw (un-encoded) features so the whole pipeline is fitted on training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # The encoder now learns its categories from X_train alone, so a value that
        # occurs only in X_test has to be ignored instead of raising at predict time.
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), features)
    ],
    remainder="passthrough"
)

# Encode -> project onto 3 principal components -> linear regression.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("pca", PCA(n_components=3)),
    ("regressor", LinearRegression()),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Błąd średniokwadratowy: {mse:.2f}')
print(f'Pierwiastek z błędu średniokwadratowego: {rmse:.2f}')
print(f'Błąd średni bezwzględny: {mae:.2f}')
print(f'Współczynnik determinacji (R^2): {r2:.2f}')

#### Pipeline with cross validation 

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
# Hold out 20% of the rows; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # Categories missing from a training fold must not abort the fit/score of that fold.
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), features)
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("pca", PCA()),
    ("regressor", LinearRegression()),
])

# Searched: intercept on/off and 1..4 principal components.
param_grid = {
    "regressor__fit_intercept": [False, True],
    "pca__n_components": list(np.arange(1,5))
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)
# The scorer is negated RMSE, so flip the sign to report a positive error.
print(-grid_search.best_score_)

# Evaluate the refitted best pipeline on the hold-out rows. Previously the metrics
# reused a y_pred left over from an earlier cell and never scored this model.
y_pred = grid_search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Błąd średniokwadratowy: {mse:.2f}')
print(f'Pierwiastek z błędu średniokwadratowego: {rmse:.2f}')
print(f'Błąd średni bezwzględny: {mae:.2f}')
print(f'Współczynnik determinacji (R^2): {r2:.2f}')

In [None]:
from sklearn.linear_model import Ridge, Lasso

In [None]:
# Hold out 20% of the rows; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # Categories missing from a training fold must not abort the fit/score of that fold.
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), features)
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("pca", PCA()),
    ("regressor", Ridge()),
])

# Searched: intercept on/off and 1..4 principal components.
param_grid = {
    "regressor__fit_intercept": [False, True],
    "pca__n_components": list(np.arange(1,5))
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)
# The scorer is negated RMSE, so flip the sign to report a positive error.
print(-grid_search.best_score_)

# Evaluate the refitted best pipeline on the hold-out rows. Previously the metrics
# reused a y_pred left over from an earlier cell and never scored this model.
y_pred = grid_search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Błąd średniokwadratowy: {mse:.2f}')
print(f'Pierwiastek z błędu średniokwadratowego: {rmse:.2f}')
print(f'Błąd średni bezwzględny: {mae:.2f}')
print(f'Współczynnik determinacji (R^2): {r2:.2f}')

In [None]:
# Hold out 20% of the rows; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # Categories missing from a training fold must not abort the fit/score of that fold.
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), features)
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("pca", PCA()),
    ("regressor", Lasso()),
])

# Searched: intercept on/off and 1..4 principal components.
param_grid = {
    "regressor__fit_intercept": [False, True],
    "pca__n_components": list(np.arange(1,5))
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_estimator_)
# The scorer is negated RMSE, so flip the sign to report a positive error.
print(-grid_search.best_score_)

# Evaluate the refitted best pipeline on the hold-out rows. Previously the metrics
# reused a y_pred left over from an earlier cell and never scored this model.
y_pred = grid_search.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Błąd średniokwadratowy: {mse:.2f}')
print(f'Pierwiastek z błędu średniokwadratowego: {rmse:.2f}')
print(f'Błąd średni bezwzględny: {mae:.2f}')
print(f'Współczynnik determinacji (R^2): {r2:.2f}')

#### Train model RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the rows; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # Categories missing from a training fold must not abort the fit/score of that fold.
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), features)
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(random_state=42)),
])

# Searched: forest size and minimum leaf size; n_jobs has a single value (-1),
# so it is fixed rather than tuned.
param_grid = {
    "regressor__n_estimators": np.arange(30, 150, 20),
    "regressor__min_samples_leaf": np.arange(1, 5, 2),
    "regressor__n_jobs": [-1],
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

# Predictions of the refitted best pipeline on the hold-out rows.
y_pred = grid_search.predict(X_test)
print(y_pred)

print(f"Best params {grid_search.best_params_}")
print(f"Best estimator {grid_search.best_estimator_}")
# The scorer is negated RMSE, so flip the sign to report a positive error.
print(f"Best score {-grid_search.best_score_}")

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Błąd średniokwadratowy: {mse:.2f}')
print(f'Pierwiastek z błędu średniokwadratowego: {rmse:.2f}')
print(f'Błąd średni bezwzględny: {mae:.2f}')
print(f'Współczynnik determinacji (R^2): {r2:.2f}')

#### TRY WITH XGBoost

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBRegressor

# Hold out 20% of the rows; the grid search only ever sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # Categories missing from a training fold must not abort the fit/score of that fold.
        ("onehotencoder", OneHotEncoder(handle_unknown="ignore"), features)
    ],
    remainder="passthrough"
)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    # An earlier attempt used objective='binary:logistic' with learning_rate=1;
    # that objective is meant for classification, not regression.
    ("regressor", XGBRegressor(objective='reg:squarederror', learning_rate=0.1)),
])

# Searched: number of boosting rounds and tree depth.
param_grid = {
    "regressor__n_estimators": np.arange(40, 120, 20),
    "regressor__max_depth": np.arange(4, 6, 1),
}

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5, scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

# Predictions of the refitted best pipeline on the hold-out rows.
y_pred = grid_search.predict(X_test)
print(y_pred)

mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Błąd średniokwadratowy: {mse:.2f}')
print(f'Pierwiastek z błędu średniokwadratowego: {rmse:.2f}')
print(f'Błąd średni bezwzględny: {mae:.2f}')
print(f'Współczynnik determinacji (R^2): {r2:.2f}')

print(f"Best params {grid_search.best_params_}")
print(f"Best estimator {grid_search.best_estimator_}")
# The scorer is negated RMSE, so flip the sign to report a positive error.
print(f"Best RMSE {-grid_search.best_score_}")

#### Plot skills popularity by dates

In [None]:
pip install pandas matplotlib prophet

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet


# Per scrape date, the number of rows flagged with 1 for every top skill.
df = all_data_with_salary.groupby("scraped_at")[top_skills_names].apply(lambda x: (x == 1).sum())
df.index = pd.to_datetime(df.index)
df = df.sort_index()
skills = df.columns

# Number of future days passed to make_future_dataframe.
FORECAST_DAYS = 14
# skill name -> forecast frame returned by Prophet.
forecast_results = {}

for skill in skills:
    # Prophet expects the columns to be called 'ds' (date) and 'y' (value).
    skill_df = df[[skill]].reset_index()
    skill_df.columns = ['ds', 'y']
    
    # NOTE(review): the counts are aggregated per date — confirm that
    # daily_seasonality=True is really intended for this data.
    model = Prophet(daily_seasonality=True)
    model.fit(skill_df)
    future = model.make_future_dataframe(periods=FORECAST_DAYS)
    forecast = model.predict(future)
    forecast_results[skill] = forecast

    plt.figure(figsize=(10, 4))
    plt.plot(skill_df['ds'], skill_df['y'], label='Wartości historyczne')
    # The forecast line itself is currently disabled; only its interval band is drawn:
    # plt.plot(forecast['ds'], forecast['yhat'], label='Prognoza')
    plt.fill_between(forecast['ds'], forecast['yhat_lower'], forecast['yhat_upper'], alpha=0.2, label='Przedział ufności')
    plt.title(f"Popularność skilla: {skill}")
    plt.xlabel('Data')
    plt.ylabel('Liczba wystąpień')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


## Inne modele

### Przewidywanie wynagrodzenia na podstawie posiadanych umiejętności w CV

In [None]:
# Start with an empty list of results.
results = []

In [None]:
# Show what has been collected so far.
results

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# MAE values of the models trained in this cell are appended here.
results = []
# An earlier version read 'job_offers.csv' instead:
# df = pd.read_csv('job_offers.csv')
df = pd.read_csv('../Data analysis/wszystkie_dane_tylko_z_salary.csv')
# Path without the extension, kept for reference: ../Data analysis/wszystkie_dane_tylko_z_salary

def parse_salary(salary):
    """Convert a raw salary string into a single number.

    Returns NaN for missing values and for the "Missing salary" placeholder,
    the amount itself for a single value, and the midpoint for a
    "low-high" range.

    Any whitespace (including non-breaking spaces used as thousands
    separators) is removed, and en/em dashes are accepted as range
    separators in addition to the plain hyphen.

    NOTE(review): the unit suffix (PLN/month, PLN/h, PLN/year) is only
    stripped, never converted, so hourly, monthly and yearly amounts come
    back on their own scales — confirm the data holds a single unit.

    Raises
    ------
    ValueError
        If a bound is not an integer literal after cleaning.
    """
    if pd.isna(salary) or salary == "Missing salary":
        return np.nan

    cleaned = str(salary)
    for unit in ("PLN/month", "PLN/h", "PLN/year"):
        cleaned = cleaned.replace(unit, "")
    # str.split() without arguments splits on every Unicode whitespace character.
    cleaned = "".join(cleaned.split())
    bounds = cleaned.replace("\u2013", "-").replace("\u2014", "-").split("-")

    if len(bounds) == 1:
        return int(bounds[0])
    return (int(bounds[0]) + int(bounds[1])) / 2

# NOTE: despite its name, 'min_salary' is overwritten here with the value returned
# by parse_salary (the midpoint for ranges) and used as the regression target.
df['min_salary'] = df['salary'].apply(parse_salary)
print(len(df['min_salary']))
df = df.dropna(subset=['min_salary']).reset_index(drop=True)
print(len(df['min_salary']))

cat_cols = ['type_of_work', 'operating_mode', 'employment_type', 'experience', 'location']
# Everything that is not a skill flag. 'max_salary' has to be listed as well:
# otherwise it lands in skill_cols, leaks the salary into the features and is
# later fed as 0 by get_user_input.
base_cols = ['offer_id', 'title', 'company', 'location', 'salary', 'link', 'scraped_at', 'min_salary', 'max_salary'] + cat_cols
skill_cols = [col for col in df.columns if col not in base_cols]

X = df[cat_cols + skill_cols]
y = df['min_salary']

# One-hot encode the categorical columns; skill flags pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
results.append(mae)
print("Mean Absolute Error:", mae)
print(y_pred[:5])
print(y_test[:5])

print(len(y_pred))
print(len(y_test))

# Distribution of the signed prediction errors.
plt.hist(y_pred - y_test, bins=40)

def get_user_input(skill_columns, user_skills_input=None):
    """Build a one-row feature frame describing a sample job offer.

    The categorical fields are fixed (Full-time / Remote / B2B / Mid /
    Gliwice). Every name in ``skill_columns`` becomes a 0/1 column: 1 when
    its lower-cased name appears among the requested skills, 0 otherwise.

    Parameters
    ----------
    skill_columns : iterable of str
        Names of the skill columns the model was trained on.
    user_skills_input : str, optional
        Comma-separated skill names. When omitted, the user is prompted
        for them via ``input()``, as before.

    Returns
    -------
    pandas.DataFrame
        A single row holding the categorical fields followed by the
        skill flags, in the order of ``skill_columns``.
    """
    print("\nWprowadź dane dotyczące oferty pracy:")
    data = {
        'type_of_work': "Full-time",
        'operating_mode': "Remote",
        'employment_type': "B2B",
        'experience': "Mid",
        'location': "Gliwice",
    }

    if user_skills_input is None:
        print("\nWprowadź wymagane umiejętności, oddzielone przecinkami (np.: .NET C#, Python, ML):")
        user_skills_input = input()
    # A set keeps the per-column membership test constant-time.
    user_skills = {s.strip().lower() for s in user_skills_input.split(",") if s.strip()}

    print("user_skills_input", user_skills_input)

    data.update({col: 1 if col.lower() in user_skills else 0 for col in skill_columns})
    return pd.DataFrame([data])

# Build the single-row frame for the sample offer and predict with the fitted pipeline.
user_input = get_user_input(skill_cols)
predicted_salary = model.predict(user_input)
print("\nExpected salary (default parameters):", predicted_salary[0])

In [None]:
# fastapi, TensorFlow, Python, Machine Learning, Flask, Django, Docker, Cloud, AWS, API
# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript 

#### Random Forest Regressor

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# MAE values of the models trained in this cell are appended here.
results = []
df = pd.read_csv('../Data analysis/wszystkie_dane_tylko_z_salary.csv')

def parse_salary(salary):
    """Convert a raw salary string into a single number.

    Returns NaN for missing values and for the "Missing salary" placeholder,
    the amount itself for a single value, and the midpoint for a
    "low-high" range.

    Any whitespace (including non-breaking spaces used as thousands
    separators) is removed, and en/em dashes are accepted as range
    separators in addition to the plain hyphen.

    NOTE(review): the unit suffix (PLN/month, PLN/h, PLN/year) is only
    stripped, never converted, so hourly, monthly and yearly amounts come
    back on their own scales — confirm the data holds a single unit.

    Raises
    ------
    ValueError
        If a bound is not an integer literal after cleaning.
    """
    if pd.isna(salary) or salary == "Missing salary":
        return np.nan

    cleaned = str(salary)
    for unit in ("PLN/month", "PLN/h", "PLN/year"):
        cleaned = cleaned.replace(unit, "")
    # str.split() without arguments splits on every Unicode whitespace character.
    cleaned = "".join(cleaned.split())
    bounds = cleaned.replace("\u2013", "-").replace("\u2014", "-").split("-")

    if len(bounds) == 1:
        return int(bounds[0])
    return (int(bounds[0]) + int(bounds[1])) / 2

# NOTE: despite its name, 'min_salary' is overwritten here with the value returned
# by parse_salary (the midpoint for ranges) and used as the regression target.
df['min_salary'] = df['salary'].apply(parse_salary)
df = df.dropna(subset=['min_salary']).reset_index(drop=True)

cat_cols = ['type_of_work', 'operating_mode', 'employment_type', 'experience', 'location']
# Everything that is not a skill flag. 'max_salary' has to be listed as well:
# otherwise it lands in skill_cols, leaks the salary into the features and is
# later fed as 0 by get_user_input.
base_cols = ['offer_id', 'title', 'company', 'location', 'salary', 'link', 'scraped_at', 'min_salary', 'max_salary'] + cat_cols
skill_cols = [col for col in df.columns if col not in base_cols]

X = df[cat_cols + skill_cols]
y = df['min_salary']

# One-hot encode the categorical columns; skill flags pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# Searched: forest size and split criterion.
grid_params = {
    "regressor__n_estimators": np.arange(25, 100, 25),
    "regressor__criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"]
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
grid_search = GridSearchCV(model, param_grid=grid_params, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_.predict(X_test))

# Also fit the pipeline with its default parameters, for comparison with the tuned one.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
results.append(mae)

def get_user_input(skill_columns):
    """Return a one-row DataFrame for a fixed sample offer and skill list.

    The categorical fields are constants. Every name in ``skill_columns``
    becomes a 0/1 column: 1 when its lower-cased name appears in the
    hard-coded, comma-separated skill string below, 0 otherwise.
    """
    print("\nWprowadź dane dotyczące oferty pracy:")

    user_skills_input = " AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript"
    # Normalised (trimmed, lower-cased) names of the requested skills.
    wanted = {token.strip().lower() for token in user_skills_input.split(",") if token.strip()}

    row = dict(
        type_of_work="Full-time",
        operating_mode="Remote",
        employment_type="B2B",
        experience="Mid",
        location="Gliwice",
    )
    row.update({name: int(name.lower() in wanted) for name in skill_columns})
    return pd.DataFrame([row])

# Predict the sample offer with the default-parameter pipeline ...
user_input = get_user_input(skill_cols)
predicted_salary = model.predict(user_input)
print("\nExpected salary (default parameters):", predicted_salary[0])

# ... and with the best pipeline found by the grid search.
grid_search_predicted_salary = grid_search.best_estimator_.predict(user_input)
print("\ngrid_search_predicted_salary (PLN/miesiąc):", grid_search_predicted_salary[0])

In [None]:
# fastapi, TensorFlow, Python, Machine Learning, Flask, Django, Docker, Cloud, AWS, API
# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript 

In [None]:
# Re-run the grid-search prediction for the current user_input.
grid_search_predicted_salary = grid_search.best_estimator_.predict(user_input)
print("\ngrid_search_predicted_salary (PLN/miesiąc):", grid_search_predicted_salary[0])

#### LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

# MAE values of the models trained in this cell are appended here.
results = []
df = pd.read_csv('../Data analysis/wszystkie_dane_tylko_z_salary.csv')

def parse_salary(salary):
    """Convert a raw salary string into a single number.

    Returns NaN for missing values and for the "Missing salary" placeholder,
    the amount itself for a single value, and the midpoint for a
    "low-high" range.

    Any whitespace (including non-breaking spaces used as thousands
    separators) is removed, and en/em dashes are accepted as range
    separators in addition to the plain hyphen.

    NOTE(review): the unit suffix (PLN/month, PLN/h, PLN/year) is only
    stripped, never converted, so hourly, monthly and yearly amounts come
    back on their own scales — confirm the data holds a single unit.

    Raises
    ------
    ValueError
        If a bound is not an integer literal after cleaning.
    """
    if pd.isna(salary) or salary == "Missing salary":
        return np.nan

    cleaned = str(salary)
    for unit in ("PLN/month", "PLN/h", "PLN/year"):
        cleaned = cleaned.replace(unit, "")
    # str.split() without arguments splits on every Unicode whitespace character.
    cleaned = "".join(cleaned.split())
    bounds = cleaned.replace("\u2013", "-").replace("\u2014", "-").split("-")

    if len(bounds) == 1:
        return int(bounds[0])
    return (int(bounds[0]) + int(bounds[1])) / 2

# NOTE: despite its name, 'min_salary' is overwritten here with the value returned
# by parse_salary (the midpoint for ranges) and used as the regression target.
df['min_salary'] = df['salary'].apply(parse_salary)
df = df.dropna(subset=['min_salary']).reset_index(drop=True)

cat_cols = ['type_of_work', 'operating_mode', 'employment_type', 'experience', 'location']
# Everything that is not a skill flag. 'max_salary' has to be listed as well:
# otherwise it lands in skill_cols, leaks the salary into the features and is
# later fed as 0 by get_user_input.
base_cols = ['offer_id', 'title', 'company', 'location', 'salary', 'link', 'scraped_at', 'min_salary', 'max_salary'] + cat_cols
skill_cols = [col for col in df.columns if col not in base_cols]

X = df[cat_cols + skill_cols]
y = df['min_salary']

# One-hot encode the categorical columns; skill flags pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Searched: intercept on/off.
grid_params = {
    "regressor__fit_intercept": [True, False],
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
grid_search = GridSearchCV(model, param_grid=grid_params, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_.predict(X_test))

# Also fit the pipeline with its default parameters, for comparison with the tuned one.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
results.append(mae)

def get_user_input(skill_columns):
    """Return a one-row DataFrame for a fixed sample offer and skill list.

    The categorical fields are constants. Every name in ``skill_columns``
    becomes a 0/1 column: 1 when its lower-cased name appears in the
    hard-coded, comma-separated skill string below, 0 otherwise.
    """
    print("\nWprowadź dane dotyczące oferty pracy:")

    user_skills_input = " AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript"
    # Normalised (trimmed, lower-cased) names of the requested skills.
    wanted = {token.strip().lower() for token in user_skills_input.split(",") if token.strip()}

    row = dict(
        type_of_work="Full-time",
        operating_mode="Remote",
        employment_type="B2B",
        experience="Mid",
        location="Gliwice",
    )
    row.update({name: int(name.lower() in wanted) for name in skill_columns})
    return pd.DataFrame([row])

# Predict the sample offer with the default-parameter pipeline ...
user_input = get_user_input(skill_cols)

predicted_salary = model.predict(user_input)
print("\nExpected salary (default parameters):", predicted_salary[0])

# ... and with the best pipeline found by the grid search.
grid_search_predicted_salary = grid_search.best_estimator_.predict(user_input)
print("\ngrid_search_predicted_salary (PLN/miesiąc):", grid_search_predicted_salary[0])

#### KNN Regressor

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# MAE values of the models trained in this cell are appended here.
results = []
df = pd.read_csv('../Data analysis/wszystkie_dane_tylko_z_salary.csv')

def parse_salary(salary):
    """Convert a raw salary string into a single number.

    Returns NaN for missing values and for the "Missing salary" placeholder,
    the amount itself for a single value, and the midpoint for a
    "low-high" range.

    Any whitespace (including non-breaking spaces used as thousands
    separators) is removed, and en/em dashes are accepted as range
    separators in addition to the plain hyphen.

    NOTE(review): the unit suffix (PLN/month, PLN/h, PLN/year) is only
    stripped, never converted, so hourly, monthly and yearly amounts come
    back on their own scales — confirm the data holds a single unit.

    Raises
    ------
    ValueError
        If a bound is not an integer literal after cleaning.
    """
    if pd.isna(salary) or salary == "Missing salary":
        return np.nan

    cleaned = str(salary)
    for unit in ("PLN/month", "PLN/h", "PLN/year"):
        cleaned = cleaned.replace(unit, "")
    # str.split() without arguments splits on every Unicode whitespace character.
    cleaned = "".join(cleaned.split())
    bounds = cleaned.replace("\u2013", "-").replace("\u2014", "-").split("-")

    if len(bounds) == 1:
        return int(bounds[0])
    return (int(bounds[0]) + int(bounds[1])) / 2

# NOTE: despite its name, 'min_salary' is overwritten here with the value returned
# by parse_salary (the midpoint for ranges) and used as the regression target.
df['min_salary'] = df['salary'].apply(parse_salary)
df = df.dropna(subset=['min_salary']).reset_index(drop=True)

cat_cols = ['type_of_work', 'operating_mode', 'employment_type', 'experience', 'location']
# Everything that is not a skill flag. 'max_salary' has to be listed as well:
# otherwise it lands in skill_cols, leaks the salary into the features and is
# later fed as 0 by get_user_input.
base_cols = ['offer_id', 'title', 'company', 'location', 'salary', 'link', 'scraped_at', 'min_salary', 'max_salary'] + cat_cols
skill_cols = [col for col in df.columns if col not in base_cols]

X = df[cat_cols + skill_cols]
y = df['min_salary']

# One-hot encode the categorical columns; skill flags pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ],
    remainder='passthrough'
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor())
])

# Searched: neighbourhood size and neighbour-search algorithm.
grid_params = {
    "regressor__n_neighbors": np.arange(5, 20, 5),
    "regressor__algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
grid_search = GridSearchCV(model, param_grid=grid_params, cv=5, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
print(grid_search.best_estimator_.predict(X_test))

# Also fit the pipeline with its default parameters, for comparison with the tuned one.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
results.append(mae)

def get_user_input(skill_columns):
    """Return a one-row DataFrame for a fixed sample offer and skill list.

    The categorical fields are constants. Every name in ``skill_columns``
    becomes a 0/1 column: 1 when its lower-cased name appears in the
    hard-coded, comma-separated skill string below, 0 otherwise.
    """
    print("\nWprowadź dane dotyczące oferty pracy:")

    user_skills_input = " AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript"
    # Normalised (trimmed, lower-cased) names of the requested skills.
    wanted = {token.strip().lower() for token in user_skills_input.split(",") if token.strip()}

    row = dict(
        type_of_work="Full-time",
        operating_mode="Remote",
        employment_type="B2B",
        experience="Mid",
        location="Gliwice",
    )
    row.update({name: int(name.lower() in wanted) for name in skill_columns})
    return pd.DataFrame([row])

# Predict the sample offer with the default-parameter pipeline ...
user_input = get_user_input(skill_cols)

predicted_salary = model.predict(user_input)
print("\nExpected salary (default parameters):", predicted_salary[0])

# ... and with the best pipeline found by the grid search.
grid_search_predicted_salary = grid_search.best_estimator_.predict(user_input)
print("\ngrid_search_predicted_salary (PLN/miesiąc):", grid_search_predicted_salary[0])


# Output pasted from an earlier run (kept for reference):
# Mean Absolute Error (MAE): 5274.05638888889
# Mean Squared Error (MSE): 51627118.65147361
# Root Mean Squared Error (RMSE): 7185.201364713003
# R2 Score: 0.20971367245488715
# Expected salary (default parameters): 14769.95

### One Ring to rule them all, One Ring to find them, One Ring to bring them all and in the darkness bind them

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
# Reload the salary dataset from disk so this section starts from a fresh frame.
all_data_with_salary = pd.read_csv("../Data analysis/wszystkie_dane_tylko_z_salary.csv")
all_data_with_salary.head(2)

In [None]:
# Report the frame size before and after removing incomplete records.
print(all_data_with_salary.shape)
# dropna() returns a new frame; the result must be re-bound, otherwise nothing is
# removed and both prints show the same shape.
# NOTE(review): this drops a row if ANY column is missing; pass subset=[...] if only
# some columns matter for modelling.
all_data_with_salary = all_data_with_salary.dropna()
print(all_data_with_salary.shape)

In [None]:
print(all_data_with_salary.columns[:12])

# Categorical offer attributes (one-hot encoded below) and the skill indicator
# columns, which are selected positionally: everything from column 12 onwards.
operation_cols = ['location', 'type_of_work', 'experience', 'employment_type', 'operating_mode']
skill_cols = all_data_with_salary.columns[12:].tolist()

X = all_data_with_salary[operation_cols + skill_cols]
y = all_data_with_salary.min_salary
y.head()

# handle_unknown="ignore": a category (for example a rare location) can be missing
# from a training fold yet appear in the validation fold or the test set. With the
# default setting the encoder raises at transform time and that fit is scored as NaN.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), operation_cols)
    ],
    remainder="passthrough"
)

# Model name -> estimator class; instantiated with defaults inside the loop below.
MODEL_MAP = {
    "RandomForestRegressor": RandomForestRegressor,
    "LinearRegression": LinearRegression,
    "KNeighborsRegressor": KNeighborsRegressor
}

# One entry per model: the key into MODEL_MAP and the grid to search.
# Parameter names carry the "regressor__" prefix of the pipeline step they target.
models = [
    {
        "model": "RandomForestRegressor",
        "params": {
            # np.arange(50, 100, 50) yields only [50]; widen the stop value to search more.
            "regressor__n_estimators": np.arange(50, 100, 50),
            "regressor__criterion": ["squared_error"]
            # "regressor__criterion": ["squared_error", "absolute_error", "friedman_mse", "poisson"]
        }
    },

    {
        "model": "LinearRegression",
        "params": {
            "regressor__fit_intercept": [True, False],
        }
    },

    {
        "model": "KNeighborsRegressor",
        "params": {
            # np.arange(5, 10, 5) yields only [5].
            "regressor__n_neighbors": np.arange(5, 10, 5),
            "regressor__algorithm": ["auto", "ball_tree"]
            # "regressor__algorithm": ["auto", "ball_tree", "kd_tree", "brute"]
        }
    },
]

# The split does not depend on the model, so it is computed once instead of being
# repeated (with identical results) on every loop iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

for model_config in models:
    print(f"\n{'='*30}\nTesting {model_config['model']}\n{'='*30}")

    pipeline = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("regressor", MODEL_MAP[model_config["model"]]())
        ]
    )

    grid_search_cv = GridSearchCV(estimator=pipeline, param_grid=model_config["params"], cv=5, scoring='neg_root_mean_squared_error')
    grid_search_cv.fit(X_train, y_train)
    print(model_config["model"])
    print(grid_search_cv.best_estimator_)
    # The scorer is negated RMSE, so flip the sign to report a plain RMSE that can be
    # compared across the models.
    print("Best CV RMSE:", -grid_search_cv.best_score_)


### Job category classification: Data Scientist, Machine Learning Engineer, Software Engineer / Developer, Manager / Director / Lead, Other

#### ADD!!! Add classified titles and drop duplicates (drop scraped_at and offer_id)

In [None]:
# Load the merged offers file (judging by its name: offers with and without salary).
all_data_df = pd.read_csv("../Data analysis/wszystkie_dane_zlaczone_z_i_bez_salary.csv")

In [None]:
# Preview the first three rows of the merged dataset.
all_data_df.head(3)

In [None]:
# Load the classified job titles; only the first column is used later on.
skills_classified_title_df = pd.read_csv("../Data analysis/skills_classified_title_only.csv")
skills_classified_title_df.head(3)

In [None]:
# (rows, columns) of the merged dataset; compare the row count with the titles frame.
all_data_df.shape

In [None]:
# (rows, columns) of the classified-titles frame.
skills_classified_title_df.shape

In [None]:
# Column count before the title column is inserted.
len(all_data_df.columns)

In [None]:
# Put the classified title in front as column 0.
# NOTE(review): the Series is matched to rows by index label, so this assumes both
# CSV files list the offers in the same order — confirm.
# NOTE: DataFrame.insert raises ValueError if the column already exists, so this cell
# cannot be re-run without reloading all_data_df first.
all_data_df.insert(0, "skills_classified_title", skills_classified_title_df.iloc[:, 0])

In [None]:
# Column count after the insert (expected to be one more than before).
len(all_data_df.columns)

In [None]:
# List the column labels to check where the new title column landed.
all_data_df.columns

In [None]:
# Preview the frame with the classified title in place.
all_data_df.head(3)

In [None]:
# Remove the scrape date and offer id in place.
# NOTE: re-running this cell raises KeyError, because the columns are already gone.
all_data_df.drop(columns=["scraped_at", "offer_id"], inplace=True)
all_data_df[:3]

In [None]:
# Work on a copy so all_data_df itself stays untouched by the steps that follow.
copied_all_data_df = all_data_df.copy()
copied_all_data_df[:3]

In [None]:
# Drop fully identical rows in place and report the remaining (rows, columns).
copied_all_data_df.drop_duplicates(inplace=True)
print(copied_all_data_df.shape)

In [None]:
import pandas as pd
# data_with_jobs_classified = pd.read_csv("dane_sklasyfikowane.csv")
# Start from the de-duplicated frame and exclude offers whose title is "Other".
data_with_jobs_classified = copied_all_data_df.copy() 
# data_with_jobs_classified.head(3)
# Row count before and after the filter.
print(len(data_with_jobs_classified))
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "Other"]
print(len(data_with_jobs_classified))

In [None]:
# Preview the target column.
data_with_jobs_classified["skills_classified_title"].head()

In [None]:
# Take every column from position 9 onwards (assumed to be the skill indicator
# columns — confirm against the column listing). Here skill_cols is a DataFrame of
# values, not a list of column names.
skill_cols = data_with_jobs_classified.iloc[:, 9:]
skill_cols.head(3)

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [None]:
# Confirm what kind of object skill_cols is before using it as the feature matrix.
type(skill_cols)

In [None]:
# Features: skill indicator columns; target: classified job title.
X = skill_cols
y = data_with_jobs_classified["skills_classified_title"]
print(X.shape, y.shape)
# Hold out 20% for testing. The previous train_size=0.2 did the opposite: it trained on
# only 20% of the rows and evaluated on the remaining 80%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit the encoder once, on every label in y, and only transform the two splits.
# Calling fit_transform on y_test refitted the encoder: whenever the test labels were
# not exactly the same set as the training labels, the same integer meant a different
# class in each split and the accuracy was computed on mismatched codes.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [None]:
# Show the distinct encoded values present in the training labels ...
print(set(y_train_encoded))
# ... and the class names known to the encoder.
print(label_encoder.classes_)
# Integer code -> class name (a class's code is its position in classes_).
for i, category in enumerate(label_encoder.classes_):
    print(i, category)

#### Version with pipeline

In [None]:
# Baseline: a single-step pipeline with a default LogisticRegression.
pipeline = Pipeline(
    steps=[
        ("model", LogisticRegression())
    ]
)

pipeline.fit(X_train, y_train_encoded)
y_pred = pipeline.predict(X_test)
# Eyeball a slice of predictions (first line) against the true codes (second line).
print(f"{y_pred[6:20]}\n{y_test_encoded[6:20]}")
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted code equals the true code.
accuracy_score(y_test_encoded, y_pred)

#### Version with GridSearchCV

In [None]:
# Class distribution of the training labels: encoded values and how often each occurs.
unique, counts = np.unique(y_train_encoded, return_counts=True)
print(unique, counts)

In [None]:
pipeline = Pipeline(
    steps=[
        ("model", LogisticRegression())
    ]
)

# Settings shared by both penalty variants.
# Other solvers: 'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag';
# 'saga' is used because it supports the elastic-net penalty.
common_grid = {
    "model__C": [0.001, 0.01, 0.1, 1, 10],
    "model__max_iter": np.arange(50, 300, 100),
    "model__n_jobs": [-1],
    "model__solver": ['saga'],
}

# l1_ratio only has an effect with the elastic-net penalty. Crossing it with "l2" in
# one grid refitted the identical l2 model once per l1_ratio value (and warned each
# time), so the search is expressed as two separate grids instead.
param_grid = [
    {**common_grid, "model__penalty": ["l2"]},
    {
        **common_grid,
        "model__penalty": ["elasticnet"],
        "model__l1_ratio": np.arange(0, 1, 0.1),
    },
]

# 2-fold CV with the estimator's default score (accuracy for classifiers).
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1, cv=2)
grid_search.fit(X_train, y_train_encoded)
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.classes_)
y_pred = grid_search.best_estimator_.predict(X_test)

accuracy_score(y_test_encoded, y_pred)

#### Combined model code

In [None]:
import pandas as pd
# data_with_jobs_classified = pd.read_csv("dane_sklasyfikowane.csv")
# Restart from the de-duplicated frame; unlike the earlier cell, "Other" is NOT filtered out.
data_with_jobs_classified = copied_all_data_df.copy()
data_with_jobs_classified.head(3)

In [None]:
# Feature frame: every column from position 9 onwards.
skill_cols = data_with_jobs_classified.iloc[:, 9:]

In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Features: skill indicator columns; target: classified job title.
X = skill_cols
y = data_with_jobs_classified["skills_classified_title"]
print(X.shape, y.shape)
# Hold out 20% for testing. The previous train_size=0.2 trained on only 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the encoder once on all labels and only transform the splits. Refitting on
# y_test (fit_transform) could assign different integers to the same class in the two
# splits, which makes the accuracy below meaningless.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Class distribution of the training labels.
unique, counts = np.unique(y_train_encoded, return_counts=True)
print(unique, counts)

pipeline = Pipeline(
    steps=[
        ("model", LogisticRegression())
    ]
)

param_grid = {
    "model__penalty": ["elasticnet"],
    # "penalty": ['l1', 'l2', 'elasticnet', None],
    "model__C": [0.1, 1, 0.3],
    # np.arange(150, 300, 150) yields only [150].
    "model__max_iter": np.arange(150, 300, 150),
    "model__n_jobs": [-1],
    # "model__solver": ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
    "model__solver": ['saga'],
    "model__l1_ratio": np.arange(0.1, 1, 0.3),
}

# 2-fold CV with the estimator's default score (accuracy for classifiers).
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1, cv=2)
grid_search.fit(X_train, y_train_encoded)
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.classes_)
y_pred = grid_search.best_estimator_.predict(X_test)
accuracy_score(y_test_encoded, y_pred)


In [None]:
# fastapi, TensorFlow, Python, Machine Learning, Flask, Django, Docker, Cloud, AWS, API
# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript 

In [None]:
# skills_classified_title

In [None]:
import pandas as pd
# data_with_jobs_classified = pd.read_csv("dane_sklasyfikowane.csv")
# Restart from the de-duplicated frame and exclude offers whose title is "Other".
data_with_jobs_classified = copied_all_data_df.copy()
# data_with_jobs_classified.head(3)
# Row count before and after the filter.
print(len(data_with_jobs_classified))
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "Other"]
print(len(data_with_jobs_classified))

#### Model that takes input from user (testing version)

In [None]:
# Feature frame: every column from position 9 onwards (a DataFrame, not a name list).
skill_cols = data_with_jobs_classified.iloc[:, 9:]
# print(skill_cols)
# type(skill_cols)

In [None]:
# Scratch experiment: flag which of the user's skills belong to a known "base" set.
# fastapi, TensorFlow, Python, Machine Learning, Flask, Django, Docker, Cloud, AWS, API
user_skills = ["flask", "django", "aws", "cloud", "api"]
base_skills = ["flask", "aws", "cloud"]

# 1 when the skill is a base skill, 0 otherwise; key order follows user_skills.
dictionary = {name: int(name in base_skills) for name in user_skills}

print("dictionary", dictionary)
print(user_skills)

# User skills that are missing from base_skills, capitalised for display.
skillls = []
for name in user_skills:
    if name in base_skills:
        continue
    skillls.append(name.capitalize())
print(skillls)


In [None]:
# (rows, columns) before removing the "AI Product Manager" rows.
data_with_jobs_classified.shape

In [None]:
# Show the rows labelled "AI Product Manager".
data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] == "AI Product Manager"]

In [None]:
# Dry run of the filter: preview the rows that would remain.
data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "AI Product Manager"].head(3)

In [None]:
# Apply the filter: keep every row whose title is not "AI Product Manager".
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "AI Product Manager"]

In [None]:
# (rows, columns) after the filter.
data_with_jobs_classified.shape

In [None]:
# Sanity check: this selection should now be empty.
data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] == "AI Product Manager"]

In [None]:
# Undo the manual filtering by restarting from the de-duplicated frame.
data_with_jobs_classified = copied_all_data_df.copy()
data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] == "AI Product Manager"]

In [None]:
# Number of offers per classified title.
data_with_jobs_classified["skills_classified_title"].value_counts()

In [None]:
# Same per-title count as the previous cell.
data_with_jobs_classified["skills_classified_title"].value_counts()

In [None]:
# Keep the per-title counts in a variable so rare titles can be filtered out later.
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
counts
# values_to_keep = 

In [None]:
# TESTING SECTION
# Keep only titles that occur more than once, then check one title that was previously
# removed by hand.
data_with_jobs_classified = copied_all_data_df.copy()
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
print(counts)
# Index of the title names whose count is above 1.
values_to_keep = counts[counts > 1].index
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]
print(values_to_keep)
data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] == "AI Product Manager"]

In [None]:
# Without .index this is the filtered counts Series (title -> count), not the names.
values_to_keep = counts[counts > 1]
values_to_keep

In [None]:
# With .index this is just the title names, which is the form isin() is given elsewhere.
values_to_keep = counts[counts > 1].index
values_to_keep

In [None]:
# Display the de-duplicated source frame.
copied_all_data_df

In [None]:
# Restart from the de-duplicated frame and keep only titles that occur more than once.
data_with_jobs_classified = copied_all_data_df.copy()
print(len(data_with_jobs_classified))
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
# print(counts)
values_to_keep = counts[counts > 1].index
print(values_to_keep)
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]
data_with_jobs_classified

In [None]:
# data_with_jobs_classified = pd.read_csv("dane_sklasyfikowane.csv")

# data_with_jobs_classified = copied_all_data_df.copy()
# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]

# data_with_jobs_classified = copied_all_data_df.copy()
# print(len(data_with_jobs_classified))
# counts = data_with_jobs_classified["skills_classified_title"].value_counts()
# # print(counts)
# values_to_keep = counts[counts > 1].index
# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]
# # print(values_to_keep)
# # data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] == "AI Product Manager"]

# TESTING SECTION
# Restart from the de-duplicated frame and keep only titles that occur more than once;
# the stratified split later in this cell needs at least two rows per class.
data_with_jobs_classified = copied_all_data_df.copy()
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
print(counts)
values_to_keep = counts[counts > 1].index
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]
print(values_to_keep)

# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "AI Product Manager"]
# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "ML Operations Engineer"]
# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "Data Architect"]
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "Other"]

# Names of the skill columns (position 9 onwards) as a plain list. Taking the labels
# directly replaces the former iloc-slice + isinstance check, which could never be a
# list and therefore always printed a "not a list" message before converting.
skill_cols = data_with_jobs_classified.columns[9:].tolist()

print("len(skill_cols)", len(skill_cols))
print("len(data_with_jobs_classified)", len(data_with_jobs_classified))

def select_skills(data, available_skills):
    """Prompt for a comma-separated list of skills and return the names entered.

    Args:
        data: Unused; kept so existing calls keep working.
        available_skills: Container of known skill names, used only to report
            which entered names are unknown.

    Returns:
        List of the entered names with surrounding whitespace removed. Blank
        entries are dropped, so empty input returns an empty list. Names that
        are not in ``available_skills`` are reported but still returned.
    """
    selected_names = input("Podaj nazwy skilli, oddzielone przecinkami (np. Python,SQL,Excel): ")
    # "".split(",") yields [""], which is truthy; without this filter a caller testing
    # `if selected_skills:` could never detect that nothing was entered.
    selected_skills = [name.strip() for name in selected_names.split(",") if name.strip()]
    not_in_db_skills = [skill for skill in selected_skills if skill not in available_skills]
    print("not_in_db_skills", not_in_db_skills)

    print(f"Wybrane skille: {selected_skills}")
    return selected_skills

# Target: classified job title; features: one indicator column per skill name.
y = data_with_jobs_classified["skills_classified_title"]
X = data_with_jobs_classified[skill_cols]

# Stratified 80/20 split so the class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The encoder learns the classes from the training labels and is reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

pipeline = Pipeline(steps=[("model", LogisticRegression())])

# Elastic-net logistic regression; keys carry the "model__" prefix of the pipeline step.
param_grid = dict(
    model__penalty=["elasticnet"],
    model__C=[0.1, 1, 0.3],
    model__max_iter=np.arange(150, 300, 150),
    model__n_jobs=[-1],
    model__solver=['saga'],
    model__l1_ratio=np.arange(0.1, 1, 0.3),
)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1, cv=2)
grid_search.fit(X_train, y_train_encoded)

print("Model gotowy!")

# Ask for the skills to classify.
selected_skills = select_skills(data_with_jobs_classified, skill_cols)

if not selected_skills:
    print("Nie podano poprawnych skilli. Koniec.")
else:
    # One 0/1 flag per training column, in training column order.
    input_vector = [1 if skill in selected_skills else 0 for skill in skill_cols]

    print("input_vector", input_vector)
    print("len(input_vector)", len(input_vector))

    import numpy as np
    # A DataFrame (rather than a bare array) keeps the column names the model was fitted with.
    input_array = pd.DataFrame([input_vector], columns=skill_cols)
    print("input_array", input_array)

    # Predict the encoded class, then map it back to the readable title.
    predicted_label_encoded = grid_search.best_estimator_.predict(input_array)[0]
    predicted_label = label_encoder.inverse_transform([predicted_label_encoded])[0]
    print(f"➡️ Przewidywana kategoria encoded: {predicted_label_encoded}")
    print(f"➡️ Przewidywana kategoria: {predicted_label}")

# Flask, Django, Docker, Cloud, AWS, API
# input_vector [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript     
# input_vector [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [None]:
# fastapi, TensorFlow, Python, Machine Learning, Flask, Django, Docker, Cloud, AWS, API
# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript 

#### Model that takes input from user (final version)

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score

# data_with_jobs_classified = pd.read_csv("dane_sklasyfikowane.csv")
# data_with_jobs_classified = copied_all_data_df.copy()
# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "Other"]

# TESTING SECTION
# Restart from the de-duplicated frame and keep only titles that occur more than once;
# the stratified split later in this cell needs at least two rows per class.
data_with_jobs_classified = copied_all_data_df.copy()
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
print(counts)
values_to_keep = counts[counts > 1].index
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]
print(values_to_keep)

# Names of the skill columns (position 9 onwards) as a plain list. Taking the labels
# directly replaces the former iloc-slice + isinstance check, which could never be a
# list and therefore always printed a "not a list" message before converting.
skill_cols = data_with_jobs_classified.columns[9:].tolist()

# X = data_with_jobs_classified[skill_cols]

def select_skills(data, available_skills):
    """Return the skill names to classify (currently a fixed sample list).

    Args:
        data: Unused; kept so existing calls keep working.
        available_skills: Unused; kept so existing calls keep working.

    Returns:
        List of skill names with surrounding whitespace removed.
    """
    # selected_names = input("Please write skills, separated by commas (e.g.. Python,SQL,Excel): ")
    selected_names = "Flask, Django, Docker, Cloud, AWS, API"
    # The list of unknown skills used to be computed here and then discarded; that
    # dead computation is removed.
    return [name.strip() for name in selected_names.split(",")]

# Target: classified job title; features: one indicator column per skill name.
y = data_with_jobs_classified["skills_classified_title"]
X = data_with_jobs_classified[skill_cols]

# Stratified 80/20 split so the class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The encoder learns the classes from the training labels and is reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

pipeline = Pipeline(steps=[("model", LogisticRegression())])

# Elastic-net logistic regression; keys carry the "model__" prefix of the pipeline step.
param_grid = dict(
    model__penalty=["elasticnet"],
    model__C=[0.1, 1, 0.3],
    model__max_iter=np.arange(150, 300, 150),
    model__n_jobs=[-1],
    model__solver=['saga'],
    model__l1_ratio=np.arange(0.1, 1, 0.3),
)

grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, n_jobs=-1, cv=2)
grid_search.fit(X_train, y_train_encoded)

# Score the tuned model on the held-out rows.
y_pred = grid_search.best_estimator_.predict(X_test)
print("Accuracy score", accuracy_score(y_test_encoded, y_pred))

print("Model trained and ready to roll!")

selected_skills = select_skills(data_with_jobs_classified, skill_cols)

if not selected_skills:
    print("You don't write any skill. End program.")
else:
    # One 0/1 flag per training column, in training column order.
    input_vector = [1 if skill in selected_skills else 0 for skill in skill_cols]

    print("input_vector", input_vector)
    # A DataFrame (rather than a bare array) keeps the column names the model was fitted with.
    input_array = pd.DataFrame([input_vector], columns=skill_cols)
    print("input_array", input_array)

    # Predict the encoded class, then map it back to the readable title.
    predicted_label_encoded = grid_search.best_estimator_.predict(input_array)[0]
    predicted_label = label_encoder.inverse_transform([predicted_label_encoded])[0]
    print(f"Predicted category encoded: {predicted_label_encoded}")
    print(f"Predicted category: {predicted_label}")

# Flask, Django, Docker, Cloud, AWS, API
# input_vector [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript     
# input_vector [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

In [None]:
# Start with an empty list of collected results.
results = []

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, NuSVC, SVC

# data_with_jobs_classified = pd.read_csv("dane_sklasyfikowane.csv")
# data_with_jobs_classified = copied_all_data_df.copy()
# data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] != "Other"]

# TESTING SECTION
# Restart from the de-duplicated frame and keep only titles that occur more than once;
# the stratified split later in this cell needs at least two rows per class.
data_with_jobs_classified = copied_all_data_df.copy()
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
print(counts)
values_to_keep = counts[counts > 1].index
data_with_jobs_classified = data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)]
print(values_to_keep)
# data_with_jobs_classified[data_with_jobs_classified["skills_classified_title"] == "AI Product Manager"]

# Names of the skill columns (position 9 onwards) as a plain list. Taking the labels
# directly replaces the former iloc-slice + isinstance check, which could never be a
# list and therefore always printed a "not a list" message before converting.
# X is built from these names further down in this cell, right before the split.
skill_cols = data_with_jobs_classified.columns[9:].tolist()

def select_skills(data, available_skills):
    """Return the skill names to classify (currently a fixed sample list).

    Prints the raw sample string and its type for debugging.

    Args:
        data: Unused; kept so existing calls keep working.
        available_skills: Unused; kept so existing calls keep working.

    Returns:
        List of skill names with surrounding whitespace removed.
    """
    # Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage
    # selected_names = input("Please write skills, separated by commas (e.g.. Python,SQL,Excel): ")
    selected_names = "Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage"
    print("selected_names", selected_names)
    print("type(selected_names)", type(selected_names))
    # The list of unknown skills used to be computed here and then discarded; that
    # dead computation is removed.
    return [name.strip() for name in selected_names.split(",")]

# Features: one indicator column per skill name; target: classified job title.
X = data_with_jobs_classified[skill_cols]
y = data_with_jobs_classified["skills_classified_title"]

# Stratified 80/20 split so the class proportions match in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The encoder learns the classes from the training labels and is reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Name -> classifier class; the `models` list below looks its entries up here.
MODELS_MAP = {
    "LogisticRegression": LogisticRegression,
    "RandomForestClassifier": RandomForestClassifier,
    "KNeighborsClassifier": KNeighborsClassifier,
    "XGBClassifier": XGBClassifier,
    "XGBRFClassifier": XGBRFClassifier,
    "LinearSVC": LinearSVC,
    # "NuSVC": NuSVC  <== VERIFY IN FUTURE!!!
    "SVC": SVC
}

# One entry per candidate classifier: the class to instantiate and the grid to search.
# Parameter names carry the "model__" prefix of the pipeline step they configure.
models = [
    {
        "classifier": MODELS_MAP["LogisticRegression"],
        "params": {
            "model__penalty": ["elasticnet"],
            "model__C": [0.1, 1, 0.3],
            # np.arange(150, 300, 150) yields only [150].
            "model__max_iter": np.arange(150, 300, 150),
            "model__n_jobs": [-1],
            "model__solver": ['saga'],
            "model__l1_ratio": np.arange(0.1, 1, 0.3),
        }
    },
    {
        "classifier": MODELS_MAP["RandomForestClassifier"],
        "params": {
            # Was [100, 200, 100]: the repeated 100 made the grid fit the same
            # candidate twice without widening the search.
            # NOTE(review): if a third value (e.g. 300) was intended, add it here.
            'model__n_estimators': [100, 200],
            'model__max_depth': [10, 20, None],
            'model__min_samples_split': [2, 3, 4, 5],
            'model__min_samples_leaf': [1, 2, 4],
            'model__max_features': ['sqrt', 'log2']
        }
    },
    {
        "classifier": MODELS_MAP["KNeighborsClassifier"],
        "params": {
            'model__n_neighbors': [3, 5, 10, 15],
            'model__weights': ['uniform', 'distance'],
            'model__metric': ['minkowski', 'euclidean'],
            'model__p': [1, 2],
            'model__algorithm': ['auto', 'ball_tree']
        }
    },
    {
        "classifier": MODELS_MAP["XGBClassifier"],
        "params": {
            'model__n_estimators': [100, 200],
            'model__max_depth': [3, 5],
            'model__learning_rate': [0.05, 0.1],
            'model__subsample': [0.8, 1.0],
            'model__colsample_bytree': [0.8, 1.0],
            # 'model__reg_alpha': [0, 0.1, 1, 10],
            # 'model__reg_lambda': [0, 0.1, 1, 10]
            'model__reg_alpha': [0, 0.1],
            'model__reg_lambda': [0, 0.1]
        }
    },
    {
        "classifier": MODELS_MAP["XGBRFClassifier"],
        "params": {
            'model__n_estimators': [200, 300],
            'model__max_depth': [5, 7, 10],
            'model__subsample': [0.8, 1.0],
            'model__colsample_bytree': [0.8, 1.0],
            'model__reg_alpha': [0, 0.1],
            'model__reg_lambda': [1, 10]
        }
    },
    {
        "classifier": MODELS_MAP["LinearSVC"],
        "params": {
            'model__C': [0.01, 0.1, 1, 10],
            'model__loss': ['hinge', 'squared_hinge'],
            'model__penalty': ['l2'],
            'model__dual': [True],
            'model__max_iter': [1000, 5000]
        }
    },
    {
        "classifier": MODELS_MAP["SVC"],
        "params": {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['linear', 'rbf', 'poly'],
            'model__gamma': ['scale', 'auto', 0.1],
            'model__degree': [2, 3],
            'model__coef0': [0.0, 0.5]
        }
    },
    # {
    #     "classifier": MODELS_MAP["NuSVC"],
    #     "params": {
    #         'model__nu': [0.1, 0.3, 0.5],
    #         'model__kernel': ['linear', 'rbf', 'poly'],
    #         'model__gamma': ['scale', 'auto', 0.1],
    #         'model__degree': [2, 3],
    #         'model__coef0': [0.0, 0.5]
    #     }
    # },

]

# Only the first configured model (models[0]) is tuned in this cell.
pipeline = Pipeline(
    steps=[
        ("model", models[0]["classifier"]())
    ]
)

grid_search = GridSearchCV(estimator=pipeline, param_grid=models[0]["params"], n_jobs=-1, cv=2, verbose=2)
grid_search.fit(X_train, y_train_encoded)
# Predict with the estimator fitted in THIS cell. This line used to be commented out,
# so the accuracy printed at the end was computed from whatever y_pred an earlier cell
# had left behind (or failed with NameError on a fresh kernel).
y_pred = grid_search.best_estimator_.predict(X_test)

print("Model trained and ready to roll!")

selected_skills = select_skills(data_with_jobs_classified, skill_cols)

if selected_skills:
    # One 0/1 flag per training column, in training column order.
    input_vector = []
    for skill in skill_cols:
        if skill in selected_skills:
            input_vector.append(1)
        else:
            input_vector.append(0)

    # input_array = np.array(input_vector).reshape(1, -1)
    # A DataFrame (rather than a bare array) keeps the column names the model was fitted with.
    input_array = pd.DataFrame([input_vector], columns=skill_cols)
    # Predict the encoded class, then map it back to the readable title.
    predicted_label_encoded = grid_search.best_estimator_.predict(input_array)[0]
    predicted_label = label_encoder.inverse_transform([predicted_label_encoded])[0]
    print(f"Predicted category encoded: {predicted_label_encoded}")
    print(f"Predicted category: {predicted_label}")
else:
    print("You don't write any skill. End program.")

print("Accuracy score", accuracy_score(y_test_encoded, y_pred))
# Flask, Django, Docker, Cloud, AWS, API
# input_vector [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript     
# input_vector [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

# Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage
# Accuracy score 0.9920634920634921

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC, NuSVC, SVC

# Per-model evaluation records; filled by the training loop later in this cell.
results = []

# Work on a copy so the filtering below does not modify copied_all_data_df.
data_with_jobs_classified = copied_all_data_df.copy()
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
print(counts)

# Keep only categories that occur more than once.
# NOTE(review): presumably required by the stratified split further down,
# which cannot split a class that has a single row - confirm.
values_to_keep = counts[counts > 1].index
data_with_jobs_classified = data_with_jobs_classified[
    data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)
]
print(values_to_keep)

# Names of the feature columns: every column from position 9 onwards.
# Taking them from `.columns` yields a plain list of names directly, instead
# of slicing a DataFrame and converting it afterwards.
# NOTE(review): assumes the first nine columns are non-skill metadata - confirm.
skill_cols = data_with_jobs_classified.columns[9:].tolist()

def select_skills(data, available_skills, selected_names="Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage"):
    """Parse a comma-separated string of skill names.

    Args:
        data: Unused; kept so existing calls keep working.
        available_skills: Collection of known skill names. It is only used to
            report requested names that are missing from it.
        selected_names: Comma-separated skill names. Defaults to the sample
            profile that used to be hard-coded in the body.

    Returns:
        list[str]: The requested names, stripped of surrounding whitespace,
        with blank entries removed, in input order. Names that are not in
        ``available_skills`` are still returned.
    """
    print("selected_names", selected_names)
    print("type(selected_names)", type(selected_names))
    # Skip blank entries so that an empty input gives [] (falsy) rather than
    # [""], which callers testing `if selected_skills:` would treat as a skill.
    stripped_names = (name.strip() for name in selected_names.split(","))
    selected_skills = [name for name in stripped_names if name]
    not_in_db_skills = [skill for skill in selected_skills if skill not in available_skills]
    if not_in_db_skills:
        print("Skills not found in available skills:", not_in_db_skills)
    return selected_skills

# Target is the classified title column; features are the skill_cols columns.
y = data_with_jobs_classified["skills_classified_title"]
X = data_with_jobs_classified[skill_cols]

# Fixed-seed 80/20 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The encoder is fitted on the training labels and reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Print the code -> label mapping, then keep it as a list of {label: code} dicts.
for code, class_name in enumerate(label_encoder.classes_):
    print(f" {code}: {class_name}")
original_labels = [
    {class_name: code}
    for code, class_name in enumerate(label_encoder.classes_)
]

# Lookup from the classifier name used in `models` to its estimator class.
MODELS_MAP = {
    estimator_cls.__name__: estimator_cls
    for estimator_cls in (
        LogisticRegression,
        RandomForestClassifier,
        KNeighborsClassifier,
        XGBClassifier,
        XGBRFClassifier,
        LinearSVC,
        SVC,
    )
}

# Hyper-parameter grids, one entry per classifier.
# "classifier" is the key looked up in MODELS_MAP; every "params" key carries
# the "model__" prefix so GridSearchCV routes it to the pipeline step "model".
models = [
    {
        "classifier": "LogisticRegression",
        "params": {
            "model__penalty": ["elasticnet"],
            "model__C": [0.1, 1, 0.3],
            # Single iteration budget; listed explicitly instead of a
            # one-element np.arange.
            "model__max_iter": [150],
            "model__n_jobs": [-1],
            "model__solver": ["saga"],
            # Explicit values: np.arange with a fractional step can gain or
            # lose the last element through floating-point rounding.
            "model__l1_ratio": [0.1, 0.4, 0.7],
        },
    },
    {
        "classifier": "RandomForestClassifier",
        "params": {
            # Each value appears once; a repeated value makes the grid search
            # fit the same configuration again.
            "model__n_estimators": [100, 200],
            "model__max_depth": [10, None],
            "model__min_samples_split": [2, 3],
            "model__min_samples_leaf": [1, 2, 4],
            "model__max_features": ["sqrt", "log2"],
        },
    },
    {
        "classifier": "KNeighborsClassifier",
        "params": {
            "model__n_neighbors": [3, 5, 10, 15],
            "model__weights": ["uniform", "distance"],
            "model__metric": ["minkowski", "euclidean"],
            "model__p": [1, 2],
            "model__algorithm": ["auto", "ball_tree"],
        },
    },
    {
        "classifier": "XGBClassifier",
        "params": {
            "model__n_estimators": [100, 200],
            "model__max_depth": [3, 5],
            "model__learning_rate": [0.05, 0.1],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
            "model__reg_alpha": [0, 0.1],
            "model__reg_lambda": [0, 0.1],
        },
    },
    {
        "classifier": "XGBRFClassifier",
        "params": {
            "model__n_estimators": [200, 300],
            "model__max_depth": [5, 7, 10],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
            "model__reg_alpha": [0, 0.1],
            "model__reg_lambda": [1, 10],
        },
    },
    {
        "classifier": "LinearSVC",
        "params": {
            "model__C": [0.01, 0.1, 1, 10],
            "model__loss": ["hinge", "squared_hinge"],
            "model__penalty": ["l2"],
            "model__dual": [True],
            "model__max_iter": [1000, 5000],
        },
    },
    {
        "classifier": "SVC",
        "params": {
            "model__C": [0.1, 1, 10],
            "model__kernel": ["linear", "rbf", "poly"],
            "model__gamma": ["scale", "auto", 0.1],
            "model__degree": [2, 3],
            "model__coef0": [0.0, 0.5],
        },
    },
]

# Encode the requested skills once; the vector does not depend on the model.
selected_skills = select_skills(data_with_jobs_classified, skill_cols)
selected_lookup = set(selected_skills)
# One-hot row following the order of skill_cols.
input_vector = [1 if skill in selected_lookup else 0 for skill in skill_cols]
# A DataFrame (rather than a bare array) keeps the column names attached.
input_array = pd.DataFrame([input_vector], columns=skill_cols)

stop_flag = 0
for model_config in models:
    # Debug limiter: leave the loop before the second configuration, so only
    # the first entry of `models` is trained in this cell.
    stop_flag += 1
    if stop_flag == 2:
        break

    pipeline = Pipeline(
        steps=[
            ("model", MODELS_MAP[model_config["classifier"]]())
        ]
    )

    grid_search = GridSearchCV(estimator=pipeline, param_grid=model_config["params"], n_jobs=-1, cv=2, verbose=0)
    grid_search.fit(X_train, y_train_encoded)
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    classification_scores_dict = classification_report(y_test_encoded, y_pred, output_dict=True)
    classification_scores_dict_df = pd.DataFrame(classification_scores_dict).transpose()
    print("Accuracy score", accuracy)

    # Per-class (one-vs-rest) metrics, ordered like `classes`. They compare
    # the true labels with the model's predictions; comparing the true labels
    # with themselves would always give 1.0. zero_division=0 scores a class
    # that is never predicted as 0 without emitting a warning.
    classes = np.unique(y_test_encoded)
    per_class_kwargs = {"labels": classes, "average": None, "zero_division": 0}
    precision_scores = [
        round(float(score), 3)
        for score in precision_score(y_test_encoded, y_pred, **per_class_kwargs)
    ]
    recall_scores = [
        round(float(score), 3)
        for score in recall_score(y_test_encoded, y_pred, **per_class_kwargs)
    ]
    f1_scores = [
        round(float(score), 3)
        for score in f1_score(y_test_encoded, y_pred, **per_class_kwargs)
    ]

    print("precision_scores", precision_scores)
    print("recall_scores", recall_scores)
    print("f1_scores", f1_scores)
    print("Model trained and ready to roll!")

    if selected_skills:
        predicted_label_encoded = grid_search.best_estimator_.predict(input_array)[0]
        predicted_label = label_encoder.inverse_transform([predicted_label_encoded])[0]
        print(f"Predicted category encoded: {predicted_label_encoded}")
        print(f"Predicted category: {predicted_label}")

        # One record per trained model; collected into a DataFrame below.
        model_data = {
            "name": model_config["classifier"],
            "best_params": grid_search.best_params_,
            "best_estimator": grid_search.best_estimator_,
            "accuracy_score": accuracy,
            "precision_scores": precision_scores,
            "recall_scores": recall_scores,
            "f1_scores": f1_scores,
            "predicted_label": predicted_label,
            "predicted_label_encoded": predicted_label_encoded,
            "classification_scores": classification_scores_dict_df,
            "original_labels": original_labels,
        }
        results.append(model_data)
    else:
        print("You don't write any skill. End program.")
    print("="*130)

print("="*80, "RESULTS", "="*80)
for model in results:
    print(f"""
    Model name:         {model["name"]}
    Accuracy:           {model["accuracy_score"]}
    Precision:          {model["precision_scores"]}
    F1:                 {model["f1_scores"]}
    Recall:             {model["recall_scores"]}
    Predicted label:    {model["predicted_label"]}
    Predicted encoded:  {model["predicted_label_encoded"]}
    Original labels:    {model["original_labels"]}
""")
    print("=" * 168)

models_results_df = pd.DataFrame(data=results)
print(models_results_df)
# Last expression of the cell: shows the type returned by reset_index().
type(models_results_df["recall_scores"].reset_index())




# Flask, Django, Docker, Cloud, AWS, API
# input_vector [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript     
# input_vector [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

# Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage
# Accuracy score 0.9920634920634921

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.svm import LinearSVC, NuSVC, SVC

# Per-model evaluation records; filled by the training loop later in this cell.
results = []

# Work on a copy so the filtering below does not modify copied_all_data_df.
data_with_jobs_classified = copied_all_data_df.copy()
counts = data_with_jobs_classified["skills_classified_title"].value_counts()
print(counts)

# Keep only categories that occur more than once.
# NOTE(review): presumably required by the stratified split further down,
# which cannot split a class that has a single row - confirm.
values_to_keep = counts[counts > 1].index
data_with_jobs_classified = data_with_jobs_classified[
    data_with_jobs_classified["skills_classified_title"].isin(values_to_keep)
]
print(values_to_keep)

# Names of the feature columns: every column from position 9 onwards.
# Taking them from `.columns` yields a plain list of names directly, instead
# of slicing a DataFrame and converting it afterwards.
# NOTE(review): assumes the first nine columns are non-skill metadata - confirm.
skill_cols = data_with_jobs_classified.columns[9:].tolist()

def select_skills(data, available_skills, selected_names="Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage"):
    """Parse a comma-separated string of skill names.

    Args:
        data: Unused; kept so existing calls keep working.
        available_skills: Collection of known skill names. It is only used to
            report requested names that are missing from it.
        selected_names: Comma-separated skill names. Defaults to the sample
            profile that used to be hard-coded in the body.

    Returns:
        list[str]: The requested names, stripped of surrounding whitespace,
        with blank entries removed, in input order. Names that are not in
        ``available_skills`` are still returned.
    """
    print("selected_names", selected_names)
    print("type(selected_names)", type(selected_names))
    # Skip blank entries so that an empty input gives [] (falsy) rather than
    # [""], which callers testing `if selected_skills:` would treat as a skill.
    stripped_names = (name.strip() for name in selected_names.split(","))
    selected_skills = [name for name in stripped_names if name]
    not_in_db_skills = [skill for skill in selected_skills if skill not in available_skills]
    if not_in_db_skills:
        print("Skills not found in available skills:", not_in_db_skills)
    return selected_skills

# Target is the classified title column; features are the skill_cols columns.
y = data_with_jobs_classified["skills_classified_title"]
X = data_with_jobs_classified[skill_cols]

# Fixed-seed 80/20 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The encoder is fitted on the training labels and reused for the test labels.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Label -> code mapping kept as a list of single-entry {label: code} dicts.
original_labels = [
    {class_name: code}
    for code, class_name in enumerate(label_encoder.classes_)
]

# Lookup from the classifier name used in `models` to its estimator class.
MODELS_MAP = {
    estimator_cls.__name__: estimator_cls
    for estimator_cls in (
        LogisticRegression,
        RandomForestClassifier,
        KNeighborsClassifier,
        XGBClassifier,
        XGBRFClassifier,
        LinearSVC,
        SVC,
    )
}

# Hyper-parameter grids, one entry per classifier.
# "classifier" is the key looked up in MODELS_MAP; every "params" key carries
# the "model__" prefix so GridSearchCV routes it to the pipeline step "model".
models = [
    {
        "classifier": "LogisticRegression",
        "params": {
            "model__penalty": ["elasticnet"],
            "model__C": [0.1, 1, 0.3],
            # Single iteration budget; listed explicitly instead of a
            # one-element np.arange.
            "model__max_iter": [150],
            "model__n_jobs": [-1],
            "model__solver": ["saga"],
            # Explicit values: np.arange with a fractional step can gain or
            # lose the last element through floating-point rounding.
            "model__l1_ratio": [0.1, 0.4, 0.7],
        },
    },
    {
        "classifier": "RandomForestClassifier",
        "params": {
            # Each value appears once; a repeated value makes the grid search
            # fit the same configuration again.
            "model__n_estimators": [100, 200],
            "model__max_depth": [10, None],
            "model__min_samples_split": [2, 3],
            "model__min_samples_leaf": [1, 2, 4],
            "model__max_features": ["sqrt", "log2"],
        },
    },
    {
        "classifier": "KNeighborsClassifier",
        "params": {
            "model__n_neighbors": [3, 5, 10, 15],
            "model__weights": ["uniform", "distance"],
            "model__metric": ["minkowski", "euclidean"],
            "model__p": [1, 2],
            "model__algorithm": ["auto", "ball_tree"],
        },
    },
    {
        "classifier": "XGBClassifier",
        "params": {
            "model__n_estimators": [100, 200],
            "model__max_depth": [3, 5],
            "model__learning_rate": [0.05, 0.1],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
            "model__reg_alpha": [0, 0.1],
            "model__reg_lambda": [0, 0.1],
        },
    },
    {
        "classifier": "XGBRFClassifier",
        "params": {
            "model__n_estimators": [200, 300],
            "model__max_depth": [5, 7, 10],
            "model__subsample": [0.8, 1.0],
            "model__colsample_bytree": [0.8, 1.0],
            "model__reg_alpha": [0, 0.1],
            "model__reg_lambda": [1, 10],
        },
    },
    {
        "classifier": "LinearSVC",
        "params": {
            "model__C": [0.01, 0.1, 1, 10],
            "model__loss": ["hinge", "squared_hinge"],
            "model__penalty": ["l2"],
            "model__dual": [True],
            "model__max_iter": [1000, 5000],
        },
    },
    {
        "classifier": "SVC",
        "params": {
            "model__C": [0.1, 1, 10],
            "model__kernel": ["linear", "rbf", "poly"],
            "model__gamma": ["scale", "auto", 0.1],
            "model__degree": [2, 3],
            "model__coef0": [0.0, 0.5],
        },
    },
]

# Encode the requested skills once; the vector does not depend on the model.
selected_skills = select_skills(data_with_jobs_classified, skill_cols)
selected_lookup = set(selected_skills)
# One-hot row following the order of skill_cols.
input_vector = [1 if skill in selected_lookup else 0 for skill in skill_cols]
# A DataFrame (rather than a bare array) keeps the column names attached.
input_array = pd.DataFrame([input_vector], columns=skill_cols)

# The early-exit limiter is disabled in this cell: every entry of `models`
# is trained. The counter is kept at its initial value.
stop_flag = 0
for model_config in models:
    pipeline = Pipeline(
        steps=[
            ("model", MODELS_MAP[model_config["classifier"]]())
        ]
    )

    grid_search = GridSearchCV(estimator=pipeline, param_grid=model_config["params"], n_jobs=-1, cv=2, verbose=0)
    grid_search.fit(X_train, y_train_encoded)
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    print("model_config['classifier']", model_config["classifier"])
    print("accuracy", accuracy)
    classification_scores_dict = classification_report(y_test_encoded, y_pred, output_dict=True)
    classification_scores_dict_df = pd.DataFrame(classification_scores_dict).transpose()

    # Per-class (one-vs-rest) metrics, ordered like `classes`. They compare
    # the true labels with the model's predictions; comparing the true labels
    # with themselves would always give 1.0. zero_division=0 scores a class
    # that is never predicted as 0 without emitting a warning.
    classes = np.unique(y_test_encoded)
    per_class_kwargs = {"labels": classes, "average": None, "zero_division": 0}
    precision_scores = [
        round(float(score), 3)
        for score in precision_score(y_test_encoded, y_pred, **per_class_kwargs)
    ]
    recall_scores = [
        round(float(score), 3)
        for score in recall_score(y_test_encoded, y_pred, **per_class_kwargs)
    ]
    f1_scores = [
        round(float(score), 3)
        for score in f1_score(y_test_encoded, y_pred, **per_class_kwargs)
    ]

    print("Model trained and ready to roll!")

    if selected_skills:
        predicted_label_encoded = grid_search.best_estimator_.predict(input_array)[0]
        predicted_label = label_encoder.inverse_transform([predicted_label_encoded])[0]
        print(f"Predicted category encoded: {predicted_label_encoded}")
        print(f"Predicted category: {predicted_label}")

        # One record per trained model; collected into a DataFrame below.
        model_data = {
            "name": model_config["classifier"],
            "best_params": grid_search.best_params_,
            "best_estimator": grid_search.best_estimator_,
            "accuracy_score": accuracy,
            "precision_scores": precision_scores,
            "recall_scores": recall_scores,
            "f1_scores": f1_scores,
            "predicted_label": predicted_label,
            "predicted_label_encoded": predicted_label_encoded,
            "classification_scores": classification_scores_dict_df,
            "original_labels": original_labels,
        }
        results.append(model_data)
    else:
        print("You don't write any skill. End program.")
    print("="*130)

print("="*80, "RESULTS", "="*80)
for model in results:
    print(f"""
    Model name:         {model["name"]}
    Accuracy:           {model["accuracy_score"]}
    Precision:          {model["precision_scores"]}
    F1:                 {model["f1_scores"]}
    Recall:             {model["recall_scores"]}
    Predicted label:    {model["predicted_label"]}
    Predicted encoded:  {model["predicted_label_encoded"]}
    Original labels:    {model["original_labels"]}
""")
    print("=" * 168)

models_results_df = pd.DataFrame(data=results)
list_columns = ['precision_scores', 'recall_scores', 'f1_scores']

# Spread each per-class score list into one column per class
# (e.g. precision_scores_0, precision_scores_1, ...). The generated names are
# collected so the summary works for any number of classes.
columns_to_show = ['name', 'accuracy_score', 'predicted_label', 'predicted_label_encoded']
for col in list_columns:
    expanded_cols = pd.DataFrame(models_results_df[col].tolist())
    expanded_cols = expanded_cols.add_prefix(f'{col}_')
    columns_to_show.extend(expanded_cols.columns)
    models_results_df = pd.concat([models_results_df, expanded_cols], axis=1)

# Last expression of the cell: display the summary table.
models_results_df[columns_to_show]
# Flask, Django, Docker, Cloud, AWS, API
# input_vector [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# AI, Docker, English, HTML5 / CSS3, Javascript, Node.js, Typescript     
# input_vector [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,

# Pandas, Numpy, Matplotlib, Data, Analyzing, Data Storage
# Accuracy score 0.9920634920634921

In [None]:
# Display the columns listed in columns_to_show from models_results_df.
models_results_df[columns_to_show]